From 77bbe086dd81aaa7c54dc433ccd2a08c8f2fc8e7 Mon Sep 17 00:00:00 2001 From: Yanglei Zou Date: Tue, 29 Oct 2024 14:20:49 +0800 Subject: [PATCH 001/156] Add ggml-openvino base files --- ggml/include/ggml-openvino.h | 45 ++++++++++++++++++++++++++++++++++++ ggml/src/ggml-openvino.cpp | 23 ++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 ggml/include/ggml-openvino.h create mode 100644 ggml/src/ggml-openvino.cpp diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h new file mode 100644 index 0000000000000..e0229cf18c244 --- /dev/null +++ b/ggml/include/ggml-openvino.h @@ -0,0 +1,45 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_API ggml_backend_t ggml_backend_openvino_init(int device); + +GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend); + +// device buffer +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_buffer_type(int device); + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_split_buffer_type(const float *tensor_split); + +// pinned host buffer for use with the CPU backend for faster copies between CPU +// and GPU +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_host_buffer_type(void); + +// GGML_API int ggml_backend_openvino_get_device_count(void); +// GGML_API void ggml_backend_openvino_get_device_description(int device, +// char *description, +// size_t +// description_size); +// GGML_API void ggml_backend_openvino_get_device_memory(int device, size_t +// *free, +// size_t *total); + +// GGML_API bool ggml_backend_openvino_register_host_buffer(void *buffer, size_t +// size); GGML_API void ggml_backend_openvino_unregister_host_buffer(void +// *buffer); + +GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp new file mode 100644 index 0000000000000..177e51458da00 --- /dev/null +++ b/ggml/src/ggml-openvino.cpp @@ -0,0 +1,23 @@ +#include "ggml-openvino.h" +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +// backend API +GGML_API ggml_backend_t ggml_backend_openvino_init(int device) {} + +GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend) {} + +// device buffer +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_buffer_type(int device) {} + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_split_buffer_type(const float *tensor_split) {} + +// pinned host buffer for use with the CPU backend for faster copies between CPU +// and GPU +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_host_buffer_type(void) {} + +GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {} From a1c2d8048abf051351b0728e6bb74f3cdeac2e72 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 13 Nov 2024 13:32:44 +0800 Subject: [PATCH 002/156] add openvino as optional backend for Llama.cpp ggml --- ggml/include/ggml-openvino.h | 30 ++- ggml/src/ggml-openvino.cpp | 448 ++++++++++++++++++++++++++++++++++- 2 files changed, 470 insertions(+), 8 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index e0229cf18c244..9172414c291b5 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -1,12 +1,18 @@ #pragma once -#include "ggml-backend.h" #include "ggml.h" +#include "ggml-backend.h" + +#include +#include #ifdef __cplusplus extern "C" { #endif +#define GGML_OPENVINO_NAME "OPENVINO" +#define GGML_OPENVINO_MAX_DEVICES 16 + // backend API GGML_API ggml_backend_t ggml_backend_openvino_init(int device); @@ -25,7 +31,7 @@ ggml_backend_openvino_split_buffer_type(const float *tensor_split); GGML_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void); -// GGML_API int ggml_backend_openvino_get_device_count(void); +GGML_API int ggml_backend_openvino_get_device_count(void); // GGML_API void ggml_backend_openvino_get_device_description(int device, // char *description, // size_t @@ -40,6 +46,26 @@ ggml_backend_openvino_host_buffer_type(void); GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void); +struct ggml_openvino_device_info { + int device_count; + + struct openvino_device_info { + int cc; // compute capability + int nsm; // number of streaming multiprocessors + size_t smpb; // max. shared memory per block + size_t smpbo; // max. shared memory per block (with opt-in) + bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory + size_t total_vram; + }; + + openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {}; + + std::array default_tensor_split = {}; +}; + +const ggml_openvino_device_info & ggml_openvino_info(); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 177e51458da00..87047a2f30bda 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -2,22 +2,458 @@ #include "ggml-backend-impl.h" #include "ggml-impl.h" +#include +#include +#include +#include +#include +#include +#include + +struct ggml_backend_openvino_context { + int device; + std::string name; + std::string description; +}; + +static void ggml_backend_openvino_free(ggml_backend_t backend) { + ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context; + delete ctx; + delete backend; +} + +static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) { + return GGML_OPENVINO_NAME; + GGML_UNUSED(backend); +} + +static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_cpu_buffer_type(); + GGML_UNUSED(backend); +} + +static void ggml_backend_openvino_add(ggml_backend_openvino_context & ctx, ggml_tensor * dst) { + // Placeholder for OpenVINO add operation + GGML_ASSERT(ctx.device != 0); + GGML_ASSERT(dst->data != nullptr); +} + +static void test_op_for_NONE() { + GGML_LOG_DEBUG("...test_op_for_NONE... \n"); +} + +static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + // TODO + ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_ADD: + // TODO + ggml_backend_openvino_add(*ctx, node); + break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + break; + case GGML_OP_NONE: + test_op_for_NONE(); + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + default: + GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +static const ggml_backend_i ggml_backend_openvino_interface = { + /* .get_name = */ ggml_backend_openvino_get_name, + /* .free = */ ggml_backend_openvino_free, + /* .get_default_buffer_type = */ ggml_backend_openvino_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_openvino_graph_compute, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, + /* .offload_op = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +int ggml_backend_openvino_get_device_count() { + return ggml_openvino_info().device_count; +} + +static ggml_guid_t ggml_backend_openvino_guid(void) { + static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + return &guid; +} + // backend API -GGML_API ggml_backend_t ggml_backend_openvino_init(int device) {} +GGML_API ggml_backend_t ggml_backend_openvino_init(int device) { + if (device < 0 || device >= ggml_backend_openvino_get_device_count()) { + GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device); + return nullptr; + } + + ggml_backend_openvino_context * ctx = new ggml_backend_openvino_context; + if (ctx == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate context\n", __func__); + return nullptr; + } -GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend) {} + ggml_backend_t openvino_backend = new ggml_backend { + /* .guid = */ ggml_backend_openvino_guid(), + /* .interface = */ ggml_backend_openvino_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device), + /* .context = */ ctx, + }; + + return openvino_backend; +} + +GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend) { + GGML_ASSERT(backend->context != nullptr); + return true; +} // device buffer GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_buffer_type(int device) {} +ggml_backend_openvino_buffer_type(int device) { + GGML_ASSERT(device >= 0); + return nullptr; +} // split tensor buffer that splits matrices by rows across multiple devices GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_split_buffer_type(const float *tensor_split) {} +ggml_backend_openvino_split_buffer_type(const float *tensor_split) { + GGML_ASSERT(tensor_split != nullptr); + return nullptr; +} // pinned host buffer for use with the CPU backend for faster copies between CPU // and GPU GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_host_buffer_type(void) {} +ggml_backend_openvino_host_buffer_type(void) { return nullptr;} + + +struct ggml_backend_openvino_buffer_type_context { + int device; + std::string name; +}; + +static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} +static bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; +} + + +static const char * ggml_backend_openvino_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return GGML_OPENVINO_NAME "_Split"; + + GGML_UNUSED(buft); +} + +static bool ggml_backend_buft_is_openvino_split(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_openvino_split_buffer_type_get_name; +} + +struct ggml_backend_openvino_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_openvino_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char * ggml_backend_openvino_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + return ctx->description.c_str(); +} + +// TODO +static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + GGML_ASSERT(dev->context != nullptr); + GGML_ASSERT(free != nullptr); + GGML_ASSERT(total != nullptr); + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + // Placeholder + GGML_ASSERT(ctx->device >= 0); + // ggml_openvino_set_device(ctx->device); +} + +static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_CPU; + // return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; +} + +static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_openvino_device_get_name(dev); + props->description = ggml_backend_openvino_device_get_description(dev); + props->type = ggml_backend_openvino_device_get_type(dev); + ggml_backend_openvino_device_get_memory(dev, &props->memory_free, &props->memory_total); + + bool host_buffer = getenv("GGML_OPENVINO_NO_PINNED") == nullptr; +#ifdef GGML_OPENVINO_NO_PEER_COPY + bool events = false; +#else + bool events = true; +#endif + + props->caps = { + /* .async = */ true, + /* .host_buffer = */ host_buffer, + /* .buffer_from_host_ptr = */ false, + /* .events = */ events, + }; +} + +static ggml_backend_t ggml_backend_openvino_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + return ggml_backend_openvino_init(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + return ggml_backend_openvino_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return ggml_backend_openvino_host_buffer_type(); +} + +static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + GGML_UNUSED(dev); + GGML_UNUSED(ptr); + GGML_UNUSED(size); + GGML_UNUSED(max_tensor_size); + return nullptr; +} + +static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + GGML_UNUSED(dev); + GGML_UNUSED(ptr); + GGML_UNUSED(size); + GGML_UNUSED(max_tensor_size); + return nullptr; +} + +static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + GGML_ASSERT(dev->reg != nullptr); + // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; + + switch (op->op) { + case GGML_OP_UNARY: + return false; + case GGML_OP_NONE: + return true; + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + return false; + case GGML_OP_ADD: + { + ov::op::v1::Add add; + //add.evaluate(op->outputs[0], op->inputs[1]); + return false; + } + case GGML_OP_ADD1: + case GGML_OP_SUB: + { + ov::op::v1::Subtract sub; + //sub.evaluate(TensorVector& outputs, const TensorVector& inputs); + return false; + } + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_SIN: + case GGML_OP_COS: + case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: + case GGML_OP_CROSS_ENTROPY_LOSS: + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + case GGML_OP_OPT_STEP_ADAMW: + return false; + default: + return false; + } +} + +static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + GGML_UNUSED(dev); +} + +static const struct ggml_backend_device_i ggml_backend_openvino_device_interface = { + /* .get_name = */ ggml_backend_openvino_device_get_name, + /* .get_description = */ ggml_backend_openvino_device_get_description, + /* .get_memory = */ ggml_backend_openvino_device_get_memory, + /* .get_type = */ ggml_backend_openvino_device_get_type, + /* .get_props = */ ggml_backend_openvino_device_get_props, + /* .init_backend = */ ggml_backend_openvino_device_init, + /* .get_buffer_type = */ ggml_backend_openvino_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_openvino_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_openvino_device_supports_op, + /* .supports_buft = */ ggml_backend_openvino_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +struct ggml_backend_openvino_reg_context { + std::vector devices; +}; + +static const char * ggml_backend_openvino_reg_get_name(ggml_backend_reg_t reg) { + return GGML_OPENVINO_NAME; + GGML_UNUSED(reg); +} + +static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) { + return ggml_openvino_info().device_count; + GGML_UNUSED(reg); + + // TODO + ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context; + + return ctx->devices.size(); +} + +static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) { + ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; + // GGML_ASSERT(index == 0); + + // static ggml_backend_device ggml_backend_openvino_device = { + // /* .iface = */ ggml_backend_openvino_device_interface, + // /* .reg = */ reg, + // /* .context = */ nullptr, + // }; + + // return &ggml_backend_openvino_device; + + // GGML_UNUSED(reg); + // GGML_UNUSED(index); +} + +static void * ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) { + GGML_UNUSED(reg); + if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { + return (void *)ggml_backend_openvino_split_buffer_type; + } + // if (strcmp(name, "ggml_backend_register_host_buffer") == 0) { + // return (void *)ggml_backend_openvino_register_host_buffer; + // } + // if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { + // return (void *)ggml_backend_openvino_unregister_host_buffer; + // } + return nullptr; +} + +static const struct ggml_backend_reg_i ggml_backend_openvino_reg_interface = { + /* .get_name = */ ggml_backend_openvino_reg_get_name, + /* .get_device_count = */ ggml_backend_openvino_reg_get_device_count, + /* .get_device = */ ggml_backend_openvino_reg_get_device, + /* .get_proc_address = */ ggml_backend_openvino_get_proc_address, +}; + +static int get_openvino_device_count() { + ov::Core core; + auto devices = core.get_available_devices(); + // return devices.size(); + return 1; +} + +static ggml_openvino_device_info ggml_openvino_init() { + ggml_openvino_device_info info = {}; + // TODO + info.device_count = get_openvino_device_count(); + return info; +} + +const ggml_openvino_device_info & ggml_openvino_info() { + static ggml_openvino_device_info info = ggml_openvino_init(); + return info; +} + +GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { + static ggml_backend_reg reg; + + static bool initialized = false; + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + ggml_backend_openvino_reg_context * ctx = new ggml_backend_openvino_reg_context; + + // GGML_LOG_DEBUG("ggml_openvino_info().device_count = %d \n", ggml_openvino_info().device_count); + for (int i = 0; i < ggml_openvino_info().device_count; i++) { + ggml_backend_openvino_device_context * dev_ctx = new ggml_backend_openvino_device_context; + dev_ctx->device = i; + dev_ctx->name = GGML_OPENVINO_NAME + std::to_string(i); + + // ggml_openvino_set_device(i); + dev_ctx->description = ov::get_openvino_version().description; + + ggml_backend_dev_t dev = new ggml_backend_device { + /* .interface = */ ggml_backend_openvino_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx + }; + ctx->devices.push_back(dev); + } + + reg = ggml_backend_reg { + /* .interface = */ ggml_backend_openvino_reg_interface, + /* .context = */ ctx + }; + } + + initialized = true; + } -GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {} + return ® +} From f3b34e941c92ebcc82cd8b88a4051433a1ba1450 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 19 Nov 2024 15:53:54 +0800 Subject: [PATCH 003/156] * Configure the device(default CPU) that uses OpenVINO to compile the model * Add OpenVINO ADD operator to Llama.cpp. The output is somewhat abnormal and needs further debugging. --- ggml/src/ggml-openvino.cpp | 150 +++++++++++++++++++++++++++++++++++-- 1 file changed, 144 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 87047a2f30bda..4b864a0b6dd6e 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -10,10 +10,29 @@ #include #include +#define GGML_OPENVINO_MAX_STREAMS 8 + struct ggml_backend_openvino_context { - int device; - std::string name; - std::string description; + int device; // the device ID currently in use + std::string name; // context Name + std::string description; // context description + + // OpenVINO core components + ov::Core core; // OpenVINO core interface + std::shared_ptr model; // compiled Model + ov::InferRequest infer_request; // inference Request + + // OpenVINO Multi-stream support + static const int MAX_STREAMS = 8; // define the maximum number of flows + std::vector streams; // used to support multi-stream reasoning + int current_stream; // the currently active stream index + + // state Management + bool is_initialized; // initialize + + ggml_backend_openvino_context() + : device(0), name("OpenVINO"), description("OpenVINO Backend Context"), + current_stream(0), is_initialized(false) {} }; static void ggml_backend_openvino_free(ggml_backend_t backend) { @@ -32,10 +51,129 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type( GGML_UNUSED(backend); } +static void ggml_backend_openvino_add_forward(ggml_backend_openvino_context & ctx, ggml_tensor * dst) { + // Step 1: get the input tensor src0 和 src1 + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; + + if (src0 == nullptr || src1 == nullptr) { + std::cerr << "Error: src0 or src1 is null." << std::endl; + return; + } + + // Step 2: Check that the input tensor types and shapes match + if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32) { + std::cerr << "Error: Unsupported tensor type. Only GGML_TYPE_F32 is supported for OpenVINO." << std::endl; + return; + } + if (src0->ne[0] != src1->ne[0] || src0->ne[1] != src1->ne[1]) { + std::cerr << "Error: src0 and src1 shapes do not match." << std::endl; + return; + } + + // Step 3: Initialize OpenVINO model and streams (only done on first call) + if (!ctx.is_initialized) { + try { + // define input tensor shape + ov::Shape input_shape = {static_cast(src0->ne[0]), static_cast(src0->ne[1])}; + + // creat OpenVINO input node + auto input0 = std::make_shared(ov::element::f32, input_shape); + auto input1 = std::make_shared(ov::element::f32, input_shape); + + // define add operation + auto add_node = std::make_shared(input0, input1); + + // create model + auto model = std::make_shared(add_node, ov::ParameterVector{input0, input1}); + + // compile model and store in context +#ifdef GGML_OPENVINO_GPU + ctx.model = std::make_shared(ctx.core.compile_model(model, "GPU")); +#elif GGML_OPENVINO_NPU + ctx.model = std::make_shared(ctx.core.compile_model(model, "NPU")); +#else + ctx.model = std::make_shared(ctx.core.compile_model(model, "CPU")); +#endif + // initialize infer request + ctx.infer_request = ctx.model->create_infer_request(); + ctx.is_initialized = true; + + // std::cout << "OpenVINO add model initialized successfully." << std::endl; + } catch (const std::exception &e) { + std::cerr << "Error initializing OpenVINO model: " << e.what() << std::endl; + return; + } + } + + // Step 4: set input data, copy src0 and src1 data to OpenVINO input tensors + auto input_tensor0 = ctx.infer_request.get_tensor(ctx.model->input(0)); + auto input_tensor1 = ctx.infer_request.get_tensor(ctx.model->input(1)); + + // Note: OpenVINO Tensor data is contiguous, make sure src0 and src1 data is contiguous. + std::memcpy(input_tensor0.data(), src0->data, src0->nb[0] * src0->ne[0]); + std::memcpy(input_tensor1.data(), src1->data, src1->nb[0] * src1->ne[0]); + + // Step 5: execute inference + ctx.infer_request.infer(); + + // Step 6: get output data + ov::Tensor output_tensor = ctx.infer_request.get_tensor(ctx.model->output(0)); + + // Allocate memory for dst->data if not already allocated + if (dst->data == nullptr) { + dst->data = malloc(dst->nb[0] * dst->ne[0]); + if (dst->data == nullptr) { + std::cerr << "Error: Failed to allocate memory for dst->data." << std::endl; + return; + } + } + // Copy output data to dst + std::memcpy(dst->data, output_tensor.data(), dst->nb[0] * dst->ne[0]); + + // // Print results (optional, for debugging) + // float* dst_data = static_cast(dst->data); + // std::cout << "Output data:"; + // for (int i = 0; i < std::min(10, static_cast(dst->ne[0])); ++i) { + // std::cout << dst_data[i] << " "; + // } + // std::cout << std::endl; +} + static void ggml_backend_openvino_add(ggml_backend_openvino_context & ctx, ggml_tensor * dst) { // Placeholder for OpenVINO add operation - GGML_ASSERT(ctx.device != 0); + // GGML_ASSERT(ctx.device != 0); GGML_ASSERT(dst->data != nullptr); + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + // ggml_backend_openvino_add_forward(ctx, dst, src0, src1); + } else if (src1->type == GGML_TYPE_F32) { + // ggml_compute_forward_add_f16_f32(params, dst); + } else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_TYPE_F32: + { + if (src1->type == GGML_TYPE_F32) { + { + ggml_backend_openvino_add_forward(ctx, dst); + } + } + else { + GGML_ABORT("fatal error"); + } + } break; + default: + GGML_ABORT("%s: unsupported type %d\n", __func__, src1->type); + } + } static void test_op_for_NONE() { @@ -270,7 +408,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con case GGML_OP_UNARY: return false; case GGML_OP_NONE: - return true; + return false; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: @@ -281,7 +419,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con { ov::op::v1::Add add; //add.evaluate(op->outputs[0], op->inputs[1]); - return false; + return true; } case GGML_OP_ADD1: case GGML_OP_SUB: From d51cf69a3f6c3e66a6cdd2650d3ea5feef6311e3 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 21 Nov 2024 18:03:22 +0800 Subject: [PATCH 004/156] Solve the issue of abnormal model output caused by using OpenVINO ADD operator --- ggml/src/ggml-openvino.cpp | 159 ++++++++++++------------------------- 1 file changed, 52 insertions(+), 107 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 4b864a0b6dd6e..2cb9dfa7d3b5b 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -51,10 +51,18 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type( GGML_UNUSED(backend); } -static void ggml_backend_openvino_add_forward(ggml_backend_openvino_context & ctx, ggml_tensor * dst) { +static void ggml_backend_openvino_add_forward(ggml_tensor * dst) { // Step 1: get the input tensor src0 和 src1 - const ggml_tensor *src0 = dst->src[0]; - const ggml_tensor *src1 = dst->src[1]; + const struct ggml_tensor *src0 = dst->src[0]; + const struct ggml_tensor *src1 = dst->src[1]; + + ov::Core core; + + // set the shape and stride of dst + dst->ne[0] = src0->ne[0]; + dst->ne[1] = src0->ne[1]; + dst->nb[0] = src0->nb[0]; + dst->nb[1] = src0->nb[1]; if (src0 == nullptr || src1 == nullptr) { std::cerr << "Error: src0 or src1 is null." << std::endl; @@ -71,76 +79,61 @@ static void ggml_backend_openvino_add_forward(ggml_backend_openvino_context & ct return; } - // Step 3: Initialize OpenVINO model and streams (only done on first call) - if (!ctx.is_initialized) { - try { - // define input tensor shape - ov::Shape input_shape = {static_cast(src0->ne[0]), static_cast(src0->ne[1])}; + ov::Tensor input0 = ov::Tensor(ov::element::f32, {static_cast(src0->ne[0]), static_cast(src0->ne[1])}, src0->data); + ov::Tensor input1 = ov::Tensor(ov::element::f32, {static_cast(src1->ne[0]), static_cast(src1->ne[1])}, src1->data); - // creat OpenVINO input node - auto input0 = std::make_shared(ov::element::f32, input_shape); - auto input1 = std::make_shared(ov::element::f32, input_shape); + auto input0_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); + auto input1_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); + auto add = std::make_shared(input0_param, input1_param); + auto function = std::make_shared(add, ov::ParameterVector{input0_param, input1_param}); - // define add operation - auto add_node = std::make_shared(input0, input1); - - // create model - auto model = std::make_shared(add_node, ov::ParameterVector{input0, input1}); - - // compile model and store in context + // compile model and store in context #ifdef GGML_OPENVINO_GPU - ctx.model = std::make_shared(ctx.core.compile_model(model, "GPU")); + auto compiled_model = core.compile_model(function, "GPU"); #elif GGML_OPENVINO_NPU - ctx.model = std::make_shared(ctx.core.compile_model(model, "NPU")); + auto compiled_model = core.compile_model(function, "NPU"); #else - ctx.model = std::make_shared(ctx.core.compile_model(model, "CPU")); + auto compiled_model = core.compile_model(function, "CPU"); #endif - // initialize infer request - ctx.infer_request = ctx.model->create_infer_request(); - ctx.is_initialized = true; - - // std::cout << "OpenVINO add model initialized successfully." << std::endl; - } catch (const std::exception &e) { - std::cerr << "Error initializing OpenVINO model: " << e.what() << std::endl; - return; - } - } + // initialize infer request + auto infer_request = compiled_model.create_infer_request(); // Step 4: set input data, copy src0 and src1 data to OpenVINO input tensors - auto input_tensor0 = ctx.infer_request.get_tensor(ctx.model->input(0)); - auto input_tensor1 = ctx.infer_request.get_tensor(ctx.model->input(1)); - - // Note: OpenVINO Tensor data is contiguous, make sure src0 and src1 data is contiguous. - std::memcpy(input_tensor0.data(), src0->data, src0->nb[0] * src0->ne[0]); - std::memcpy(input_tensor1.data(), src1->data, src1->nb[0] * src1->ne[0]); + infer_request.set_tensor(input0_param, input0); + infer_request.set_tensor(input1_param, input1); // Step 5: execute inference - ctx.infer_request.infer(); + infer_request.infer(); // Step 6: get output data - ov::Tensor output_tensor = ctx.infer_request.get_tensor(ctx.model->output(0)); - - // Allocate memory for dst->data if not already allocated - if (dst->data == nullptr) { - dst->data = malloc(dst->nb[0] * dst->ne[0]); - if (dst->data == nullptr) { - std::cerr << "Error: Failed to allocate memory for dst->data." << std::endl; - return; - } + ov::Tensor output = infer_request.get_tensor(compiled_model.output()); + + // // Allocate memory for dst->data if not already allocated + // if (dst->data == nullptr) { + // dst->data = malloc(dst->nb[0] * dst->ne[0]); + // if (dst->data == nullptr) { + // std::cerr << "Error: Failed to allocate memory for dst->data." << std::endl; + // return; + // } + // } + + std::memcpy(dst->data, output.data(), output.get_byte_size()); + + if (dst->ne[0] != src0->ne[0] || dst->ne[1] != src0->ne[1]) { + std::cerr << "Error: dst tensor shape does not match input tensor shape." << std::endl; + return; } - // Copy output data to dst - std::memcpy(dst->data, output_tensor.data(), dst->nb[0] * dst->ne[0]); - - // // Print results (optional, for debugging) - // float* dst_data = static_cast(dst->data); - // std::cout << "Output data:"; - // for (int i = 0; i < std::min(10, static_cast(dst->ne[0])); ++i) { - // std::cout << dst_data[i] << " "; + + // float* dst_data1 = (float*)(dst->data); + // printf("Output data:");; + // for (int i = 0; i < (10 < (int)(dst->ne[0]) ? 10 : (int)(dst->ne[0])); ++i) { + // printf("%f ", dst_data1[i]); // } - // std::cout << std::endl; + // printf("\n"); + // fflush(stdout); } -static void ggml_backend_openvino_add(ggml_backend_openvino_context & ctx, ggml_tensor * dst) { +static void ggml_backend_openvino_add(ggml_tensor * dst) { // Placeholder for OpenVINO add operation // GGML_ASSERT(ctx.device != 0); GGML_ASSERT(dst->data != nullptr); @@ -163,7 +156,7 @@ static void ggml_backend_openvino_add(ggml_backend_openvino_context & ctx, ggml_ { if (src1->type == GGML_TYPE_F32) { { - ggml_backend_openvino_add_forward(ctx, dst); + ggml_backend_openvino_add_forward(dst); } } else { @@ -181,16 +174,13 @@ static void test_op_for_NONE() { } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - // TODO - ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context; - for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; switch (node->op) { case GGML_OP_ADD: // TODO - ggml_backend_openvino_add(*ctx, node); + ggml_backend_openvino_add(node); break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: @@ -405,53 +395,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; switch (op->op) { - case GGML_OP_UNARY: - return false; - case GGML_OP_NONE: - return false; - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_NORM: - return false; case GGML_OP_ADD: - { - ov::op::v1::Add add; - //add.evaluate(op->outputs[0], op->inputs[1]); return true; - } - case GGML_OP_ADD1: - case GGML_OP_SUB: - { - ov::op::v1::Subtract sub; - //sub.evaluate(TensorVector& outputs, const TensorVector& inputs); - return false; - } - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_RMS_NORM: - case GGML_OP_SCALE: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_SIN: - case GGML_OP_COS: - case GGML_OP_IM2COL: - case GGML_OP_POOL_2D: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_ARGSORT: - case GGML_OP_ACC: - case GGML_OP_GROUP_NORM: - case GGML_OP_UPSCALE: - case GGML_OP_PAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_LEAKY_RELU: - case GGML_OP_CROSS_ENTROPY_LOSS: - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - case GGML_OP_OPT_STEP_ADAMW: - return false; default: return false; } From 7a5684ca9c09bf4a0f42234485992f85bfdf6491 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 2 Dec 2024 10:18:54 +0800 Subject: [PATCH 005/156] Add OpenVINO MUL operator to GGML of Llama.cpp. --- ggml/src/ggml-openvino.cpp | 94 ++++++++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 2cb9dfa7d3b5b..788c2cb12231e 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #define GGML_OPENVINO_MAX_STREAMS 8 @@ -133,6 +134,42 @@ static void ggml_backend_openvino_add_forward(ggml_tensor * dst) { // fflush(stdout); } +static void ggml_backend_openvino_mul_forward(ggml_tensor * dst) { + struct ggml_tensor *src0 = dst->src[0]; + struct ggml_tensor *src1 = dst->src[1]; + + ov::Core core; + + // define shape + ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // For Example: [7, 3072] + ov::Shape shape1 = {static_cast(src1->ne[1]), static_cast(src1->ne[0])}; // For Example: [1, 3072] -> broadcast to [7, 3072] + + // create OpenVINO tensor (src0 and src1) + ov::Tensor tensor0(ov::element::f32, shape0, src0->data); + ov::Tensor tensor1(ov::element::f32, shape1, src1->data); + + // define input parameters + auto input0 = std::make_shared(ov::element::f32, shape0); + auto input1 = std::make_shared(ov::element::f32, shape1); + + // create a multiply operation using broadcasting + auto multiply = std::make_shared(input0, input1); + + // create model + auto model = std::make_shared(multiply, ov::ParameterVector{input0, input1}); + ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); + + ov::InferRequest infer_request = compiled_model.create_infer_request(); + infer_request.set_tensor(input0, tensor0); + infer_request.set_tensor(input1, tensor1); + + infer_request.infer(); + + // get output tensor and copy it back to dst->data + ov::Tensor output_tensor = infer_request.get_output_tensor(); + std::memcpy(dst->data, output_tensor.data(), src0->ne[0] * src0->ne[1] * sizeof(float)); +} + static void ggml_backend_openvino_add(ggml_tensor * dst) { // Placeholder for OpenVINO add operation // GGML_ASSERT(ctx.device != 0); @@ -169,28 +206,49 @@ static void ggml_backend_openvino_add(ggml_tensor * dst) { } -static void test_op_for_NONE() { - GGML_LOG_DEBUG("...test_op_for_NONE... \n"); +static void ggml_backend_openvino_mul(ggml_tensor * dst) { + GGML_ASSERT(dst->data != nullptr); + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_backend_openvino_mul_forward(dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { + return GGML_STATUS_SUCCESS; + } + switch (node->op) { - case GGML_OP_ADD: - // TODO - ggml_backend_openvino_add(node); - break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - break; - case GGML_OP_NONE: - test_op_for_NONE(); - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + break; + case GGML_OP_ADD: + { + ggml_backend_openvino_add(node); + } break; + case GGML_OP_MUL: + { + ggml_backend_openvino_mul(node); + } break; + case GGML_OP_MUL_MAT: break; default: GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); @@ -395,8 +453,18 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + return true; case GGML_OP_ADD: return true; + case GGML_OP_MUL: + return true; + case GGML_OP_MUL_MAT: + return false; default: return false; } From 0b38979df3cf0314fa44c32fb3f754f41f87fba5 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 2 Dec 2024 10:39:36 +0800 Subject: [PATCH 006/156] Add compile options --- ggml/src/ggml-openvino.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 788c2cb12231e..370c0c5d98e26 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -86,15 +86,15 @@ static void ggml_backend_openvino_add_forward(ggml_tensor * dst) { auto input0_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); auto input1_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); auto add = std::make_shared(input0_param, input1_param); - auto function = std::make_shared(add, ov::ParameterVector{input0_param, input1_param}); + auto model = std::make_shared(add, ov::ParameterVector{input0_param, input1_param}); // compile model and store in context #ifdef GGML_OPENVINO_GPU - auto compiled_model = core.compile_model(function, "GPU"); + auto compiled_model = core.compile_model(model, "GPU"); #elif GGML_OPENVINO_NPU - auto compiled_model = core.compile_model(function, "NPU"); + auto compiled_model = core.compile_model(model, "NPU"); #else - auto compiled_model = core.compile_model(function, "CPU"); + auto compiled_model = core.compile_model(model, "CPU"); #endif // initialize infer request auto infer_request = compiled_model.create_infer_request(); @@ -157,7 +157,14 @@ static void ggml_backend_openvino_mul_forward(ggml_tensor * dst) { // create model auto model = std::make_shared(multiply, ov::ParameterVector{input0, input1}); + // compile model and store in context +#ifdef GGML_OPENVINO_GPU + ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); +#elif GGML_OPENVINO_NPU + ov::CompiledModel compiled_model = core.compile_model(model, "NPU"); +#else ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); +#endif ov::InferRequest infer_request = compiled_model.create_infer_request(); infer_request.set_tensor(input0, tensor0); From 79b1bd1d4f05b18efa1a301282551f4c59637551 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 4 Dec 2024 14:09:13 +0800 Subject: [PATCH 007/156] add OpenVINO frontend convert process steps --- ggml/src/ggml-openvino.cpp | 53 ++--- ggml/src/ggml-openvino/README.md | 30 +++ ggml/src/ggml-openvino/decoder.h | 54 +++++ ggml/src/ggml-openvino/ggml-decoder.cpp | 203 ++++++++++++++++++ ggml/src/ggml-openvino/ggml-decoder.h | 69 ++++++ .../src/ggml-openvino/ggml-graph-iterator.cpp | 96 +++++++++ ggml/src/ggml-openvino/ggml-graph-iterator.h | 61 ++++++ ggml/src/ggml-openvino/graph_iterator.h | 43 ++++ ggml/src/ggml-openvino/utils.cpp | 108 ++++++++++ ggml/src/ggml-openvino/utils.h | 6 + 10 files changed, 698 insertions(+), 25 deletions(-) create mode 100644 ggml/src/ggml-openvino/README.md create mode 100644 ggml/src/ggml-openvino/decoder.h create mode 100644 ggml/src/ggml-openvino/ggml-decoder.cpp create mode 100644 ggml/src/ggml-openvino/ggml-decoder.h create mode 100644 ggml/src/ggml-openvino/ggml-graph-iterator.cpp create mode 100644 ggml/src/ggml-openvino/ggml-graph-iterator.h create mode 100644 ggml/src/ggml-openvino/graph_iterator.h create mode 100644 ggml/src/ggml-openvino/utils.cpp create mode 100644 ggml/src/ggml-openvino/utils.h diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 370c0c5d98e26..34d692a8cff90 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1,6 +1,7 @@ #include "ggml-openvino.h" #include "ggml-backend-impl.h" #include "ggml-impl.h" +#include "ggml-openvino/utils.h" #include #include @@ -234,33 +235,35 @@ static void ggml_backend_openvino_mul(ggml_tensor * dst) { } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + // for (int i = 0; i < cgraph->n_nodes; i++) { + // struct ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { - return GGML_STATUS_SUCCESS; - } + // if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { + // return GGML_STATUS_SUCCESS; + // } - switch (node->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - break; - case GGML_OP_ADD: - { - ggml_backend_openvino_add(node); - } break; - case GGML_OP_MUL: - { - ggml_backend_openvino_mul(node); - } break; - case GGML_OP_MUL_MAT: - break; - default: - GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - } - } + // switch (node->op) { + // case GGML_OP_PERMUTE: + // case GGML_OP_RESHAPE: + // case GGML_OP_TRANSPOSE: + // case GGML_OP_VIEW: + // break; + // case GGML_OP_ADD: + // { + // ggml_backend_openvino_add(node); + // } break; + // case GGML_OP_MUL: + // { + // ggml_backend_openvino_mul(node); + // } break; + // case GGML_OP_MUL_MAT: + // break; + // default: + // GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + // } + // } + + openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/README.md b/ggml/src/ggml-openvino/README.md new file mode 100644 index 0000000000000..46c2adb438653 --- /dev/null +++ b/ggml/src/ggml-openvino/README.md @@ -0,0 +1,30 @@ +# Instructions to Modify and Build ggml with OpenVINO + +## Step 1: Modify the Source Code + +In order to change the frontend `.so` path to the path to `.so` file, you need to add path to the `.so` file in cmake compiler option: +1. Open a terminal and navigate to the root directory of this repo. +2. Run the following commands to configure: + ```sh + mkdir build + cmake -B build -DGGML_OV_FRONTEND="${openvino_repo_dir}/bin/intel64/Release/libopenvino_ggml_frontend.so" + ``` +Where GGML_OV_FRONTEND should point to the path to `libopenvino_ggml_frontend.so` file. + +## Step 2: Build the Project + +After modifying the source code, you need to build the project using CMake. Follow these steps: + +1. (Optional) Enable debug option for ggml-openvino, this will output dump of subgraph sent to OpenVINO, information after convert ggml_cgraph to GraphIterator, and calculation input value/output value of each OP: + ```sh + cmake -B build -DGGML_OPENVINO_DEBUG=ON + ``` + +2. Run the following commands to configure and build the project: + ```sh + cmake -B build -DGGML_OPENVINO=ON + cmake --build build -j + ``` + +This will configure the project with OpenVINO support and build it using multiple cores for faster compilation. + diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h new file mode 100644 index 0000000000000..d2ef7587b898c --- /dev/null +++ b/ggml/src/ggml-openvino/decoder.h @@ -0,0 +1,54 @@ +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/frontend/decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +// TODO: Directly include from openvino +class GgmlDecoder : public DecoderBase { +public: + virtual ov::Any get_attribute(const std::string& name) const = 0; + + virtual PartialShape get_input_shape(size_t index) const = 0; + + virtual element::Type get_input_type(size_t index) const = 0; + + virtual size_t get_input_size() const = 0; + + virtual void get_input_node(size_t input_port_idx, + std::string& producer_name, + std::string& producer_output_port_name, + size_t& producer_output_port_index) const = 0; + + virtual bool is_graph_input(size_t index) const = 0; + + virtual std::string& get_input_name(size_t index) const = 0; + + virtual PartialShape get_output_shape(size_t index) const = 0; + + virtual element::Type get_output_type(size_t index) const = 0; + + virtual size_t get_output_size() const = 0; + + virtual bool is_graph_output(size_t index) const = 0; + + virtual int32_t* get_output_op_params(size_t index) const = 0; + + virtual std::string& get_output_name(size_t index) const = 0; + + virtual const std::string& get_op_type() const = 0; + + virtual const std::string& get_op_name() const = 0; + + // virtual const std::vector& outputs() const = 0; + + // virtual size_t output(size_t index) const = 0; + +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp new file mode 100644 index 0000000000000..4d82c756cd375 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -0,0 +1,203 @@ +#include "ggml-decoder.h" +#include +#include + +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph) + :m_cgraph(cgraph), + m_node(node), + m_op_name(std::string(m_node->name)) { + switch (m_node->op) { + // Unary OPs + case GGML_OP_UNARY: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + { + m_inputs.push_back(m_node->src[0]); + m_outputs.push_back(m_node); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); + #endif + break; + } + // SCALE + case GGML_OP_SCALE: + { + m_inputs.push_back(m_node->src[0]); + m_outputs.push_back(m_node); + #ifdef GGML_OPENVINO_DEBUG + float v; + memcpy(&v, m_node->op_params, sizeof(float)); + GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); + GGML_LOG_INFO("Scale: %f \n", v); + #endif + break; + } + // OPs with 2 inputs + case GGML_OP_ADD: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_MUL_MAT: + case GGML_OP_SUB: + case GGML_OP_GET_ROWS: + { + m_inputs.push_back(m_node->src[0]); + m_inputs.push_back(m_node->src[1]); + m_outputs.push_back(m_node); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); + GGML_LOG_INFO("Decoder input 1: %f \n", *(float*)(m_node->src[1]->data)); + #endif + break; + } + default: + break; + } +} + +ov::PartialShape GgmlOvDecoder::get_input_shape(size_t index) const { + ov::PartialShape input_shape; + // Use input_node->ne + ggml_tensor * node = m_inputs[index]; + std::vector shape; + // GGML_MAX_DIMS + // for (int i = 0; i < GGML_MAX_DIMS; ++i) { + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + if (node->ne[i] == 0) { + return input_shape; + } + shape.push_back(static_cast(node->ne[i])); + } + input_shape = ov::PartialShape(shape); + return input_shape; +} + +ov::element::Type GgmlOvDecoder::get_input_type(size_t index) const { + ov::element::Type type = ov::element::dynamic; + // GGML_LOG_DEBUG("%d\n", m_inputs[index]->type); + switch (m_inputs[index]->type) { + case GGML_TYPE_F32: + type = ov::element::f32; + break; + case GGML_TYPE_F16: + type = ov::element::f16; + break; + case GGML_TYPE_I64: + type = ov::element::i64; + break; + case GGML_TYPE_I32: + type = ov::element::i32; + break; + default: + break; + } + return type; +} + +size_t GgmlOvDecoder::get_input_size() const { + return m_inputs.size(); +} + +bool GgmlOvDecoder::is_graph_input(size_t index) const { + if (m_inputs[index]->flags & GGML_TENSOR_FLAG_INPUT ) { + return true; + } + return false; +} + +std::string& GgmlOvDecoder::get_input_name(size_t index) const { + m_name = std::string(m_inputs[index]->name); + return m_name; +} + +ov::PartialShape GgmlOvDecoder::get_output_shape(size_t index) const { + ov::PartialShape output_shape; + // Use input_node->ne + ggml_tensor * node = m_outputs[index]; + std::vector shape; + // GGML_MAX_DIMS + // for (int i = 0; i < GGML_MAX_DIMS; ++i) { + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + if (node->ne[i] == 0 ) { + // empty if any dimension has no elements + return output_shape; + } + shape.push_back(static_cast(node->ne[i])); + } + output_shape = ov::PartialShape(shape); + return output_shape; +} + +ov::element::Type GgmlOvDecoder::get_output_type(size_t index) const { + // TODO: Change to Output + ov::element::Type type = ov::element::dynamic; + // GGML_LOG_DEBUG("%d\n", m_outputs[index]->type); + switch (m_outputs[index]->type) { + case GGML_TYPE_F32: + type = ov::element::f32; + break; + case GGML_TYPE_F16: + type = ov::element::f16; + break; + case GGML_TYPE_I64: + type = ov::element::i64; + break; + case GGML_TYPE_I32: + type = ov::element::i32; + break; + default: + break; + } + return type; +} + +bool GgmlOvDecoder::is_graph_output(size_t index) const { + if (m_outputs[index]->flags & GGML_TENSOR_FLAG_OUTPUT) { + return true; + } + return false; +} + +int32_t* GgmlOvDecoder::get_output_op_params(size_t index) const{ + return m_outputs[index]->op_params; +} + +size_t GgmlOvDecoder::get_output_size() const { + return m_outputs.size(); +} + +std::string& GgmlOvDecoder::get_output_name(size_t index) const { + m_name = std::string(m_outputs[index]->name); + return m_name; +} + +const std::string& GgmlOvDecoder::get_op_name() const { + return m_op_name; +} + +const std::string& GgmlOvDecoder::get_op_type() const { + static const std::map opTypeMap = { + {GGML_OP_ACC, "GGML_OP_ACC"}, + {GGML_OP_ADD, "GGML_OP_ADD"}, + {GGML_OP_ADD1, "GGML_OP_ADD1"}, + {GGML_OP_DIV, "GGML_OP_DIV"}, + {GGML_OP_DUP, "GGML_OP_DUP"}, + {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, + {GGML_OP_MUL, "GGML_OP_MUL"}, + {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, + {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, + {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, + {GGML_OP_SCALE, "GGML_OP_SCALE"}, + {GGML_OP_SUB, "GGML_OP_SUB"}, + {GGML_OP_UNARY, "GGML_OP_UNARY"}, + {GGML_OP_VIEW, "GGML_OP_VIEW"} + }; + auto it = opTypeMap.find(m_node->op); + if (it != opTypeMap.end()) { + return it->second; + } else { + static const std::string unknown_op = "UNKNOWN_OP"; + return unknown_op; + } + // static std::string op_type = ggml_op_name(m_node->op); + // return op_type; +} diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h new file mode 100644 index 0000000000000..3048e2e7e9649 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -0,0 +1,69 @@ +#pragma once + +#include "decoder.h" +#include "ggml.h" + +class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { +public: + using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; + GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph); + + virtual ov::Any get_attribute(const std::string& name) const override { + return nullptr; + GGML_UNUSED(name); + } + + virtual ov::PartialShape get_input_shape(size_t index) const override; + + virtual ov::element::Type get_input_type(size_t index) const override; + + virtual size_t get_input_size() const override; + + virtual void get_input_node(size_t input_port_idx, + std::string& producer_name, + std::string& producer_output_port_name, + size_t& producer_output_port_index) const override { + GGML_UNUSED(input_port_idx); + GGML_UNUSED(producer_name); + GGML_UNUSED(producer_output_port_name); + GGML_UNUSED(producer_output_port_index); + } + + virtual bool is_graph_input(size_t index) const override; + + virtual std::string& get_input_name(size_t index) const override; + + virtual ov::PartialShape get_output_shape(size_t index) const override; + + virtual ov::element::Type get_output_type(size_t index) const override; + + virtual size_t get_output_size() const override; + + virtual bool is_graph_output(size_t index) const override; + + virtual int32_t* get_output_op_params(size_t index) const override; + + virtual std::string& get_output_name(size_t index) const override; + + virtual const std::string& get_op_type() const override; + + virtual const std::string& get_op_name() const override; + + const ggml_tensor* get_input_ggml_tensor(size_t index) const { + return m_inputs[index]; + } + + // virtual const std::vector& outputs() const override; + + // virtual size_t output(size_t index) const override; + +private: + size_t m_index; + struct ggml_cgraph * m_cgraph; + std::vector m_inputs; + std::vector m_outputs; + ggml_tensor * m_node; + const std::string m_op_name; + mutable std::string m_name; +}; + diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp new file mode 100644 index 0000000000000..17a9b7ecfe9ce --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp @@ -0,0 +1,96 @@ +#include "ggml-graph-iterator.h" +#include +#include + +namespace ov { +namespace frontend { +namespace tensorflow { +namespace ggml { + +GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph) + :m_cgraph(cgraph) { + initialize_decoders(); + #ifdef GGML_OPENVINO_DEBUG + dump_graph_iterator(); + #endif +} + + void GgmlOvGraphIterator::initialize_decoders() { + auto nodes_size = m_cgraph->n_nodes; + // Initialize decoder for each node + // m_decoders.resize(static_cast(nodes_size)); + + for (int i = 0; i < nodes_size; ++i) { + // Skip View Op + if (m_cgraph->nodes[i] ->op == GGML_OP_VIEW || m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE) { + continue; + } + auto decoder = std::make_shared(m_cgraph->nodes[i], m_cgraph); + m_decoders.push_back(decoder); + for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { + // if (i == 0 || decoder->is_graph_input(inp)) { + m_input_names.push_back(decoder->get_input_name(inp)); + // } + } + for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { + if (i == nodes_size - 1 || decoder->is_graph_output(inp)) { + m_output_names.push_back(decoder->get_output_name(inp)); + } + } + } + +} + +void GgmlOvGraphIterator::reset() { + node_index = 0; + } + +size_t GgmlOvGraphIterator::size() const { + return m_decoders.size(); +} + +void GgmlOvGraphIterator::next() { + node_index++; +} + +bool GgmlOvGraphIterator::is_end() const { + return node_index >= m_decoders.size(); +} + +std::shared_ptr GgmlOvGraphIterator::get_decoder() const { + return m_decoders[node_index]; +} + +std::vector GgmlOvGraphIterator::get_input_names() const { + return m_input_names; +} + +std::vector GgmlOvGraphIterator::get_output_names() const { + return m_output_names; +} + +void GgmlOvGraphIterator::dump_graph_iterator() const { + for (size_t i = 0; i < m_decoders.size(); ++i) { + GGML_LOG_INFO("OP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str()); + for (size_t inp = 0; inp < m_decoders[i]->get_input_size(); ++inp) { + ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_input_shape(inp); + ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_input_type(inp); + GGML_LOG_INFO("Input name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_input_name(inp).c_str()); + GGML_LOG_INFO("Input shape: %s\n", pshape.to_string().c_str()); + GGML_LOG_INFO("Input type: %s\n", ptype.to_string().c_str()); + } + for (size_t outp = 0; outp < std::dynamic_pointer_cast(m_decoders[i])->get_output_size(); ++outp) { + ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_output_shape(outp); + ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_output_type(outp); + GGML_LOG_INFO("Output name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_output_name(outp).c_str()); + GGML_LOG_INFO("Output shape: %s\n", pshape.to_string().c_str()); + GGML_LOG_INFO("Output type: %s\n", ptype.to_string().c_str()); + + } + } +} + +} +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.h b/ggml/src/ggml-openvino/ggml-graph-iterator.h new file mode 100644 index 0000000000000..305afb5c98f87 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-graph-iterator.h @@ -0,0 +1,61 @@ +#pragma once + +#include "graph_iterator.h" +#include "ggml-decoder.h" +#include + +// To remove tensorflow +namespace ov { +namespace frontend { +namespace tensorflow { +namespace ggml { + +class GgmlOvGraphIterator : public GgmlGraphIterator { + +protected: + void initialize_decoders(); + +public: + using Ptr = std::shared_ptr; + GgmlOvGraphIterator(struct ggml_cgraph * cgraph); + + /// \brief Get a number of operation nodes in the sgraph + virtual size_t size() const override; + + /// \brief Set iterator to the start position + virtual void reset() override; + + /// \brief Move to the next node in the graph + virtual void next() override; + + /// \brief Returns true if iterator goes out of the range of available nodes + virtual bool is_end() const override; + + /// \brief Return a pointer to a decoder of the current node + virtual std::shared_ptr get_decoder() const override; + + virtual std::shared_ptr get_body_graph_iterator(const std::string& func_name) const override { + return nullptr; + GGML_UNUSED(func_name); + } + + /// \brief Returns a vector of input names in the original order + virtual std::vector get_input_names() const override; + + /// \brief Returns a vector of output names in the original order + virtual std::vector get_output_names() const override; + + virtual void dump_graph_iterator() const; + +private: + struct ggml_cgraph * m_cgraph; + size_t node_index = 0; + std::vector> m_decoders; + std::vector m_input_names; + std::vector m_output_names; +}; + +} +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/graph_iterator.h b/ggml/src/ggml-openvino/graph_iterator.h new file mode 100644 index 0000000000000..e0b475e445e9b --- /dev/null +++ b/ggml/src/ggml-openvino/graph_iterator.h @@ -0,0 +1,43 @@ +#pragma once + +#include "openvino/frontend/graph_iterator.hpp" + +namespace ov { +namespace frontend { +namespace tensorflow { // To be Removed +namespace ggml { + +// TODO: Directly include from openvino +class GgmlGraphIterator : public GraphIterator { +public: + + virtual size_t size() const = 0; + + virtual void reset() = 0; + + virtual void next() = 0; + + virtual bool is_end() const = 0; + + virtual std::shared_ptr get_decoder() const = 0; + + virtual std::vector get_input_names() const = 0; + + virtual std::vector get_output_names() const = 0; + + virtual std::shared_ptr get_body_graph_iterator(const std::string& func_name) const = 0; + + virtual std::map get_input_names_map() const { + return {}; + } + + virtual std::map get_output_names_map() const { + return {}; + } + +}; + +} +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp new file mode 100644 index 0000000000000..905e2f4197e01 --- /dev/null +++ b/ggml/src/ggml-openvino/utils.cpp @@ -0,0 +1,108 @@ +#include "utils.h" +#include "ggml-backend-impl.h" +#include +#include + +using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; + +std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph) { + return std::make_shared(cgraph); +} + +std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_graph_iterator) { + std::map input_tensors; + auto input_names = ggml_graph_iterator->get_input_names(); + ggml_graph_iterator->reset(); + for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { + auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); + for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { + if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { + auto input_data = decoder->get_input_ggml_tensor(inp)->data; + ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); + input_tensors[decoder->get_input_name(inp)] = input_tensor; + } + } + } + return input_tensors; +} + +static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { + ov::frontend::FrontEnd::Ptr front_end = nullptr; + auto fem = ov::frontend::FrontEndManager(); + std::string fe_so_path; +#ifdef GGML_OV_FRONTEND + fe_so_path = GGML_OV_FRONTEND; +#endif + fem.register_front_end("ggml", fe_so_path); + front_end = fem.load_by_framework("ggml"); + return front_end; +} + +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ov::Core core; + auto devices = core.get_available_devices(); + // Get GGML Frontend + auto front_end = get_ggml_frontend(); + if (!front_end) { + GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); + return GGML_STATUS_FAILED; + } else { + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("GGML FrontEnd is initialized \n"); + #endif + } + + auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); + std::shared_ptr graph_iterator = ggml_graph_iterator; + + // Load GraphIterator -> InputModel + ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); + if (!input_model) { + GGML_LOG_ERROR("Input Model is not loaded \n"); + return GGML_STATUS_FAILED; + } else { + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Input Model loaded \n"); + #endif + } + + // Convert InputModel -> ov::Model + std::shared_ptr model = front_end->convert(input_model); + if (!model) { + GGML_LOG_ERROR("Model is not converted \n"); + } else { + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Model converted \n"); + #endif + } + + + // Loading a model to the device + ov::CompiledModel compiled_model = core.compile_model(model); + + // Create infer request + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // Get input tensor + auto input_names = ggml_graph_iterator->get_input_names(); + auto input_tensors = get_ggml_graph_input_tensors(ggml_graph_iterator); + + // Set input tensor + for (size_t i = 0; i < input_names.size(); i++) { + infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + } + + infer_request.infer(); + + ov::Tensor output_tensor = infer_request.get_output_tensor(); + // Put data in output tensor to the last node -> data in cgraph + // Get output type + ggml_tensor* dst = cgraph->nodes[cgraph->n_nodes - 1]; + std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Output: %f\n", *output_tensor.data()); + #endif + + return GGML_STATUS_SUCCESS; + GGML_UNUSED(backend); +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h new file mode 100644 index 0000000000000..15dd46ed4ef99 --- /dev/null +++ b/ggml/src/ggml-openvino/utils.h @@ -0,0 +1,6 @@ +#include "ggml-graph-iterator.h" +#include "ggml-backend-impl.h" + +std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph); + +enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); From 57cbd46e1c3381ccbd3c5448f911bbaeef6c2206 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 5 Dec 2024 16:58:36 +0800 Subject: [PATCH 008/156] add get openvino available ops function --- ggml/src/ggml-openvino.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 34d692a8cff90..c25a927c30f74 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -458,6 +458,17 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g return nullptr; } +std::set get_openvino_available_opsets() { + ov::Core core; + std::set unique_ops; + for (const auto& opset : ov::get_available_opsets()) { + for (const auto& op : opset.second().get_type_info_set()) { + unique_ops.insert(op.name).second; + } + } + return unique_ops; +} + static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; From 8d0486fe2ef650737a898b81e2df4d5bb8eafbf7 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Sat, 16 Nov 2024 12:52:19 +0800 Subject: [PATCH 009/156] Add PoC of integration of openvino frontend. Main changes: ggml-ov-frontend-utils, GraphIterator, Decoder --- ggml/src/ggml-openvino.cpp | 2 +- .../ggml-openvino/ggml-ov-frontend-utils.cpp | 54 +++++++++++++++++++ .../ggml-openvino/ggml-ov-frontend-utils.h | 6 +++ 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp create mode 100644 ggml/src/ggml-openvino/ggml-ov-frontend-utils.h diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index c25a927c30f74..c33e3f2be0039 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -487,7 +487,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con case GGML_OP_MUL_MAT: return false; default: - return false; + return true; } } diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp new file mode 100644 index 0000000000000..f1b865aacfcbc --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp @@ -0,0 +1,54 @@ +#include "ggml-ov-frontend-utils.h" +#include "ggml-backend-impl.h" +#include + +using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; + +std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph) { + return std::make_shared(cgraph); +} + +static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { + ov::frontend::FrontEnd::Ptr front_end = nullptr; + auto fem = ov::frontend::FrontEndManager(); + std::string fe_so_path = "/home/yumeng/Code/ov-ggml-frontend/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; + fem.register_front_end("ggml", fe_so_path); + front_end = fem.load_by_framework("ggml"); + return front_end; +} + +enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph) { + // Get GGML Frontend + auto front_end = get_ggml_frontend(); + if (!front_end) { + GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); + return GGML_STATUS_FAILED; + } else { + GGML_LOG_ERROR("GGML FrontEnd is initialized \n"); + } + + auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); + std::shared_ptr graph_iterator = ggml_graph_iterator; + GGML_LOG_ERROR("Decoder count in current GraphIterator: "); + GGML_LOG_ERROR(std::to_string(graph_iterator->size()).c_str()); + + // Load GraphIterator -> InputModel + ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); + if (!input_model) { + GGML_LOG_ERROR("\nInput Model is not loaded \n"); + return GGML_STATUS_FAILED; + } else { + GGML_LOG_ERROR("\nInput Model loaded \n"); + } + + // TODO: Convert InputModel -> ov::Model + // std::shared_ptr model = front_end->convert(input_model); + // if (!model) { + // GGML_LOG_ERROR("Model is not converted"); + // } + + // TODO: Compute + + return GGML_STATUS_SUCCESS; + GGML_UNUSED(backend); +} diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h new file mode 100644 index 0000000000000..15dd46ed4ef99 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h @@ -0,0 +1,6 @@ +#include "ggml-graph-iterator.h" +#include "ggml-backend-impl.h" + +std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph); + +enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); From ced2425c81877ac53d676ae861891acb8a2e8964 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Tue, 19 Nov 2024 10:25:31 +0800 Subject: [PATCH 010/156] Implement GgmlOvDecoder. Add dump functions. --- ggml/src/ggml-openvino/decoder.h | 2 ++ ggml/src/ggml-openvino/ggml-decoder.h | 2 ++ ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp | 3 +-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index d2ef7587b898c..e047235d88b78 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -39,6 +39,8 @@ class GgmlDecoder : public DecoderBase { virtual std::string& get_output_name(size_t index) const = 0; + virtual size_t get_output_size() const = 0; + virtual const std::string& get_op_type() const = 0; virtual const std::string& get_op_name() const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 3048e2e7e9649..96398d3f83e97 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -45,6 +45,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual std::string& get_output_name(size_t index) const override; + size_t get_output_size() const override; + virtual const std::string& get_op_type() const override; virtual const std::string& get_op_name() const override; diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp index f1b865aacfcbc..fd5921b476356 100644 --- a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp +++ b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp @@ -29,8 +29,7 @@ enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_ auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); std::shared_ptr graph_iterator = ggml_graph_iterator; - GGML_LOG_ERROR("Decoder count in current GraphIterator: "); - GGML_LOG_ERROR(std::to_string(graph_iterator->size()).c_str()); + GGML_LOG_ERROR("Decoder count in current GraphIterator: %s\n", std::to_string(graph_iterator->size()).c_str()); // Load GraphIterator -> InputModel ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); From 3ff257311031fe5f5ac2f6e5a87b55bd897d1bb9 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Fri, 22 Nov 2024 13:10:14 +0800 Subject: [PATCH 011/156] Convert subgraph with add, sub, mul, div op to ov model and do infer on openvino device --- ggml/src/ggml-openvino.cpp | 3 +- ggml/src/ggml-openvino/decoder.h | 4 + ggml/src/ggml-openvino/ggml-decoder.h | 6 +- .../ggml-openvino/ggml-ov-frontend-utils.cpp | 73 ++++++++++++++++--- 4 files changed, 75 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index c33e3f2be0039..ea12c05ac728d 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -268,6 +268,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); + GGML_UNUSED(ctx); } static const ggml_backend_i ggml_backend_openvino_interface = { @@ -487,7 +488,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con case GGML_OP_MUL_MAT: return false; default: - return true; + return false; } } diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index e047235d88b78..be943716f298c 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -41,6 +41,10 @@ class GgmlDecoder : public DecoderBase { virtual size_t get_output_size() const = 0; + virtual bool is_graph_output(size_t index) const = 0; + + virtual std::string& get_output_name(size_t index) const = 0; + virtual const std::string& get_op_type() const = 0; virtual const std::string& get_op_name() const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 96398d3f83e97..1eaba59426498 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -45,7 +45,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual std::string& get_output_name(size_t index) const override; - size_t get_output_size() const override; + virtual size_t get_output_size() const override; + + virtual bool is_graph_output(size_t index) const override; + + virtual std::string& get_output_name(size_t index) const override; virtual const std::string& get_op_type() const override; diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp index fd5921b476356..10107cbfd0c45 100644 --- a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp +++ b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp @@ -1,6 +1,7 @@ #include "ggml-ov-frontend-utils.h" #include "ggml-backend-impl.h" #include +#include using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; @@ -8,9 +9,27 @@ std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph return std::make_shared(cgraph); } +std::vector get_ggml_graph_input_tensors(std::shared_ptr ggml_graph_iterator) { + std::vector input_tensors; + auto input_names = ggml_graph_iterator->get_input_names(); + ggml_graph_iterator->reset(); + for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { + auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); + for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { + if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { + auto input_data = decoder->get_input_ggml_tensor(inp)->data; + ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); + input_tensors.push_back(input_tensor); + } + } + } + return input_tensors; +} + static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { ov::frontend::FrontEnd::Ptr front_end = nullptr; auto fem = ov::frontend::FrontEndManager(); + // std::string fe_so_path = "/home/yumeng/Code/test/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; std::string fe_so_path = "/home/yumeng/Code/ov-ggml-frontend/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; fem.register_front_end("ggml", fe_so_path); front_end = fem.load_by_framework("ggml"); @@ -18,36 +37,72 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { } enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ov::Core core; + auto devices = core.get_available_devices(); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Device numbers: %d\n", devices.size()); + #endif // Get GGML Frontend auto front_end = get_ggml_frontend(); if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); return GGML_STATUS_FAILED; } else { - GGML_LOG_ERROR("GGML FrontEnd is initialized \n"); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("GGML FrontEnd is initialized \n"); + #endif } auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); std::shared_ptr graph_iterator = ggml_graph_iterator; - GGML_LOG_ERROR("Decoder count in current GraphIterator: %s\n", std::to_string(graph_iterator->size()).c_str()); // Load GraphIterator -> InputModel ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); if (!input_model) { - GGML_LOG_ERROR("\nInput Model is not loaded \n"); + GGML_LOG_ERROR("Input Model is not loaded \n"); return GGML_STATUS_FAILED; } else { - GGML_LOG_ERROR("\nInput Model loaded \n"); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Input Model loaded \n"); + #endif } // TODO: Convert InputModel -> ov::Model - // std::shared_ptr model = front_end->convert(input_model); - // if (!model) { - // GGML_LOG_ERROR("Model is not converted"); - // } + std::shared_ptr model = front_end->convert(input_model); + if (!model) { + GGML_LOG_ERROR("Model is not converted \n"); + } else { + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Model converted \n"); + #endif + } + - // TODO: Compute + // Loading a model to the device + ov::CompiledModel compiled_model = core.compile_model(model); + // Create infer request + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // Get input tensor + auto input_tensor = get_ggml_graph_input_tensors(ggml_graph_iterator); + + // Set input tensor + for (size_t i = 0; i < input_tensor.size(); i++) { + infer_request.set_input_tensor(i, input_tensor[i]); + } + + infer_request.infer(); + + ov::Tensor output_tensor = infer_request.get_output_tensor(); + // Put data in output tensor to the last node -> data in cgraph + // Get output type + ggml_tensor* dst = cgraph->nodes[cgraph->n_nodes - 1]; + std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("%f\n", *output_tensor.data()); + #endif + return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } From f16ab81a152432387b9b5d519b43dd42c4ea9566 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Sat, 23 Nov 2024 06:03:08 +0800 Subject: [PATCH 012/156] Add GGML_OV_FRONTEND option. Add readme. --- .../ggml-openvino/ggml-ov-frontend-utils.cpp | 108 ------------------ .../ggml-openvino/ggml-ov-frontend-utils.h | 6 - 2 files changed, 114 deletions(-) delete mode 100644 ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp delete mode 100644 ggml/src/ggml-openvino/ggml-ov-frontend-utils.h diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp deleted file mode 100644 index 10107cbfd0c45..0000000000000 --- a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include "ggml-ov-frontend-utils.h" -#include "ggml-backend-impl.h" -#include -#include - -using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; - -std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph) { - return std::make_shared(cgraph); -} - -std::vector get_ggml_graph_input_tensors(std::shared_ptr ggml_graph_iterator) { - std::vector input_tensors; - auto input_names = ggml_graph_iterator->get_input_names(); - ggml_graph_iterator->reset(); - for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { - auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); - for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { - if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { - auto input_data = decoder->get_input_ggml_tensor(inp)->data; - ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); - input_tensors.push_back(input_tensor); - } - } - } - return input_tensors; -} - -static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { - ov::frontend::FrontEnd::Ptr front_end = nullptr; - auto fem = ov::frontend::FrontEndManager(); - // std::string fe_so_path = "/home/yumeng/Code/test/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; - std::string fe_so_path = "/home/yumeng/Code/ov-ggml-frontend/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; - fem.register_front_end("ggml", fe_so_path); - front_end = fem.load_by_framework("ggml"); - return front_end; -} - -enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph) { - ov::Core core; - auto devices = core.get_available_devices(); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Device numbers: %d\n", devices.size()); - #endif - // Get GGML Frontend - auto front_end = get_ggml_frontend(); - if (!front_end) { - GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); - return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("GGML FrontEnd is initialized \n"); - #endif - } - - auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); - std::shared_ptr graph_iterator = ggml_graph_iterator; - - // Load GraphIterator -> InputModel - ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); - if (!input_model) { - GGML_LOG_ERROR("Input Model is not loaded \n"); - return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Input Model loaded \n"); - #endif - } - - // TODO: Convert InputModel -> ov::Model - std::shared_ptr model = front_end->convert(input_model); - if (!model) { - GGML_LOG_ERROR("Model is not converted \n"); - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Model converted \n"); - #endif - } - - - // Loading a model to the device - ov::CompiledModel compiled_model = core.compile_model(model); - - // Create infer request - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - // Get input tensor - auto input_tensor = get_ggml_graph_input_tensors(ggml_graph_iterator); - - // Set input tensor - for (size_t i = 0; i < input_tensor.size(); i++) { - infer_request.set_input_tensor(i, input_tensor[i]); - } - - infer_request.infer(); - - ov::Tensor output_tensor = infer_request.get_output_tensor(); - // Put data in output tensor to the last node -> data in cgraph - // Get output type - ggml_tensor* dst = cgraph->nodes[cgraph->n_nodes - 1]; - std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("%f\n", *output_tensor.data()); - #endif - - return GGML_STATUS_SUCCESS; - GGML_UNUSED(backend); -} diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h deleted file mode 100644 index 15dd46ed4ef99..0000000000000 --- a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h +++ /dev/null @@ -1,6 +0,0 @@ -#include "ggml-graph-iterator.h" -#include "ggml-backend-impl.h" - -std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph); - -enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); From 3975148d225e4ef6f896521273e59fde2394e9c1 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Fri, 6 Dec 2024 07:37:58 +0800 Subject: [PATCH 013/156] Change output for infer request to set output tensor. Support scale, view op. --- ggml/src/ggml-openvino/ggml-decoder.cpp | 43 ++++++++++++------- ggml/src/ggml-openvino/ggml-decoder.h | 4 ++ .../src/ggml-openvino/ggml-graph-iterator.cpp | 27 ++++++------ ggml/src/ggml-openvino/utils.cpp | 41 ++++++++++++++---- 4 files changed, 78 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4d82c756cd375..b367987372470 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -10,13 +10,21 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr // Unary OPs case GGML_OP_UNARY: case GGML_OP_RESHAPE: - case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + case GGML_OP_CONT: + case GGML_OP_CPY: + case GGML_OP_RMS_NORM: { m_inputs.push_back(m_node->src[0]); m_outputs.push_back(m_node); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); - #endif + break; + } + // For view, input is m_node itself + case GGML_OP_VIEW: + { + m_inputs.push_back(m_node); + m_outputs.push_back(m_node); break; } // SCALE @@ -24,12 +32,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr { m_inputs.push_back(m_node->src[0]); m_outputs.push_back(m_node); - #ifdef GGML_OPENVINO_DEBUG - float v; - memcpy(&v, m_node->op_params, sizeof(float)); - GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); - GGML_LOG_INFO("Scale: %f \n", v); - #endif break; } // OPs with 2 inputs @@ -39,14 +41,20 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr case GGML_OP_MUL_MAT: case GGML_OP_SUB: case GGML_OP_GET_ROWS: + case GGML_OP_SOFT_MAX: + { + m_inputs.push_back(m_node->src[0]); + m_inputs.push_back(m_node->src[1]); + m_outputs.push_back(m_node); + break; + } + // OPs with 3 inputs: + case GGML_OP_ROPE: { m_inputs.push_back(m_node->src[0]); m_inputs.push_back(m_node->src[1]); + m_inputs.push_back(m_node->src[2]); // ??? m_outputs.push_back(m_node); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); - GGML_LOG_INFO("Decoder input 1: %f \n", *(float*)(m_node->src[1]->data)); - #endif break; } default: @@ -130,7 +138,6 @@ ov::PartialShape GgmlOvDecoder::get_output_shape(size_t index) const { ov::element::Type GgmlOvDecoder::get_output_type(size_t index) const { // TODO: Change to Output ov::element::Type type = ov::element::dynamic; - // GGML_LOG_DEBUG("%d\n", m_outputs[index]->type); switch (m_outputs[index]->type) { case GGML_TYPE_F32: type = ov::element::f32; @@ -179,6 +186,8 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"}, {GGML_OP_ADD1, "GGML_OP_ADD1"}, + {GGML_OP_CONT, "GGML_OP_CONT"}, + {GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"}, {GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, @@ -186,8 +195,12 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, + {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, + {GGML_OP_ROPE, "GGML_OP_ROPE"}, {GGML_OP_SCALE, "GGML_OP_SCALE"}, + {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, {GGML_OP_SUB, "GGML_OP_SUB"}, + {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, {GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"} }; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 1eaba59426498..ceae589ed494a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -59,6 +59,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_inputs[index]; } + const ggml_tensor* get_output_ggml_tensor(size_t index) const { + return m_outputs[index]; + } + // virtual const std::vector& outputs() const override; // virtual size_t output(size_t index) const override; diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp index 17a9b7ecfe9ce..44e119a1ac5e3 100644 --- a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp +++ b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp @@ -15,16 +15,17 @@ GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph) #endif } - void GgmlOvGraphIterator::initialize_decoders() { +void GgmlOvGraphIterator::initialize_decoders() { auto nodes_size = m_cgraph->n_nodes; // Initialize decoder for each node // m_decoders.resize(static_cast(nodes_size)); for (int i = 0; i < nodes_size; ++i) { // Skip View Op - if (m_cgraph->nodes[i] ->op == GGML_OP_VIEW || m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE) { - continue; - } + // if (m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE + // || m_cgraph->nodes[i] ->op == GGML_OP_CPY ) { + // continue; + // } auto decoder = std::make_shared(m_cgraph->nodes[i], m_cgraph); m_decoders.push_back(decoder); for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { @@ -33,9 +34,9 @@ GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph) // } } for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { - if (i == nodes_size - 1 || decoder->is_graph_output(inp)) { + // if (i == nodes_size - 1 || decoder->is_graph_output(inp)) { m_output_names.push_back(decoder->get_output_name(inp)); - } + // } } } @@ -71,20 +72,20 @@ std::vector GgmlOvGraphIterator::get_output_names() const { void GgmlOvGraphIterator::dump_graph_iterator() const { for (size_t i = 0; i < m_decoders.size(); ++i) { - GGML_LOG_INFO("OP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str()); + GGML_LOG_INFO("\nOP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str()); for (size_t inp = 0; inp < m_decoders[i]->get_input_size(); ++inp) { ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_input_shape(inp); ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_input_type(inp); - GGML_LOG_INFO("Input name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_input_name(inp).c_str()); - GGML_LOG_INFO("Input shape: %s\n", pshape.to_string().c_str()); - GGML_LOG_INFO("Input type: %s\n", ptype.to_string().c_str()); + GGML_LOG_INFO("- Input name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_input_name(inp).c_str()); + GGML_LOG_INFO(" Input shape: %s\n", pshape.to_string().c_str()); + GGML_LOG_INFO(" Input type: %s\n", ptype.to_string().c_str()); } for (size_t outp = 0; outp < std::dynamic_pointer_cast(m_decoders[i])->get_output_size(); ++outp) { ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_output_shape(outp); ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_output_type(outp); - GGML_LOG_INFO("Output name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_output_name(outp).c_str()); - GGML_LOG_INFO("Output shape: %s\n", pshape.to_string().c_str()); - GGML_LOG_INFO("Output type: %s\n", ptype.to_string().c_str()); + GGML_LOG_INFO("- Output name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_output_name(outp).c_str()); + GGML_LOG_INFO(" Output shape: %s\n", pshape.to_string().c_str()); + GGML_LOG_INFO(" Output type: %s\n", ptype.to_string().c_str()); } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 905e2f4197e01..db52b1f81d192 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -18,6 +18,9 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_size(); ++inp) { if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { auto input_data = decoder->get_input_ggml_tensor(inp)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); + #endif ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); input_tensors[decoder->get_input_name(inp)] = input_tensor; } @@ -26,6 +29,27 @@ std::map get_ggml_graph_input_tensors(std::shared_ptr get_ggml_graph_output_tensors(std::shared_ptr ggml_graph_iterator) { + std::map output_tensors; + auto output_names = ggml_graph_iterator->get_output_names(); + ggml_graph_iterator->reset(); + for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { + auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); + for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { + if (std::find(output_names.begin(), output_names.end(), decoder->get_output_name(inp)) != output_names.end()) { + auto output_data = decoder->get_output_ggml_tensor(inp)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Output %d: %g\n", inp, *(double*)(output_data)); + #endif + ov::Tensor output_tensor = ov::Tensor(decoder->get_output_type(inp), decoder->get_output_shape(inp).to_shape(), output_data); + output_tensors[decoder->get_output_name(inp)] = output_tensor; + } + } + } + return output_tensors; +} + + static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { ov::frontend::FrontEnd::Ptr front_end = nullptr; auto fem = ov::frontend::FrontEndManager(); @@ -92,16 +116,15 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c infer_request.set_input_tensor(i, input_tensors[input_names[i]]); } - infer_request.infer(); + // Set output tensor - ov::Tensor output_tensor = infer_request.get_output_tensor(); - // Put data in output tensor to the last node -> data in cgraph - // Get output type - ggml_tensor* dst = cgraph->nodes[cgraph->n_nodes - 1]; - std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Output: %f\n", *output_tensor.data()); - #endif + auto output_names = ggml_graph_iterator->get_output_names(); + auto output_tensors = get_ggml_graph_output_tensors(ggml_graph_iterator); + for (size_t i = 0; i < output_names.size(); i++) { + infer_request.set_output_tensor(i, output_tensors[output_names[i]]); + } + + infer_request.infer(); return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); From 7f0c3bc660adb2f395437b639fbb9cc442d78851 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 9 Dec 2024 10:09:13 +0800 Subject: [PATCH 014/156] add GET_ROWS operator of OpenVINO to GGML of llama.cpp --- ggml/src/ggml-openvino.cpp | 146 ++++++++++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 26 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index ea12c05ac728d..0a1e969c9f5e7 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -234,36 +234,130 @@ static void ggml_backend_openvino_mul(ggml_tensor * dst) { } } +void ggml_compute_forward_get_rows_f16(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + const struct ggml_tensor *src1 = dst->src[1]; + + ov::Core core; + + ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // [3072, 7] + ov::Shape shape1 = {static_cast(src1->ne[0])}; // [7] + + ov::Tensor tensor0(ov::element::f16, shape0, src0->data); + ov::Tensor tensor1(ov::element::i32, shape1, src1->data); + + auto input0 = std::make_shared(ov::element::f16, shape0); + auto input1 = std::make_shared(ov::element::i32, shape1); + + auto gather = std::make_shared(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); + + auto model = std::make_shared(gather, ov::ParameterVector{input0, input1}); + ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); + + ov::InferRequest infer_request = compiled_model.create_infer_request(); + infer_request.set_tensor(input0, tensor0); + infer_request.set_tensor(input1, tensor1); + + infer_request.infer(); + + ov::Tensor output_tensor = infer_request.get_output_tensor(); + // Convert output tensor data type from f16 to f32 + ov::Tensor output_tensor_f32 = ov::Tensor(ov::element::f32, output_tensor.get_shape()); + for (size_t i = 0; i < output_tensor.get_size(); ++i) { + output_tensor_f32.data()[i] = static_cast(output_tensor.data()[i]); + } + + // Copy the converted data to dst->data + std::memcpy(dst->data, output_tensor_f32.data(), output_tensor_f32.get_byte_size()); +} + +void ggml_compute_forward_get_rows_f32(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + const struct ggml_tensor *src1 = dst->src[1]; + + ov::Core core; + + ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // [3072, 7] + ov::Shape shape1 = {static_cast(src1->ne[0])}; // [7] + + ov::Tensor tensor0(ov::element::f32, shape0, src0->data); + ov::Tensor tensor1(ov::element::i32, shape1, src1->data); + + auto input0 = std::make_shared(ov::element::f32, shape0); + auto input1 = std::make_shared(ov::element::i32, shape1); + + auto gather = std::make_shared(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); + + auto model = std::make_shared(gather, ov::ParameterVector{input0, input1}); + ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); + + ov::InferRequest infer_request = compiled_model.create_infer_request(); + infer_request.set_tensor(input0, tensor0); + infer_request.set_tensor(input1, tensor1); + + infer_request.infer(); + + ov::Tensor output_tensor = infer_request.get_output_tensor(); + + // Copy the converted data to dst->data + std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); +} + +void ggml_compute_forward_get_rows(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + const struct ggml_tensor *src1 = dst->src[1]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rows_f16(dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_get_rows_f32(dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } + +} + static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - // for (int i = 0; i < cgraph->n_nodes; i++) { - // struct ggml_tensor * node = cgraph->nodes[i]; + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; - // if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { - // return GGML_STATUS_SUCCESS; - // } + if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { + return GGML_STATUS_SUCCESS; + } - // switch (node->op) { - // case GGML_OP_PERMUTE: - // case GGML_OP_RESHAPE: - // case GGML_OP_TRANSPOSE: - // case GGML_OP_VIEW: - // break; - // case GGML_OP_ADD: - // { - // ggml_backend_openvino_add(node); - // } break; - // case GGML_OP_MUL: - // { - // ggml_backend_openvino_mul(node); - // } break; - // case GGML_OP_MUL_MAT: - // break; - // default: - // GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - // } - // } + switch (node->op) { + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + break; + case GGML_OP_ADD: + { + ggml_backend_openvino_add(node); + } break; + case GGML_OP_MUL: + { + ggml_backend_openvino_mul(node); + } break; + case GGML_OP_MUL_MAT: + break; + case GGML_OP_GET_ROWS: + { + ggml_compute_forward_get_rows(node); + } break; + default: + GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + } + } - openvino_frontend_compute(backend, cgraph); + // openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; From 8b2654ff35c7a8ad15e8f86071b92a84f09db9dd Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 10 Dec 2024 18:26:55 +0800 Subject: [PATCH 015/156] Update build.md and add operation mapping(GGML to OpenVINO) --- ggml/src/ggml-openvino.cpp | 118 ++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 0a1e969c9f5e7..efbff646e3b18 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -325,39 +325,7 @@ void ggml_compute_forward_get_rows(struct ggml_tensor *dst) { } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { - return GGML_STATUS_SUCCESS; - } - - switch (node->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - break; - case GGML_OP_ADD: - { - ggml_backend_openvino_add(node); - } break; - case GGML_OP_MUL: - { - ggml_backend_openvino_mul(node); - } break; - case GGML_OP_MUL_MAT: - break; - case GGML_OP_GET_ROWS: - { - ggml_compute_forward_get_rows(node); - } break; - default: - GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - } - } - - // openvino_frontend_compute(backend, cgraph); + openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; @@ -558,7 +526,7 @@ std::set get_openvino_available_opsets() { std::set unique_ops; for (const auto& opset : ov::get_available_opsets()) { for (const auto& op : opset.second().get_type_info_set()) { - unique_ops.insert(op.name).second; + unique_ops.insert(op.name); } } return unique_ops; @@ -566,8 +534,12 @@ std::set get_openvino_available_opsets() { static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); - // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; +#ifdef OPENVINO_OP_DEBUG +static const std::set& openvino_ops = []() -> const std::set& { + static const std::set ops = get_openvino_available_opsets(); + return ops; + }(); switch (op->op) { case GGML_OP_NONE: case GGML_OP_PERMUTE: @@ -584,6 +556,82 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con default: return false; } +#else + static const std::set& openvino_ops = []() -> const std::set& { + static const std::set ops = get_openvino_available_opsets(); + return ops; + }(); + + static const std::map> op_mapping = { + {GGML_OP_ACC, {"Add"}}, + {GGML_OP_ADD, {"Add"}}, + {GGML_OP_ADD1, {"Add"}}, + {GGML_OP_ADD_REL_POS, {"Add", "MatMul", "Reshape"}}, + {GGML_OP_ARANGE, {"Range"}}, + {GGML_OP_ARGMAX, {"TopK"}}, + {GGML_OP_ARGSORT, {"TopK"}}, + {GGML_OP_CLAMP, {"Clamp"}}, + {GGML_OP_CONCAT, {"Concat"}}, + {GGML_OP_CONV_TRANSPOSE_1D, {"ConvolutionBackpropData"}}, + {GGML_OP_CONV_TRANSPOSE_2D, {"ConvolutionBackpropData"}}, + {GGML_OP_COS, {"Cos"}}, + {GGML_OP_CROSS_ENTROPY_LOSS, {"Softmax", "Log", "Multiply", "ReduceSum", "Negative"}}, + {GGML_OP_DIAG, {"Eye", "Multiply"}}, + {GGML_OP_DIAG_MASK_INF, {"Eye", "Multiply", "Select", "Broadcast"}}, + {GGML_OP_DIAG_MASK_ZERO, {"Eye", "Multiply", "Select", "Broadcast"}}, + {GGML_OP_DIV, {"Divide"}}, + {GGML_OP_FLASH_ATTN_EXT, {"ScaledDotProductAttention"}}, + {GGML_OP_GET_ROWS, {"Gather"}}, + {GGML_OP_GROUP_NORM, {"GroupNormalization"}}, + {GGML_OP_IM2COL, {"Custom", "Reshape", "Transpose"}}, + {GGML_OP_LEAKY_RELU, {"PReLU"}}, + {GGML_OP_LOG, {"Log"}}, + {GGML_OP_MEAN, {"ReduceMean"}}, + {GGML_OP_MUL, {"Multiply"}}, + {GGML_OP_MUL_MAT, {"MatMul"}}, + {GGML_OP_MUL_MAT_ID, {"MatMul", "Identity"}}, + {GGML_OP_NORM, {"NormalizeL2"}}, + {GGML_OP_OUT_PROD, {"MatMul", "Reshape"}}, + {GGML_OP_PAD, {"Pad"}}, + {GGML_OP_PERMUTE, {"Transpose"}}, + {GGML_OP_POOL_1D, {"AvgPool", "MaxPool"}}, + {GGML_OP_POOL_2D, {"AvgPool", "MaxPool"}}, + {GGML_OP_REPEAT, {"Tile"}}, + {GGML_OP_RESHAPE, {"Reshape"}}, + {GGML_OP_RMS_NORM, {"Custom"}}, + {GGML_OP_ROPE, {"Custom"}}, + {GGML_OP_SCALE, {"Multiply", "Constant"}}, + {GGML_OP_SET, {"Assign"}}, + {GGML_OP_SIN, {"Sin"}}, + {GGML_OP_SOFT_MAX, {"Softmax"}}, + {GGML_OP_SQR, {"Power"}}, + {GGML_OP_SQRT, {"Sqrt"}}, + {GGML_OP_SSM_CONV, {"Custom"}}, + {GGML_OP_SSM_SCAN, {"Custom"}}, + {GGML_OP_SUB, {"Subtract"}}, + {GGML_OP_SUM, {"ReduceSum"}}, + {GGML_OP_SUM_ROWS, {"ReduceSum", "Squeeze", "Unsqueeze"}}, + {GGML_OP_TIMESTEP_EMBEDDING, {"Range", "Power", "Multiply", "Sin", "Cos", "Concat"}}, + {GGML_OP_TRANSPOSE, {"Transpose"}}, + {GGML_OP_UPSCALE, {"Interpolate"}}, + {GGML_OP_VIEW, {"Reshape"}}, + {GGML_OP_WIN_PART, {"StridedSlice", "Concat", "Reshape", "Custom"}}, + {GGML_OP_WIN_UNPART, {"Reshape", "Transpose", "Custom"}}, + }; + + auto it = op_mapping.find(op->op); + if (it == op_mapping.end()) { + return false; + } + + for (const std::string& op_name : it->second) { + if (openvino_ops.count(op_name) == 0) { + return false; + } + } + + return true; +#endif } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { From 7fd417aa24468df7a9451d09db1c5069f7c01b88 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 16 Dec 2024 11:13:45 +0800 Subject: [PATCH 016/156] add the rms_norm operator implemented using OpenVINO to the GGML backend of llama.cpp --- ggml/src/ggml-openvino.cpp | 91 +++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index efbff646e3b18..b6f01fdb45448 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -324,6 +324,95 @@ void ggml_compute_forward_get_rows(struct ggml_tensor *dst) { } +void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + assert(src0 != nullptr); + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + const int64_t ne3 = src0->ne[3]; + + const size_t input_size = ne0 * ne1 * ne2 * ne3; + + const float *src_data = static_cast(src0->data); + float *dst_data = static_cast(dst->data); + assert(dst_data != nullptr); + + ov::Core core; + + ov::Shape input_shape = {static_cast(ne3), static_cast(ne2), + static_cast(ne1), static_cast(ne0)}; + ov::Tensor input_tensor(ov::element::f32, input_shape, const_cast(src_data)); + + auto input_param = std::make_shared( + input_tensor.get_element_type(), + input_tensor.get_shape() + ); + assert(input_param != nullptr && "Input parameter creation failed!"); + + auto square = std::make_shared(input_param, input_param); + auto reduce_sum = std::make_shared( + square, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}), + true + ); + + auto mean = std::make_shared( + reduce_sum, + ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast(ne0)}) + ); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + auto rms = std::make_shared( + std::make_shared( + mean, + ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}) + ) + ); + + auto scale = std::make_shared( + ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), + rms + ); + + auto normalized_input = std::make_shared(input_param, scale); + + ov::ParameterVector parameters = {input_param}; + auto function = std::make_shared(ov::NodeVector{normalized_input}, parameters); + + auto compiled_model = core.compile_model(function, "CPU"); + + auto infer_request = compiled_model.create_infer_request(); + + infer_request.set_input_tensor(0, input_tensor); + + infer_request.infer(); + + auto output_tensor = infer_request.get_output_tensor(); + assert(output_tensor.get_size() == input_size); + + std::memcpy(dst_data, output_tensor.data(), input_size * sizeof(float)); +} + +void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_backend_openvino_rms_norm_f32(dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { openvino_frontend_compute(backend, cgraph); @@ -598,7 +687,7 @@ static const std::set& openvino_ops = []() -> const std::set Date: Thu, 12 Dec 2024 13:13:31 +0800 Subject: [PATCH 017/156] Fix issue for output memory copy of infer request --- .../src/ggml-openvino/ggml-graph-iterator.cpp | 16 +++++++-------- ggml/src/ggml-openvino/utils.cpp | 20 +++++++++---------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp index 44e119a1ac5e3..5c06179023b83 100644 --- a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp +++ b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp @@ -21,22 +21,20 @@ void GgmlOvGraphIterator::initialize_decoders() { // m_decoders.resize(static_cast(nodes_size)); for (int i = 0; i < nodes_size; ++i) { - // Skip View Op - // if (m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE - // || m_cgraph->nodes[i] ->op == GGML_OP_CPY ) { - // continue; - // } auto decoder = std::make_shared(m_cgraph->nodes[i], m_cgraph); m_decoders.push_back(decoder); for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { - // if (i == 0 || decoder->is_graph_input(inp)) { + // Skip duplicate input name + if (std::find(m_input_names.begin(), m_input_names.end(), decoder->get_input_name(inp)) == m_input_names.end()) { m_input_names.push_back(decoder->get_input_name(inp)); - // } + } } for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { - // if (i == nodes_size - 1 || decoder->is_graph_output(inp)) { + // Skip duplicate output name + auto output_name = decoder->get_output_name(inp); + if (std::find(m_output_names.begin(), m_output_names.end(), output_name) == m_output_names.end()) { m_output_names.push_back(decoder->get_output_name(inp)); - // } + } } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index db52b1f81d192..2dfe837cbdfec 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -29,8 +29,8 @@ std::map get_ggml_graph_input_tensors(std::shared_ptr get_ggml_graph_output_tensors(std::shared_ptr ggml_graph_iterator) { - std::map output_tensors; +std::map get_ggml_graph_output_dst(std::shared_ptr ggml_graph_iterator) { + std::map output_tensors; auto output_names = ggml_graph_iterator->get_output_names(); ggml_graph_iterator->reset(); for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { @@ -41,8 +41,7 @@ std::map get_ggml_graph_output_tensors(std::shared_ptr< #ifdef GGML_OPENVINO_DEBUG printf("Output %d: %g\n", inp, *(double*)(output_data)); #endif - ov::Tensor output_tensor = ov::Tensor(decoder->get_output_type(inp), decoder->get_output_shape(inp).to_shape(), output_data); - output_tensors[decoder->get_output_name(inp)] = output_tensor; + output_tensors[decoder->get_output_name(inp)] = output_data; } } } @@ -100,7 +99,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } - // Loading a model to the device ov::CompiledModel compiled_model = core.compile_model(model); @@ -113,18 +111,18 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { - infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + infer_request.set_input_tensor(i, input_tensors[input_names[i]]); } - // Set output tensor + infer_request.infer(); + // Set dst data for outputs auto output_names = ggml_graph_iterator->get_output_names(); - auto output_tensors = get_ggml_graph_output_tensors(ggml_graph_iterator); + auto output_tensors = get_ggml_graph_output_dst(ggml_graph_iterator); for (size_t i = 0; i < output_names.size(); i++) { - infer_request.set_output_tensor(i, output_tensors[output_names[i]]); + auto output_tensor = infer_request.get_output_tensor(i); + std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); } - - infer_request.infer(); return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); From 84c8a7d2a13c2463e871f64626b19cd57c474e51 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Fri, 13 Dec 2024 07:28:28 +0800 Subject: [PATCH 018/156] Change to implementation following pytorch frontend --- ggml/src/ggml-openvino/decoder.h | 18 ++- ggml/src/ggml-openvino/ggml-decoder.cpp | 134 +++++++++++------- ggml/src/ggml-openvino/ggml-decoder.h | 44 +++--- .../src/ggml-openvino/ggml-graph-iterator.cpp | 95 ------------- ggml/src/ggml-openvino/ggml-graph-iterator.h | 61 -------- ggml/src/ggml-openvino/graph_iterator.h | 43 ------ ggml/src/ggml-openvino/utils.cpp | 74 +++++----- ggml/src/ggml-openvino/utils.h | 4 +- 8 files changed, 143 insertions(+), 330 deletions(-) delete mode 100644 ggml/src/ggml-openvino/ggml-graph-iterator.cpp delete mode 100644 ggml/src/ggml-openvino/ggml-graph-iterator.h delete mode 100644 ggml/src/ggml-openvino/graph_iterator.h diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index be943716f298c..c7f1bbd7255c0 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -12,9 +12,9 @@ class GgmlDecoder : public DecoderBase { public: virtual ov::Any get_attribute(const std::string& name) const = 0; - virtual PartialShape get_input_shape(size_t index) const = 0; + virtual PartialShape get_input_shape(const std::string& name) const = 0; - virtual element::Type get_input_type(size_t index) const = 0; + virtual element::Type get_input_type(const std::string& name) const = 0; virtual size_t get_input_size() const = 0; @@ -23,19 +23,15 @@ class GgmlDecoder : public DecoderBase { std::string& producer_output_port_name, size_t& producer_output_port_index) const = 0; - virtual bool is_graph_input(size_t index) const = 0; - virtual std::string& get_input_name(size_t index) const = 0; - virtual PartialShape get_output_shape(size_t index) const = 0; + virtual std::vector get_input_names() const = 0; - virtual element::Type get_output_type(size_t index) const = 0; + virtual PartialShape get_output_shape(const std::string& name) const = 0; - virtual size_t get_output_size() const = 0; + virtual element::Type get_output_type(const std::string& name) const = 0; - virtual bool is_graph_output(size_t index) const = 0; - - virtual int32_t* get_output_op_params(size_t index) const = 0; + virtual int32_t* get_output_op_params(const std::string& name) const = 0; virtual std::string& get_output_name(size_t index) const = 0; @@ -49,6 +45,8 @@ class GgmlDecoder : public DecoderBase { virtual const std::string& get_op_name() const = 0; + virtual void visit_subgraph(std::function)> node_visitor) const = 0; + // virtual const std::vector& outputs() const = 0; // virtual size_t output(size_t index) const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b367987372470..ab4b0995a5975 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -2,11 +2,8 @@ #include #include -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph) - :m_cgraph(cgraph), - m_node(node), - m_op_name(std::string(m_node->name)) { - switch (m_node->op) { +void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { + switch (node->op) { // Unary OPs case GGML_OP_UNARY: case GGML_OP_RESHAPE: @@ -16,22 +13,26 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr case GGML_OP_CPY: case GGML_OP_RMS_NORM: { - m_inputs.push_back(m_node->src[0]); - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node->src[0]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_output_names.push_back(node->name); break; } - // For view, input is m_node itself + // For view, input is node itself case GGML_OP_VIEW: { - m_inputs.push_back(m_node); - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node; + outputs[node->name] = node; break; } // SCALE case GGML_OP_SCALE: { - m_inputs.push_back(m_node->src[0]); - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node->src[0]; + outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); break; } // OPs with 2 inputs @@ -43,18 +44,25 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr case GGML_OP_GET_ROWS: case GGML_OP_SOFT_MAX: { - m_inputs.push_back(m_node->src[0]); - m_inputs.push_back(m_node->src[1]); - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node->src[0]; + inputs[node->src[1]->name] = node->src[1]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_input_names.push_back(node->src[1]->name); + m_output_names.push_back(node->name); break; } // OPs with 3 inputs: case GGML_OP_ROPE: { - m_inputs.push_back(m_node->src[0]); - m_inputs.push_back(m_node->src[1]); - m_inputs.push_back(m_node->src[2]); // ??? - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node->src[0]; + inputs[node->src[1]->name] = node->src[1]; + inputs[node->src[2]->name] = node->src[2]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_input_names.push_back(node->src[1]->name); + m_input_names.push_back(node->src[2]->name); + m_output_names.push_back(node->name); break; } default: @@ -62,13 +70,33 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr } } -ov::PartialShape GgmlOvDecoder::get_input_shape(size_t index) const { +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph) + :m_cgraph(cgraph), + m_node(node), + m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + m_inputs.clear(); + m_outputs.clear(); + m_input_names.clear(); + m_output_names.clear(); + // If first init + if (m_node) { + set_input_output(m_node, m_inputs, m_outputs); + } else { + for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + auto cur_node = m_cgraph->nodes[node_n]; + m_nodes.push_back(cur_node); + // Init model input and output + set_input_output(cur_node, m_inputs, m_outputs); + } + } +} + +ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { ov::PartialShape input_shape; // Use input_node->ne - ggml_tensor * node = m_inputs[index]; + ggml_tensor * node = m_inputs.at(name); std::vector shape; - // GGML_MAX_DIMS - // for (int i = 0; i < GGML_MAX_DIMS; ++i) { + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { if (node->ne[i] == 0) { return input_shape; @@ -79,10 +107,9 @@ ov::PartialShape GgmlOvDecoder::get_input_shape(size_t index) const { return input_shape; } -ov::element::Type GgmlOvDecoder::get_input_type(size_t index) const { +ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const { ov::element::Type type = ov::element::dynamic; - // GGML_LOG_DEBUG("%d\n", m_inputs[index]->type); - switch (m_inputs[index]->type) { + switch (m_inputs.at(name)->type) { case GGML_TYPE_F32: type = ov::element::f32; break; @@ -102,28 +129,24 @@ ov::element::Type GgmlOvDecoder::get_input_type(size_t index) const { } size_t GgmlOvDecoder::get_input_size() const { - return m_inputs.size(); -} - -bool GgmlOvDecoder::is_graph_input(size_t index) const { - if (m_inputs[index]->flags & GGML_TENSOR_FLAG_INPUT ) { - return true; - } - return false; + return m_input_names.size(); } std::string& GgmlOvDecoder::get_input_name(size_t index) const { - m_name = std::string(m_inputs[index]->name); + m_name = m_input_names[index]; return m_name; } -ov::PartialShape GgmlOvDecoder::get_output_shape(size_t index) const { +std::vector GgmlOvDecoder::get_input_names() const { + return m_input_names; +} + +ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { ov::PartialShape output_shape; // Use input_node->ne - ggml_tensor * node = m_outputs[index]; + ggml_tensor * node = m_outputs.at(name); std::vector shape; - // GGML_MAX_DIMS - // for (int i = 0; i < GGML_MAX_DIMS; ++i) { + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { if (node->ne[i] == 0 ) { // empty if any dimension has no elements @@ -135,10 +158,10 @@ ov::PartialShape GgmlOvDecoder::get_output_shape(size_t index) const { return output_shape; } -ov::element::Type GgmlOvDecoder::get_output_type(size_t index) const { +ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const { // TODO: Change to Output ov::element::Type type = ov::element::dynamic; - switch (m_outputs[index]->type) { + switch (m_outputs.at(name)->type) { case GGML_TYPE_F32: type = ov::element::f32; break; @@ -157,30 +180,31 @@ ov::element::Type GgmlOvDecoder::get_output_type(size_t index) const { return type; } -bool GgmlOvDecoder::is_graph_output(size_t index) const { - if (m_outputs[index]->flags & GGML_TENSOR_FLAG_OUTPUT) { - return true; - } - return false; -} - -int32_t* GgmlOvDecoder::get_output_op_params(size_t index) const{ - return m_outputs[index]->op_params; -} - -size_t GgmlOvDecoder::get_output_size() const { - return m_outputs.size(); +int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const{ + return m_outputs.at(name)->op_params; } std::string& GgmlOvDecoder::get_output_name(size_t index) const { - m_name = std::string(m_outputs[index]->name); + m_name = std::string(m_output_names[index]); return m_name; } +std::vector GgmlOvDecoder::get_output_names() const { + return m_output_names; +} + const std::string& GgmlOvDecoder::get_op_name() const { return m_op_name; } +void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { + for (const auto& node : m_nodes) { + auto decoder = std::make_shared(node, m_cgraph); + // m_decoders.push_back(decoder); + node_visitor(decoder); + } +} + const std::string& GgmlOvDecoder::get_op_type() const { static const std::map opTypeMap = { {GGML_OP_ACC, "GGML_OP_ACC"}, diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index ceae589ed494a..56bb3f889ffd2 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -6,6 +6,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; + GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph); virtual ov::Any get_attribute(const std::string& name) const override { @@ -13,9 +14,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { GGML_UNUSED(name); } - virtual ov::PartialShape get_input_shape(size_t index) const override; + virtual ov::PartialShape get_input_shape(const std::string& name) const override; - virtual ov::element::Type get_input_type(size_t index) const override; + virtual ov::element::Type get_input_type(const std::string& name) const override; virtual size_t get_input_size() const override; @@ -29,19 +30,15 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { GGML_UNUSED(producer_output_port_index); } - virtual bool is_graph_input(size_t index) const override; - virtual std::string& get_input_name(size_t index) const override; - virtual ov::PartialShape get_output_shape(size_t index) const override; + virtual std::vector get_input_names() const override; - virtual ov::element::Type get_output_type(size_t index) const override; + virtual ov::PartialShape get_output_shape(const std::string& name) const override; - virtual size_t get_output_size() const override; - - virtual bool is_graph_output(size_t index) const override; + virtual ov::element::Type get_output_type(const std::string& name) const override; - virtual int32_t* get_output_op_params(size_t index) const override; + virtual int32_t* get_output_op_params(const std::string& name) const override; virtual std::string& get_output_name(size_t index) const override; @@ -55,24 +52,27 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual const std::string& get_op_name() const override; - const ggml_tensor* get_input_ggml_tensor(size_t index) const { - return m_inputs[index]; - } + virtual void visit_subgraph(std::function)> node_visitor) const override; - const ggml_tensor* get_output_ggml_tensor(size_t index) const { - return m_outputs[index]; + const ggml_tensor* get_input_ggml_tensor(std::string& name) const { + return m_inputs.at(name); } - // virtual const std::vector& outputs() const override; - - // virtual size_t output(size_t index) const override; + const ggml_tensor* get_output_ggml_tensor(std::string& name) const { + return m_outputs.at(name); + } private: - size_t m_index; + void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); + struct ggml_cgraph * m_cgraph; - std::vector m_inputs; - std::vector m_outputs; - ggml_tensor * m_node; + std::map m_inputs; + std::vector m_input_names; + std::map m_outputs; + std::vector m_output_names; + ggml_tensor* m_node; + std::vector m_nodes; + std::vector> m_decoders; const std::string m_op_name; mutable std::string m_name; }; diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp deleted file mode 100644 index 5c06179023b83..0000000000000 --- a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "ggml-graph-iterator.h" -#include -#include - -namespace ov { -namespace frontend { -namespace tensorflow { -namespace ggml { - -GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph) - :m_cgraph(cgraph) { - initialize_decoders(); - #ifdef GGML_OPENVINO_DEBUG - dump_graph_iterator(); - #endif -} - -void GgmlOvGraphIterator::initialize_decoders() { - auto nodes_size = m_cgraph->n_nodes; - // Initialize decoder for each node - // m_decoders.resize(static_cast(nodes_size)); - - for (int i = 0; i < nodes_size; ++i) { - auto decoder = std::make_shared(m_cgraph->nodes[i], m_cgraph); - m_decoders.push_back(decoder); - for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { - // Skip duplicate input name - if (std::find(m_input_names.begin(), m_input_names.end(), decoder->get_input_name(inp)) == m_input_names.end()) { - m_input_names.push_back(decoder->get_input_name(inp)); - } - } - for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { - // Skip duplicate output name - auto output_name = decoder->get_output_name(inp); - if (std::find(m_output_names.begin(), m_output_names.end(), output_name) == m_output_names.end()) { - m_output_names.push_back(decoder->get_output_name(inp)); - } - } - } - -} - -void GgmlOvGraphIterator::reset() { - node_index = 0; - } - -size_t GgmlOvGraphIterator::size() const { - return m_decoders.size(); -} - -void GgmlOvGraphIterator::next() { - node_index++; -} - -bool GgmlOvGraphIterator::is_end() const { - return node_index >= m_decoders.size(); -} - -std::shared_ptr GgmlOvGraphIterator::get_decoder() const { - return m_decoders[node_index]; -} - -std::vector GgmlOvGraphIterator::get_input_names() const { - return m_input_names; -} - -std::vector GgmlOvGraphIterator::get_output_names() const { - return m_output_names; -} - -void GgmlOvGraphIterator::dump_graph_iterator() const { - for (size_t i = 0; i < m_decoders.size(); ++i) { - GGML_LOG_INFO("\nOP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str()); - for (size_t inp = 0; inp < m_decoders[i]->get_input_size(); ++inp) { - ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_input_shape(inp); - ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_input_type(inp); - GGML_LOG_INFO("- Input name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_input_name(inp).c_str()); - GGML_LOG_INFO(" Input shape: %s\n", pshape.to_string().c_str()); - GGML_LOG_INFO(" Input type: %s\n", ptype.to_string().c_str()); - } - for (size_t outp = 0; outp < std::dynamic_pointer_cast(m_decoders[i])->get_output_size(); ++outp) { - ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_output_shape(outp); - ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_output_type(outp); - GGML_LOG_INFO("- Output name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_output_name(outp).c_str()); - GGML_LOG_INFO(" Output shape: %s\n", pshape.to_string().c_str()); - GGML_LOG_INFO(" Output type: %s\n", ptype.to_string().c_str()); - - } - } -} - -} -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.h b/ggml/src/ggml-openvino/ggml-graph-iterator.h deleted file mode 100644 index 305afb5c98f87..0000000000000 --- a/ggml/src/ggml-openvino/ggml-graph-iterator.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#include "graph_iterator.h" -#include "ggml-decoder.h" -#include - -// To remove tensorflow -namespace ov { -namespace frontend { -namespace tensorflow { -namespace ggml { - -class GgmlOvGraphIterator : public GgmlGraphIterator { - -protected: - void initialize_decoders(); - -public: - using Ptr = std::shared_ptr; - GgmlOvGraphIterator(struct ggml_cgraph * cgraph); - - /// \brief Get a number of operation nodes in the sgraph - virtual size_t size() const override; - - /// \brief Set iterator to the start position - virtual void reset() override; - - /// \brief Move to the next node in the graph - virtual void next() override; - - /// \brief Returns true if iterator goes out of the range of available nodes - virtual bool is_end() const override; - - /// \brief Return a pointer to a decoder of the current node - virtual std::shared_ptr get_decoder() const override; - - virtual std::shared_ptr get_body_graph_iterator(const std::string& func_name) const override { - return nullptr; - GGML_UNUSED(func_name); - } - - /// \brief Returns a vector of input names in the original order - virtual std::vector get_input_names() const override; - - /// \brief Returns a vector of output names in the original order - virtual std::vector get_output_names() const override; - - virtual void dump_graph_iterator() const; - -private: - struct ggml_cgraph * m_cgraph; - size_t node_index = 0; - std::vector> m_decoders; - std::vector m_input_names; - std::vector m_output_names; -}; - -} -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/graph_iterator.h b/ggml/src/ggml-openvino/graph_iterator.h deleted file mode 100644 index e0b475e445e9b..0000000000000 --- a/ggml/src/ggml-openvino/graph_iterator.h +++ /dev/null @@ -1,43 +0,0 @@ -#pragma once - -#include "openvino/frontend/graph_iterator.hpp" - -namespace ov { -namespace frontend { -namespace tensorflow { // To be Removed -namespace ggml { - -// TODO: Directly include from openvino -class GgmlGraphIterator : public GraphIterator { -public: - - virtual size_t size() const = 0; - - virtual void reset() = 0; - - virtual void next() = 0; - - virtual bool is_end() const = 0; - - virtual std::shared_ptr get_decoder() const = 0; - - virtual std::vector get_input_names() const = 0; - - virtual std::vector get_output_names() const = 0; - - virtual std::shared_ptr get_body_graph_iterator(const std::string& func_name) const = 0; - - virtual std::map get_input_names_map() const { - return {}; - } - - virtual std::map get_output_names_map() const { - return {}; - } - -}; - -} -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2dfe837cbdfec..2436f86feba4e 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,49 +1,40 @@ #include "utils.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include #include -using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; +using ov::frontend::ggml::GgmlDecoder; -std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph) { - return std::make_shared(cgraph); +std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph) { + return std::make_shared(nullptr, cgraph); } -std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_graph_iterator) { +std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { std::map input_tensors; - auto input_names = ggml_graph_iterator->get_input_names(); - ggml_graph_iterator->reset(); - for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { - auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); - for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { - if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { - auto input_data = decoder->get_input_ggml_tensor(inp)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); - #endif - ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); - input_tensors[decoder->get_input_name(inp)] = input_tensor; - } - } + auto input_names = ggml_decoder->get_input_names(); + for (size_t inp = 0; inp < input_names.size(); ++inp) { + auto name = input_names[inp]; + auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); + #endif + ov::Tensor input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + input_tensors[name] = input_tensor; } return input_tensors; } -std::map get_ggml_graph_output_dst(std::shared_ptr ggml_graph_iterator) { +std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { std::map output_tensors; - auto output_names = ggml_graph_iterator->get_output_names(); - ggml_graph_iterator->reset(); - for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { - auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); - for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { - if (std::find(output_names.begin(), output_names.end(), decoder->get_output_name(inp)) != output_names.end()) { - auto output_data = decoder->get_output_ggml_tensor(inp)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Output %d: %g\n", inp, *(double*)(output_data)); - #endif - output_tensors[decoder->get_output_name(inp)] = output_data; - } - } + auto output_names = ggml_decoder->get_output_names(); + for (size_t inp = 0; inp < output_names.size(); ++inp) { + auto name = output_names[inp]; + auto output_data = ggml_decoder->get_output_ggml_tensor(name)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Output %d: %g\n", inp, *(double*)(output_data)); + #endif + output_tensors[name] = output_data; } return output_tensors; } @@ -74,12 +65,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_LOG_INFO("GGML FrontEnd is initialized \n"); #endif } - - auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); - std::shared_ptr graph_iterator = ggml_graph_iterator; - + auto ggml_decoder = get_ggml_decoder(cgraph); + std::shared_ptr graph_decoder = ggml_decoder; // Load GraphIterator -> InputModel - ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); + ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); if (!input_model) { GGML_LOG_ERROR("Input Model is not loaded \n"); return GGML_STATUS_FAILED; @@ -106,8 +95,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::InferRequest infer_request = compiled_model.create_infer_request(); // Get input tensor - auto input_names = ggml_graph_iterator->get_input_names(); - auto input_tensors = get_ggml_graph_input_tensors(ggml_graph_iterator); + auto input_names = ggml_decoder->get_input_names(); + auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder); // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { @@ -117,11 +106,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c infer_request.infer(); // Set dst data for outputs - auto output_names = ggml_graph_iterator->get_output_names(); - auto output_tensors = get_ggml_graph_output_dst(ggml_graph_iterator); + auto output_names = ggml_decoder->get_output_names(); + auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + #ifdef GGML_OPENVINO_DEBUG + printf("Output %s after: %g\n", output_names[i], *(double*)(output_tensor.data())); + #endif } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 15dd46ed4ef99..7ec633beda298 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,6 +1,4 @@ -#include "ggml-graph-iterator.h" +#include "ggml-decoder.h" #include "ggml-backend-impl.h" -std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph); - enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); From cc3f0d036be8c067ac4e26997ca677c42345aea0 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Wed, 18 Dec 2024 03:04:49 +0800 Subject: [PATCH 019/156] Add support for UNARY SILU op . Fix pytorch impl bugs. --- ggml/src/ggml-openvino.cpp | 7 +++++ ggml/src/ggml-openvino/ggml-decoder.cpp | 36 ++++++++++++++++++++----- ggml/src/ggml-openvino/utils.cpp | 2 +- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index b6f01fdb45448..1fede40c4a165 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -642,6 +642,13 @@ static const std::set& openvino_ops = []() -> const std::setsrc[0]->name] = node; + inputs[node->name] = node; outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); break; } // SCALE @@ -228,13 +230,33 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"} }; + static const std::map unaryOpTypeMap = { + {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"}, + {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"}, + {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG"}, + {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP"}, + {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH"}, + {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU"}, + {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU"}, + {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID"}, + {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU"}, + {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK"}, + {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU"}, + {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"}, + {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"}, + {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"}, + {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"} + }; auto it = opTypeMap.find(m_node->op); if (it != opTypeMap.end()) { + if (it->first == GGML_OP_UNARY) { + auto unary_it = unaryOpTypeMap.find(ggml_get_unary_op(m_node)); + if (unary_it != unaryOpTypeMap.end()) { + return unary_it->second; + } + } return it->second; - } else { - static const std::string unknown_op = "UNKNOWN_OP"; - return unknown_op; - } - // static std::string op_type = ggml_op_name(m_node->op); - // return op_type; + } + static const std::string unknown_op = "UNKNOWN_OP"; + return unknown_op; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2436f86feba4e..3bc5779b49268 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -112,7 +112,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); #ifdef GGML_OPENVINO_DEBUG - printf("Output %s after: %g\n", output_names[i], *(double*)(output_tensor.data())); + printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif } From ac84b2a216bd034030e6b55ad3995e8349f1d40e Mon Sep 17 00:00:00 2001 From: yumengbo Date: Thu, 19 Dec 2024 03:37:38 +0800 Subject: [PATCH 020/156] Support Softmax op --- ggml/src/ggml-openvino.cpp | 17 +++++++++++++++++ ggml/src/ggml-openvino/ggml-decoder.cpp | 6 ++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 1fede40c4a165..771ca86d02886 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -642,11 +642,28 @@ static const std::set& openvino_ops = []() -> const std::setsrc[0]->name] = node->src[0]; - inputs[node->src[1]->name] = node->src[1]; outputs[node->name] = node; m_input_names.push_back(node->src[0]->name); - m_input_names.push_back(node->src[1]->name); m_output_names.push_back(node->name); + if (node->src[1]) { + inputs[node->src[1]->name] = node->src[1]; + m_input_names.push_back(node->src[1]->name); + } break; } // OPs with 3 inputs: From 6ca17e8a9952e1cbd119b2c7e04a5008b7f175c1 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Thu, 19 Dec 2024 03:39:05 +0800 Subject: [PATCH 021/156] Support Softmax op --- ggml/src/ggml-openvino.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 771ca86d02886..797ceb74ba356 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -642,8 +642,6 @@ static const std::set& openvino_ops = []() -> const std::set Date: Sat, 21 Dec 2024 08:27:12 +0800 Subject: [PATCH 022/156] Support ROPE op. --- ggml/src/ggml-openvino/ggml-decoder.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index ee156bb995871..4f351266c6c2c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -61,12 +61,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; inputs[node->src[1]->name] = node->src[1]; - inputs[node->src[2]->name] = node->src[2]; - outputs[node->name] = node; m_input_names.push_back(node->src[0]->name); m_input_names.push_back(node->src[1]->name); - m_input_names.push_back(node->src[2]->name); + outputs[node->name] = node; m_output_names.push_back(node->name); + if (node->src[2]) { + inputs[node->src[2]->name] = node->src[2]; + m_input_names.push_back(node->src[2]->name); + } break; } default: @@ -92,6 +94,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr // Init model input and output set_input_output(cur_node, m_inputs, m_outputs); } + #ifdef GGML_OPENVINO_DEBUG + ggml_graph_print(m_cgraph); + #endif } } From 8ff4d8a22253cba70f1b2044743cc28334d19dc9 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 19 Dec 2024 15:43:39 +0800 Subject: [PATCH 023/156] Add support for RMS_NORM OP --- ggml/src/ggml-openvino.cpp | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 797ceb74ba356..f8389f06b5cf8 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -334,9 +334,8 @@ void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) { const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; - const int64_t ne3 = src0->ne[3]; - const size_t input_size = ne0 * ne1 * ne2 * ne3; + const size_t input_size = ne0 * ne1 * ne2; const float *src_data = static_cast(src0->data); float *dst_data = static_cast(dst->data); @@ -344,8 +343,7 @@ void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) { ov::Core core; - ov::Shape input_shape = {static_cast(ne3), static_cast(ne2), - static_cast(ne1), static_cast(ne0)}; + ov::Shape input_shape = {static_cast(ne2), static_cast(ne1), static_cast(ne0)}; ov::Tensor input_tensor(ov::element::f32, input_shape, const_cast(src_data)); auto input_param = std::make_shared( @@ -357,7 +355,7 @@ void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) { auto square = std::make_shared(input_param, input_param); auto reduce_sum = std::make_shared( square, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}), + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), true ); @@ -383,9 +381,16 @@ void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) { auto normalized_input = std::make_shared(input_param, scale); ov::ParameterVector parameters = {input_param}; - auto function = std::make_shared(ov::NodeVector{normalized_input}, parameters); + auto model = std::make_shared(ov::NodeVector{normalized_input}, parameters); - auto compiled_model = core.compile_model(function, "CPU"); + // static bool model_saved = false; + // if (!model_saved) { + // std::cout << "\n rms model saved" << std::endl; + // ov::save_model(model, "//rms_norm_model.xml"); + // model_saved = true; + // } + + auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); @@ -416,6 +421,18 @@ void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { openvino_frontend_compute(backend, cgraph); + // for (int i = 0; i < cgraph->n_nodes; i++) { + // struct ggml_tensor * node = cgraph->nodes[i]; + + // switch (node->op) { + // case GGML_OP_RMS_NORM: + // ggml_backend_openvino_rms_norm(node); + // break; + // default: + // GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + // } + // } + return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); From 31419d9c9e1f37ad1918c20ae4d5d045b03b3616 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 15 Jan 2025 00:37:49 +0800 Subject: [PATCH 024/156] Add MUL_MAT,CPY,CONT as operators implemented in OpenVINO for GGML backend --- ggml/src/ggml-openvino.cpp | 432 +++++++++++++++++++++++- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 +- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- ggml/src/ggml-openvino/utils.cpp | 8 +- ggml/src/ggml-openvino/utils.h | 2 +- 5 files changed, 428 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index f8389f06b5cf8..07aff4b72e34b 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1,6 +1,7 @@ -#include "ggml-openvino.h" #include "ggml-backend-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "ggml-openvino.h" #include "ggml-openvino/utils.h" #include @@ -418,20 +419,425 @@ void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { } } -static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - openvino_frontend_compute(backend, cgraph); - // for (int i = 0; i < cgraph->n_nodes; i++) { - // struct ggml_tensor * node = cgraph->nodes[i]; +void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { - // switch (node->op) { - // case GGML_OP_RMS_NORM: - // ggml_backend_openvino_rms_norm(node); - // break; - // default: - // GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - // } - // } + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = 0; + const int nth = 1; + + const enum ggml_type type = src0->type; + const auto *type_traits = ggml_get_type_traits(type); + + enum ggml_type const vec_dot_type = type_traits->vec_dot_type; + ggml_from_float_t const from_float = type_traits->from_float; + ggml_from_float_to_mat_t const from_float_to_mat = type_traits->from_float_to_mat; + int64_t const vec_dot_num_rows = type_traits->nrows; + int64_t const matmul_num_cols = type_traits->ncols; + int64_t const blck_size_interleave = type_traits->blck_size_interleave; + ggml_gemv_t const gemv = type_traits->gemv; + ggml_gemm_t const gemm = type_traits->gemm; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // src1->type = GGML_TYPE_F32, vec_dot_type = GGML_TYPE_F16 + // The main function of this code is to convert the data of src1 from GGML_TYPE_F32 type to vec_dot_type (i.e. GGML_TYPE_F16) and store the result in params->wdata. + // The code processes data of different dimensions through multiple loops and conditional judgments and uses different conversion functions to complete data conversion. + std::unique_ptr wdata(new char[ne13 * ggml_row_size(vec_dot_type, ne10) * ne11 * ne12]); + if (src1->type != vec_dot_type) { + const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + (void *) (wdata.get() + i13*nbw3 + i12*nbw2 + i11*nbw1), + ne10); + } + } + } + } + + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) + const int64_t nr0 = ne0; + + // This is the size of the rest of the dimensions of the result + const int64_t nr1 = ne1 * ne2 * ne3; + + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int64_t num_rows_per_vec_dot = vec_dot_num_rows; + // TODO: currently the mmla kernels support only even numbered rows/cols. + // this check can be removed once they are extended to support odd numbered rows/cols too + if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { + num_rows_per_vec_dot = 1; + } + + // Now select a reasonable chunk size. + int chunk_size = 16; + + // We need to step up the size if it's small + if (nr0 == 1 || nr1 == 1) { + chunk_size = 64; + } + + // distribute the work across the inner or outer loop based on which one is larger + // The number of chunks in the 0/1 dim. + // CEIL(nr0/chunk_size) + int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; + int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; + + // The number of elements in each chunk + const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + + // The first chunk comes from our thread_id, the rest will get auto-assigned. + int current_chunk = ith; + + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; + + const int64_t ir0_start = dr0 * ith0; + const int64_t ir0_end = MIN(ir0_start + dr0, nr0); + + const int64_t ir1_start = dr1 * ith1; + const int64_t ir1_end = MIN(ir1_start + dr1, nr1); + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits->vec_dot; + enum ggml_type const vec_dot_type = type_traits->vec_dot_type; + + // broadcast factors + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + // threads with no work simply yield (not sure if it helps) + if (ir0_start >= ir0_end || ir1_start >= ir1_end) { + return; + } + + // const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + + // attempt to reduce false-sharing (does not seem to make a difference) + // 16 * 2, accounting for mmla kernels + float tmp[32]; + + for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int64_t i13 = (ir1 / (ne12 * ne1)); + const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + // broadcast src0 into src1 + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + const char * src1_col = (const char*)wdata.get() + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + vec_dot(ne00, &tmp[ir0 - iir0], + (num_rows_per_vec_dot > 1 ? 16 : 0), + src0_row + ir0 * nb01, + (num_rows_per_vec_dot > 1 ? nb01 : 0), + src1_col, + (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), + num_rows_per_vec_dot); + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + } + } + } + } + + if (nth >= nchunk0 * nchunk1) { + break; + } + + // current_chunk = atomic_fetch_add_explicit(¶ms->threadpool->current_chunk, 1, memory_order_relaxed); + current_chunk++; + } +} + +void ggml_backend_openvino_reshape(ggml_tensor *dst) { + + GGML_UNUSED(dst); +} + +void ggml_backend_openvino_view(ggml_tensor *dst) { + + GGML_UNUSED(dst); +} + +void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + + // Validate tensor properties + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(src0->type == dst->type); + + // Determine tensor properties + const size_t element_size = ggml_type_size(src0->type); + + // Case 1: Both tensors are contiguous + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { + // OpenVINO tensors for src and dst + // Source is 1D since it's contiguous + ov::Tensor src_tensor(ov::element::f32, {src0->ne[0]}, src0->data); + // // Destination is 1D since it's contiguous + ov::Tensor dst_tensor(ov::element::f32, {dst->ne[0]}, dst->data); + + // Perform the memory copy row by row + size_t row_size = dst->nb[0]; // Size of one row in destination + size_t src_stride = src0->nb[0]; // Stride for source tensor + + for (size_t i = 0; i < dst->ne[0]; ++i) { + std::memcpy((char *)dst_tensor.data()+i*row_size, (char *)src_tensor.data()+i*src_stride, row_size); + } + return; + } + + // Case 2: Compatible types, dimensions, and strides + const size_t ne00 = src0->ne[0]; + const size_t ne01 = src0->ne[1]; + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb0 = dst->nb[0]; + + if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { + for (size_t i01 = 0; i01 < ne01; ++i01) { + const char *src_row = reinterpret_cast(src0->data) + i01 * nb01; + char *dst_row = reinterpret_cast(dst->data) + i01 * dst->nb[1]; + + ov::Tensor src_row_tensor(ov::element::f32, {ne00}, const_cast(reinterpret_cast(src_row))); + ov::Tensor dst_row_tensor(ov::element::f32, {ne00}, reinterpret_cast(dst_row)); + + std::memcpy(dst_row_tensor.data(), src_row_tensor.data(), ne00 * sizeof(float)); + } + return; + } + + // Case 3: Non-contiguous source, contiguous destination + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t nb02 = src0->nb[2]; + const int64_t nb03 = src0->nb[3]; + + // dst->ne =[3072,7,1,1], dst->nb =[4,12288,86016,86016], dst->type=GGML_TYPE_F32 + // dst->src[0]->ne=[96,32,7,1], dst->src[0]->nb=[4,2688,384,86016], dst->src[0]->type=GGML_TYPE_F32 + if (ggml_is_contiguous(dst)) { + const size_t rs = ne00 * element_size; // Row size in bytes for dst + + // Create OpenVINO tensors for source and destination + // The tensors are reshaped to a 2D structure (num_rows x ne00) for easier iteration and compatibility with the simplified loop. + ov::Tensor src_tensor(ov::element::f32, ov::Shape{ne03 * ne02 * ne01, ne00}, src0->data); + ov::Tensor dst_tensor(ov::element::f32, ov::Shape{ne03 * ne02 * ne01, ne00}, dst->data); + + // Perform the copy in a single loop + const size_t num_rows = ne03 * ne02 * ne01; + for (size_t row = 0; row < num_rows; ++row) { + // Calculate the source row pointer based on original strides + // The source row pointer is calculated based on the combined index row and the strides nb03, nb02, and nb01. + const char* src0_ptr = (char*)src_tensor.data() + + // Calculates which block of the i03 dimension the current row belongs to + (row / (ne02 * ne01)) * nb03 + // 0 + // Calculates which block of the i02 dimension the current row belongs to within the current i03 block. + ((row / ne01) % ne02) * nb02 + // 0, 0,......, 0,384, 384,......, 384,768,......, 2304 + // Calculates the position within the current i02 block in terms of the i01 index. + (row % ne01) * nb01; // 0,2688,......,83328, 0, 2688,......,83328, 0,......, 83328 + + // Destination row pointer is linear + // Since dst is contiguous, its rows are accessed linearly using a single stride rs, simplifying the destination pointer calculation. + char* dst_ptr = (char*)dst_tensor.data() + row * rs; + + // Copy row + std::memcpy(dst_ptr, src0_ptr, rs); + } + return; + } + std::cout << "Duplication of bytes completed successfully." << std::endl; +} + +static void ggml_backend_openvino_transpose(ggml_tensor *dst) { + // NOP + GGML_UNUSED(dst); +} + +static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { + // NOP + GGML_UNUSED(dst); +} + +void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + assert(src0 != nullptr); + assert(ggml_nelements(dst) == ggml_nelements(src0)); + + // Extract shapes + ov::Shape src_shape(src0->ne, src0->ne + 4); + ov::Shape dst_shape(dst->ne, dst->ne + 4); + + // Initialize OpenVINO core + ov::Core core; + + // Create OpenVINO parameter for the source tensor + auto src_input = std::make_shared(ov::element::f32, src_shape); + + std::shared_ptr model; + if (ggml_is_contiguous(dst)) { + // Contiguous Case: Flatten src and reshape to dst shape + ov::Shape flattened_shape = {ggml_nelements(src0)}; + auto flatten = std::make_shared( + src_input, ov::op::v0::Constant::create(ov::element::i64, {1}, flattened_shape), false); + + auto reshape_to_dst = std::make_shared( + flatten, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape), false); + + auto dst_output = std::make_shared(reshape_to_dst, ov::element::f16); + + model = std::make_shared( + ov::ResultVector{std::make_shared(dst_output)}, + ov::ParameterVector{src_input}, + "ContiguousCopy"); + // Compile and execute the model + auto compiled_model = core.compile_model(model, "CPU"); + + ov::Tensor src_tensor(ov::element::f32, src_shape, src0->data); + ov::Tensor dst_tensor(ov::element::f16, dst_shape, dst->data); + + auto infer_request = compiled_model.create_infer_request(); + infer_request.set_input_tensor(0, src_tensor); + infer_request.set_output_tensor(0, dst_tensor); + infer_request.infer(); + } else { + // Non-contiguous case: element-wise copy + for (int64_t i03 = 0; i03 < dst->ne[3]; ++i03) { + for (int64_t i02 = 0; i02 < dst->ne[2]; ++i02) { + for (int64_t i01 = 0; i01 < dst->ne[1]; ++i01) { + for (int64_t i00 = 0; i00 < dst->ne[0]; ++i00) { + const char *src_ptr = static_cast(src0->data) + + i00 * src0->nb[0] + i01 * src0->nb[1] + + i02 * src0->nb[2] + i03 * src0->nb[3]; + + char *dst_ptr = static_cast(dst->data) + + i00 * dst->nb[0] + i01 * dst->nb[1] + + i02 * dst->nb[2] + i03 * dst->nb[3]; + + *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(*(const float *)src_ptr); + } + } + } + } + } +} + +static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + // Find the indices of GGML_OP_CONT, GGML_OP_CPY nodes, GGML_OP_MUL_MAT and so on. + std::vector cont_indices; + std::vector reshape_indices; + std::vector view_indices; + + std::vector cpy_indices; + std::vector transpose_indices; + std::vector permute_indices; + + std::vector mul_mat_indices; + + for (int i = 0; i < cgraph->n_nodes; i++) { + if (cgraph->nodes[i]->op == GGML_OP_CONT) { + cont_indices.push_back(i); + } else if (cgraph->nodes[i]->op == GGML_OP_RESHAPE) { + reshape_indices.push_back(i); + } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { + view_indices.push_back(i); + } else if (cgraph->nodes[i]->op == GGML_OP_CPY) { + cpy_indices.push_back(i); + } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) { + transpose_indices.push_back(i); + } else if (cgraph->nodes[i]->op == GGML_OP_PERMUTE) { + permute_indices.push_back(i); + } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) { + mul_mat_indices.push_back(i); + } + } + + // Process nodes in order + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + ggml_backend_openvino_reshape(cgraph->nodes[i]); + } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); + } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes && + std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && + std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && + std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i); + } + } + } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4f351266c6c2c..172c72ff50503 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -76,7 +76,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname) : "NONE_OP") { @@ -88,7 +88,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr if (m_node) { set_input_output(m_node, m_inputs, m_outputs); } else { - for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + // for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + for (int node_n = start_index; node_n <= end_index; node_n++) { auto cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); // Init model input and output diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 56bb3f889ffd2..2bb2f585f13d1 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -7,7 +7,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; - GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph); + GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 3bc5779b49268..84c9001c5c611 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -6,8 +6,8 @@ using ov::frontend::ggml::GgmlDecoder; -std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph) { - return std::make_shared(nullptr, cgraph); +std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) { + return std::make_shared(nullptr, cgraph, start_index, end_index); } std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { @@ -52,7 +52,7 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) { ov::Core core; auto devices = core.get_available_devices(); // Get GGML Frontend @@ -65,7 +65,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_LOG_INFO("GGML FrontEnd is initialized \n"); #endif } - auto ggml_decoder = get_ggml_decoder(cgraph); + auto ggml_decoder = get_ggml_decoder(cgraph, start_index, end_index); std::shared_ptr graph_decoder = ggml_decoder; // Load GraphIterator -> InputModel ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 7ec633beda298..fc5268d98a993 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,4 @@ #include "ggml-decoder.h" #include "ggml-backend-impl.h" -enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); +enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); From ab06af6715211d29eff97946fc794a588a00f764 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 22 Jan 2025 15:22:56 +0800 Subject: [PATCH 025/156] Move CPY from GGML OV Backend to OV Frontend --- ggml/src/ggml-openvino.cpp | 7 +- ggml/src/ggml-openvino/decoder.h | 2 + ggml/src/ggml-openvino/ggml-decoder.cpp | 100 +++++++++++++++++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 4 + 4 files changed, 107 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 07aff4b72e34b..444ccdf36644d 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -815,9 +815,9 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { ggml_backend_openvino_reshape(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); + ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { ggml_backend_openvino_transpose(cgraph->nodes[i]); } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { @@ -829,7 +829,6 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe int start_index = i; while (i < cgraph->n_nodes && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && - std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { i++; } diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index c7f1bbd7255c0..56f2ddcc80821 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -51,6 +51,8 @@ class GgmlDecoder : public DecoderBase { // virtual size_t output(size_t index) const = 0; + virtual bool check_if_continuous() const = 0; + }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 172c72ff50503..355a95d978c94 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1,6 +1,7 @@ #include "ggml-decoder.h" #include #include +#include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { switch (node->op) { @@ -9,8 +10,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; @@ -19,6 +18,103 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname); break; } + case GGML_OP_CONT: + { + if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node)) { + inputs[node->src[0]->name] = node->src[0]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_output_names.push_back(node->name); + m_continuous = true; + break; + } + + if (node->src[0]->type == node->type && node->src[0]->ne[0] == node->ne[0] && + node->src[0]->nb[0] == ggml_type_size(node->src[0]->type) && node->nb[0] == ggml_type_size(node->src[0]->type)) { + + for (size_t i01 = 0; i01 < node->src[0]->ne[1]; ++i01) { + const char *src_row = reinterpret_cast(node->src[0]->data) + i01 * node->src[0]->nb[1]; + char *dst_row = reinterpret_cast(node->data) + i01 * node->nb[1]; + std::memcpy(dst_row, src_row, node->src[0]->ne[0] * ggml_type_size(node->src[0]->type)); + } + + inputs[node->name] = node; + outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); + m_continuous = false; + break; + } + + // if (ggml_is_contiguous(node)) { + const size_t rs = node->src[0]->ne[0] * ggml_type_size(node->src[0]->type); // Row size in bytes for dst + + // Create OpenVINO tensors for source and destination + // The tensors are reshaped to a 2D structure (num_rows x ne00) for easier iteration and compatibility with the simplified loop. + ov::Tensor src_tensor(ov::element::f32, + ov::Shape{node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1], node->src[0]->ne[0]}, + node->src[0]->data); + ov::Tensor dst_tensor(ov::element::f32, + ov::Shape{node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1], node->src[0]->ne[0]}, + node->data); + + // Perform the copy in a single loop + const size_t num_rows = node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1]; + for (size_t row = 0; row < num_rows; ++row) { + // Calculate the source row pointer based on original strides + // The source row pointer is calculated based on the combined index row and the strides nb03, nb02, and nb01. + const char* src0_ptr = (char*)src_tensor.data() + + // Calculates which block of the i03 dimension the current row belongs to + (row / (node->src[0]->ne[2] * node->src[0]->ne[1])) * node->src[0]->nb[3] + // 0 + // Calculates which block of the i02 dimension the current row belongs to within the current i03 block. + ((row / node->src[0]->ne[1]) % node->src[0]->ne[2]) * node->src[0]->nb[2] + // 0, 0,......, 0,384, 384,......, 384,768,......, 2304 + // Calculates the position within the current i02 block in terms of the i01 index. + (row % node->src[0]->ne[1]) * node->src[0]->nb[1]; // 0,2688,......,83328, 0, 2688,......,83328, 0,......, 83328 + + // Destination row pointer is linear + // Since dst is contiguous, its rows are accessed linearly using a single stride rs, simplifying the destination pointer calculation. + char* dst_ptr = (char*)dst_tensor.data() + row * rs; + + // Copy row + std::memcpy(dst_ptr, src0_ptr, rs); + } + + inputs[node->name] = node; + outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); + m_continuous = false; + break; + //} + } + case GGML_OP_CPY: + { + if (ggml_is_contiguous(node)) { + inputs[node->src[0]->name] = node->src[0]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_output_names.push_back(node->name); + m_continuous = true; + break; + } else { + for (int64_t i1 = 0; i1 < node->ne[1]; ++i1) { // ne[1] = 3072 + for (int64_t i0 = 0; i0 < node->ne[0]; ++i0) { // ne[0] = 7 + int64_t src_index = i0 * node->src[0]->nb[0] / sizeof(float) + // stride in nb[0] + i1 * node->src[0]->nb[1] / sizeof(float); // stride in nb[1] + char *dst_ptr = static_cast(node->data) + + i0 * node->nb[0] + i1 * node->nb[1]; + *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(((float*)node->src[0]->data)[src_index]); + } + } + // inputs[node->src[0]->name] = node->src[0]; + inputs[node->name] = node; + outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); + m_continuous = false; + break; + } + } // For view, input is node itself case GGML_OP_VIEW: { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 2bb2f585f13d1..2afde161ee4d9 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -62,6 +62,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_outputs.at(name); } + virtual bool check_if_continuous() const override { + return m_continuous; + } private: void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); @@ -75,5 +78,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::vector> m_decoders; const std::string m_op_name; mutable std::string m_name; + bool m_continuous; }; From 0fa3921e554b8d0272accd91733219a16695d62c Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 18 Feb 2025 14:11:07 +0800 Subject: [PATCH 026/156] add implementation of MUL_MAT, CPY, CONT of GGML ops using OV ops --- ggml/src/ggml-openvino.cpp | 629 +++++++++++++++++------- ggml/src/ggml-openvino/ggml-decoder.cpp | 1 + ggml/src/ggml-openvino/ggml-decoder.h | 10 + ggml/src/ggml-openvino/utils.cpp | 1 + 4 files changed, 453 insertions(+), 188 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 444ccdf36644d..99a32b1dfdc9c 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -419,191 +419,200 @@ void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { } } +// Extracting valid shapes +std::vector get_effective_shape(const ggml_tensor * t) { + std::vector shape; + for (int i = 2; i >= 0; i--) { + if (t->ne[i] != 1 || t->ne[2] != 1) + shape.push_back(t->ne[i]); + } + return shape; +} -void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = 0; - const int nth = 1; - - const enum ggml_type type = src0->type; - const auto *type_traits = ggml_get_type_traits(type); - - enum ggml_type const vec_dot_type = type_traits->vec_dot_type; - ggml_from_float_t const from_float = type_traits->from_float; - ggml_from_float_to_mat_t const from_float_to_mat = type_traits->from_float_to_mat; - int64_t const vec_dot_num_rows = type_traits->nrows; - int64_t const matmul_num_cols = type_traits->ncols; - int64_t const blck_size_interleave = type_traits->blck_size_interleave; - ggml_gemv_t const gemv = type_traits->gemv; - ggml_gemm_t const gemm = type_traits->gemm; - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - // src1->type = GGML_TYPE_F32, vec_dot_type = GGML_TYPE_F16 - // The main function of this code is to convert the data of src1 from GGML_TYPE_F32 type to vec_dot_type (i.e. GGML_TYPE_F16) and store the result in params->wdata. - // The code processes data of different dimensions through multiple loops and conditional judgments and uses different conversion functions to complete data conversion. - std::unique_ptr wdata(new char[ne13 * ggml_row_size(vec_dot_type, ne10) * ne11 * ne12]); - if (src1->type != vec_dot_type) { - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata.get() + i13*nbw3 + i12*nbw2 + i11*nbw1), - ne10); - } +/* +* Construct an index vector for Gather to extract non-contiguous data. +* Parameters: +* - valid_cols: number of valid columns per row (e.g., for src0, valid columns = 96) +* - num_rows: number of rows in each batch (e.g., src0: 32 rows per batch) +* - batch: number of batches (e.g., 32) +* - row_stride: physical row length (in elements), e.g., src0: nb[1]/(element_size) = 6144/2 = 3072 +* - batch_stride: physical batch stride (in elements), e.g., src0: nb[2]/(element_size) = 192/2 = 96 +*/ +std::vector build_indices(int valid_cols, int num_rows, int batch, int row_stride, int batch_stride) { + std::vector indices; + indices.reserve(valid_cols * num_rows * batch); + for (int b = 0; b < batch; b++) { + for (int r = 0; r < num_rows; r++) { + for (int c = 0; c < valid_cols; c++) { + // 计算物理索引 = b * batch_stride + r * row_stride + c + indices.push_back(b * batch_stride + r * row_stride + c); } } } + return indices; +} - // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) - const int64_t nr0 = ne0; - - // This is the size of the rest of the dimensions of the result - const int64_t nr1 = ne1 * ne2 * ne3; +void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { + assert(dst && dst->src[0] && dst->src[1]); + const ggml_tensor * src0 = dst->src[0]; // src0 type F16 + const ggml_tensor * src1 = dst->src[1]; // src1 type F32 + + if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) { + int valid_cols_src0 = dst->src[0]->ne[0]; + int num_rows_src0 = dst->src[0]->ne[1]; + int batch_src0 = dst->src[0]->ne[2]; + int valid_cols_src1 = dst->src[1]->ne[0]; + int num_rows_src1 = dst->src[1]->ne[1]; + int batch_src1 = dst->src[1]->ne[2]; + int row_stride_src0 = dst->src[0]->nb[1] / dst->src[0]->nb[0]; + int batch_stride_src0 = dst->src[0]->nb[2] / dst->src[0]->nb[0]; + + int row_stride_src1 = dst->src[1]->nb[1] / dst->src[1]->nb[0]; + int batch_stride_src1 = dst->src[1]->nb[2] / dst->src[1]->nb[0]; + + std::vector indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0, row_stride_src0, batch_stride_src0); + std::vector indices_src1 = build_indices(valid_cols_src1, num_rows_src1, batch_src1, row_stride_src1, batch_stride_src1); + + // Total number of elements + size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32 + size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32 + + // Treat src0->data and src1->data as 1D tensors + // Note: The total length of physical data should be enough to cover the last valid element index + 1. + // flat shapes: + ov::Shape flat_shape_src0 = { total_src0 }; + ov::Shape flat_shape_src1 = { total_src1 }; + + // Create a Parameter node for collecting non-continuous data + auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); + auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + + // Create an index Constant node + auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0); + auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1); + + // Use the Gather operator to collect valid data + // axis = 0 + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto gathered_src0 = std::make_shared(param_src0, indices_const_src0, axis_const); + auto gathered_src1 = std::make_shared(param_src1, indices_const_src1, axis_const); + + // Reshape to batched form: + // For src0: valid matrix size for each batch [num_rows_src0, valid_cols_src0] = [32,96], total batches = 32, + // Therefore, reshape to 3D Tensor: shape = [32, 32, 96] where first dimension is batch. + std::vector shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 }; + auto reshape_src0 = std::make_shared( + gathered_src0, + ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont), + false); + // For src1: valid matrix size for each batch [num_rows_src1, valid_cols_src1] = [7,96], batch = 32, + // Reshape to 3D Tensor: shape = [32, 7, 96]. + std::vector shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 }; + auto reshape_src1 = std::make_shared( + gathered_src1, + ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont), + false); + + // For src0, first Convert from F16 to F32 + auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); + + // Use Batched Transpose: swap the last two dimensions, dimension order [0, 2, 1] + auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); + auto src0_transposed = std::make_shared(src0_f32, transpose_order); + + auto A = src0_transposed; + auto B = reshape_src1; + + auto batched_matmul = std::make_shared(B, A, false, false); + // batched_matmul output: shape = [32,7,32] + + std::vector full_dst_shape = { dst->ne[2], dst->ne[1], dst->ne[0]}; + auto final_shape_const = ov::op::v0::Constant::create(ov::element::i64, { full_dst_shape.size() }, full_dst_shape); + + auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src1, param_src0}); + + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); - // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols - int64_t num_rows_per_vec_dot = vec_dot_num_rows; - // TODO: currently the mmla kernels support only even numbered rows/cols. - // this check can be removed once they are extended to support odd numbered rows/cols too - if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { - num_rows_per_vec_dot = 1; - } + // Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively + ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data); + ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data); + infer_request.set_input_tensor(0, tensor_src1); + infer_request.set_input_tensor(1, tensor_src0); - // Now select a reasonable chunk size. - int chunk_size = 16; + ov::Tensor tensor_dst(ov::element::f32, ov::Shape(full_dst_shape.begin(), full_dst_shape.end()), dst->data); + infer_request.set_output_tensor(0, tensor_dst); - // We need to step up the size if it's small - if (nr0 == 1 || nr1 == 1) { - chunk_size = 64; + infer_request.infer(); + return ; } - // distribute the work across the inner or outer loop based on which one is larger - // The number of chunks in the 0/1 dim. - // CEIL(nr0/chunk_size) - int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; - int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; - - // The number of elements in each chunk - const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; - const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - - // The first chunk comes from our thread_id, the rest will get auto-assigned. - int current_chunk = ith; - - while (current_chunk < nchunk0 * nchunk1) { - const int64_t ith0 = current_chunk % nchunk0; - const int64_t ith1 = current_chunk / nchunk0; - - const int64_t ir0_start = dr0 * ith0; - const int64_t ir0_end = MIN(ir0_start + dr0, nr0); - - const int64_t ir1_start = dr1 * ith1; - const int64_t ir1_end = MIN(ir1_start + dr1, nr1); - - const bool src1_cont = ggml_is_contiguous(src1); - - ggml_vec_dot_t const vec_dot = type_traits->vec_dot; - enum ggml_type const vec_dot_type = type_traits->vec_dot_type; - - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - // threads with no work simply yield (not sure if it helps) - if (ir0_start >= ir0_end || ir1_start >= ir1_end) { - return; - } + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // Valid shape + std::vector eff_shape_src0 = get_effective_shape(src0); + std::vector eff_shape_src1 = get_effective_shape(src1); + std::vector eff_shape_dst = get_effective_shape(dst); + + // Determine whether it is batched (effective rank==3) or two-dimensional (rank==2) or one-dimensional (rank==1) + int rank = static_cast(eff_shape_dst.size()); + if (rank != 1 && rank != 2 && rank != 3) + throw std::runtime_error("Only rank 1, 2 or 3 supported"); + + // Total number of flattened elements + size_t total_src0 = 1; for (auto d : eff_shape_src0) total_src0 *= d; + size_t total_src1 = 1; for (auto d : eff_shape_src1) total_src1 *= d; + + ov::Shape flat_shape_src0 = { total_src0 }; + ov::Shape flat_shape_src1 = { total_src1 }; + + auto param_flat_src0 = std::make_shared(ov::element::f16, flat_shape_src0); + auto param_flat_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + + auto reshape_src0 = std::make_shared( + param_flat_src0, + ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0), + false); + auto reshape_src1 = std::make_shared( + param_flat_src1, + ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1), + false); + + // Convert src0: F16 -> F32 + auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); + + // Transpose src0_f32: + // For the 2D case, the shape of reshape_src0 is [3072,9216], and after transposition, it is [9216,3072]. + // For the batched case, assuming the shape is [M, K, Batch], batch-wise transposition is required: use order [0, 2, 1]. + ov::Output A_for_mul; + if (rank == 1) { + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{1, 0}); + A_for_mul = std::make_shared(src0_f32, trans_order); + } else if (rank == 2) { + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{1, 0}); + A_for_mul = std::make_shared(src0_f32, trans_order); + } else { // rank == 3 + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); + A_for_mul = std::make_shared(src0_f32, trans_order); + } - // const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - assert(ne12 % ne02 == 0); - assert(ne13 % ne03 == 0); - - // block-tiling attempt - const int64_t blck_0 = 16; - const int64_t blck_1 = 16; - - const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; - - // attempt to reduce false-sharing (does not seem to make a difference) - // 16 * 2, accounting for mmla kernels - float tmp[32]; - - for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { - for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { - const int64_t i13 = (ir1 / (ne12 * ne1)); - const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; - const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); - - // broadcast src0 into src1 - const int64_t i03 = i13 / r3; - const int64_t i02 = i12 / r2; - - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; - - const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); - - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - const char * src1_col = (const char*)wdata.get() + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size - : (i11 * nb11 + i12 * nb12 + i13 * nb13)); - float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); - - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot(ne00, &tmp[ir0 - iir0], - (num_rows_per_vec_dot > 1 ? 16 : 0), - src0_row + ir0 * nb01, - (num_rows_per_vec_dot > 1 ? nb01 : 0), - src1_col, - (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), - num_rows_per_vec_dot); - } + ov::Core core; + ov::Tensor tensor_src0{ov::element::f16, flat_shape_src0, (void *)src0->data}; + ov::Tensor tensor_src1{ov::element::f32, flat_shape_src1, (void *)src1->data}; + ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data); - for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { - memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); - } - } - } - } + std::shared_ptr matmul = std::make_shared(reshape_src1, A_for_mul, false, false); + auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src1, param_flat_src0}); - if (nth >= nchunk0 * nchunk1) { - break; - } + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); - // current_chunk = atomic_fetch_add_explicit(¶ms->threadpool->current_chunk, 1, memory_order_relaxed); - current_chunk++; - } + infer_request.set_input_tensor(0, tensor_src1); + infer_request.set_input_tensor(1, tensor_src0); + infer_request.set_output_tensor(0, tensor_dst); + infer_request.infer(); } void ggml_backend_openvino_reshape(ggml_tensor *dst) { @@ -628,19 +637,45 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Case 1: Both tensors are contiguous if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { - // OpenVINO tensors for src and dst - // Source is 1D since it's contiguous - ov::Tensor src_tensor(ov::element::f32, {src0->ne[0]}, src0->data); - // // Destination is 1D since it's contiguous - ov::Tensor dst_tensor(ov::element::f32, {dst->ne[0]}, dst->data); - - // Perform the memory copy row by row - size_t row_size = dst->nb[0]; // Size of one row in destination - size_t src_stride = src0->nb[0]; // Stride for source tensor - - for (size_t i = 0; i < dst->ne[0]; ++i) { - std::memcpy((char *)dst_tensor.data()+i*row_size, (char *)src_tensor.data()+i*src_stride, row_size); - } + ov::Shape flat_shape = { static_cast(ggml_nelements(dst)) }; + + // Construct the logical shape of the target tensor + ov::Shape dst_shape = { + static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0]) + }; + + // --- Construct the OpenVINO computation graph --- + // 1. Define input parameter, type f32, shape flat_shape: [8192] + auto input_param = std::make_shared(ov::element::f32, flat_shape); + + // 2. Create a Constant node to represent the new shape of the target Reshape(dst_shape) + // Note: dst_shape needs to be converted to an int64_t array + std::vector dst_shape_vec(dst_shape.begin(), dst_shape.end()); + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, { dst_shape_vec.size() }, dst_shape_vec); + + // 3. Use the Reshape operator to reshape the input tensor to the target shape(dst_shape) + auto reshape_op = std::make_shared(input_param, reshape_const, false); + + // 4. Construct the model, whose output is the result of reshape_op + auto model = std::make_shared(ov::OutputVector{ reshape_op }, ov::ParameterVector{ input_param }); + + // --- Compile and execute --- + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // Construct input Tensor: directly wrap src0->data, shape is flat_shape[8192] + ov::Tensor input_tensor(ov::element::f32, flat_shape, src0->data); + infer_request.set_input_tensor(0, input_tensor); + + // Construct output Tensor: dst->data, shape is dst_shape: [1,1,8192] + ov::Tensor output_tensor(ov::element::f32, dst_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + // Execute inference, the computation graph flattens the data of src0 and reshapes it to the shape of dst->ne, and writes it directly to dst->data + infer_request.infer(); return; } @@ -652,6 +687,70 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t nb0 = dst->nb[0]; if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { + // Assume that the data type is f32 and each element is 4 bytes + const size_t element_size = ggml_type_size(src0->type); // 4 bytes + + // Logically, the number of valid elements per row is 3072 (src0->ne[0]), and the number of rows is 7 (src0->ne[1]) + size_t valid_elems = static_cast(src0->ne[0]); // 3072 + size_t num_rows = static_cast(src0->ne[1]); // 7 + + // Number of floats physically stored per row = nb[1] / element_size = 36864/4 = 9216 + size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216 + + // Total number of physical elements = (num_rows - 1)*phys_stride + valid_elems + size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + // size_t total_phys = num_rows * phys_stride; + + // 1. Wrap src0->data into a 1D tensor with shape [58368] + ov::Shape flat_input_shape = { total_phys }; + auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); + + // 2. Construct index tensor idx with shape [3072,7] + // For each logical position (i,j) (i in [0,3072), j in [0,7)), calculate index = j*phys_stride + i. + std::vector indices; + indices.reserve(valid_elems * num_rows); + for (size_t j = 0; j < num_rows; j++) { + for (size_t i = 0; i < valid_elems; i++) { + indices.push_back(static_cast(j * phys_stride + i)); + } + } + ov::Shape indices_shape = { valid_elems, num_rows }; // [3072,7] + auto indices_const = ov::op::v0::Constant::create(ov::element::i64, indices_shape, indices); + + // 3. Use the Gather operator (axis=0) to collect valid data + // Note: The third parameter is axis, and a value of 0 means collecting data from the 1D input according to the index + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto gathered = std::make_shared(flat_input_param, indices_const, axis_const); + // The shape of gathered should be [3072,7] + + // 4. Reshape gathered into a 4D tensor [3072,7,1,1] + auto reshape_const = ov::op::v0::Constant::create( + ov::element::i64, {4}, std::vector{ static_cast(valid_elems), static_cast(num_rows), 1, 1 } + ); + auto reshaped = std::make_shared(gathered, reshape_const, false); + // The reshaped shape is [3072,7,1,1] + + // 5. Construct the model and output it as reshaped + auto model = std::make_shared(ov::OutputVector{reshaped}, ov::ParameterVector{flat_input_param}); + + // --- Compile and execute --- + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // Construct input Tensor: directly wrap src0->data, shape is flat_input_shape = [58368] + ov::Tensor input_tensor(ov::element::f32, flat_input_shape, src0->data); + infer_request.set_input_tensor(0, input_tensor); + + // Construct output Tensor: dst is continuous storage, and its logical shape is [3072,7,1,1] + ov::Shape output_shape = { valid_elems, num_rows, 1, 1 }; + ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + // Execute inference. The computation graph uses Gather to collect the first 3072 valid elements of each row of src0, + // and reshape them to [3072,7,1,1] and write them directly to dst->data + infer_request.infer(); + /* for (size_t i01 = 0; i01 < ne01; ++i01) { const char *src_row = reinterpret_cast(src0->data) + i01 * nb01; char *dst_row = reinterpret_cast(dst->data) + i01 * dst->nb[1]; @@ -660,7 +759,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { ov::Tensor dst_row_tensor(ov::element::f32, {ne00}, reinterpret_cast(dst_row)); std::memcpy(dst_row_tensor.data(), src_row_tensor.data(), ne00 * sizeof(float)); - } + }*/ return; } @@ -673,6 +772,72 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // dst->ne =[3072,7,1,1], dst->nb =[4,12288,86016,86016], dst->type=GGML_TYPE_F32 // dst->src[0]->ne=[96,32,7,1], dst->src[0]->nb=[4,2688,384,86016], dst->src[0]->type=GGML_TYPE_F32 if (ggml_is_contiguous(dst)) { + size_t valid_i = static_cast(src0->ne[0]); // 96 + size_t valid_j = static_cast(src0->ne[1]); // 32 + size_t valid_k = static_cast(src0->ne[2]); // 7 + + // Output the logical shape of dst: dst->ne = [3072, 7, 1, 1] + // 3072 = 32 * 96, 7 is consistent with src0->ne[2] + size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 + + // Physics step length: + size_t stride_j = static_cast(src0->nb[1]) / ggml_type_size(src0->type); // 2688/4 = 672 + size_t stride_k = static_cast(src0->nb[2]) / ggml_type_size(src0->type); // 384/4 = 96 + + // Construct index array, output order: for k in [0,6], for j in [0,31], for i in [0,95]: + // desired input index = j * stride_j + k * stride_k + i + std::vector indices; + indices.reserve(total_valid); + for (size_t k = 0; k < valid_k; k++) { + for (size_t j = 0; j < valid_j; j++) { + for (size_t i = 0; i < valid_i; i++) { + int64_t idx = static_cast(j * stride_j + k * stride_k + i); + indices.push_back(idx); + } + } + } + // The size of indices should be 21504 + + // 1. Construct input: treat src0->data as a 1D tensor. The valid range is 0~21503. + ov::Shape flat_input_shape = { total_valid }; + auto input_param = std::make_shared(ov::element::f32, flat_input_shape); + + // 2. Construct index constant: 1D tensor, shape [21504] + ov::Shape indices_shape = { total_valid }; + auto indices_const = ov::op::v0::Constant::create(ov::element::i64, indices_shape, indices); + + // 3. Set axis=0 (collect data from 1D input) + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + + // 4. Use the Gather operator (OpenVINO v8 Gather is used here) to collect valid data + auto gathered = std::make_shared(input_param, indices_const, axis_const); + // gathered has a shape of [21504] + + // 5. Reshape gathered to [3072,7,1,1], because 3072*7 = 21504 + ov::Shape target_shape = { static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }; // [3072,7,1,1] + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {4}, + std::vector{ static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }); + auto reshaped = std::make_shared(gathered, reshape_const, false); + + // 6. Construct model + auto model = std::make_shared(ov::OutputVector{reshaped}, ov::ParameterVector{input_param}); + + // --- Compile and execute --- + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // Construct input Tensor: directly wrap src0->data. Note: src0->data is regarded as a one-dimensional array according to the physical valid area, flat_input_shape: [21504] + ov::Tensor input_tensor(ov::element::f32, flat_input_shape, src0->data); + infer_request.set_input_tensor(0, input_tensor); + + // Construct output Tensor: dst->data is stored continuously, with shape target_shape: [3072,7,1,1] + ov::Tensor output_tensor(ov::element::f32, target_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + // Execute reasoning: The computation graph uses Gather+Reshape to collect each valid element of src0 in a predetermined order and write it directly to dst->data + infer_request.infer(); + /* const size_t rs = ne00 * element_size; // Row size in bytes for dst // Create OpenVINO tensors for source and destination @@ -699,7 +864,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Copy row std::memcpy(dst_ptr, src0_ptr, rs); - } + }*/ return; } std::cout << "Duplication of bytes completed successfully." << std::endl; @@ -746,7 +911,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { ov::ResultVector{std::make_shared(dst_output)}, ov::ParameterVector{src_input}, "ContiguousCopy"); - // Compile and execute the model + // Compile and execute the model auto compiled_model = core.compile_model(model, "CPU"); ov::Tensor src_tensor(ov::element::f32, src_shape, src0->data); @@ -757,6 +922,93 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { infer_request.set_output_tensor(0, dst_tensor); infer_request.infer(); } else { + // In this example, the logical shape is [7,3072,1,1]. + // Here we assume that the number of "rows" is 3072 and the number of "columns" is 7. + const size_t num_cols = static_cast(dst->ne[0]); // 7 + const size_t num_rows = static_cast(dst->ne[1]); // 3072 + const size_t total_elems = num_cols * num_rows; // 7 * 3072 = 21504 + + // For src0: + // src0->nb[0] = 12288, so the stride along logical dimension 0 = 12288/4 = 3072 (f32) + // const size_t src_stride0 = 12288 / ggml_type_size(src0->type); // 3072 + const size_t src_stride0 = src0->nb[0] / ggml_type_size(src0->type); // 3072 + + // Construct index array (length 21504), in flat output order (row-first, row length = 7): + // For output flat index n, set: + // r = n / 7, c = n % 7. + // Valid data index corresponding to src0 = c * src_stride0 + r. + std::vector indices; + indices.reserve(total_elems); + for (size_t n = 0; n < total_elems; n++) { + size_t r = n / num_cols; // r in [0,3072) + size_t c = n % num_cols; // c in [0,7) + int64_t idx = static_cast(c * src_stride0 + r); + indices.push_back(idx); + } + + // --- Construct OpenVINO calculation graph --- + // 1. Encapsulate src0->data into 1D input Tensor with shape [21504] + ov::Shape flat_shape = { total_elems }; + auto input_param = std::make_shared(ov::element::f32, flat_shape); + + // 2. Constructs an index constant with a shape of [21504] + auto indices_const = ov::op::v0::Constant::create(ov::element::i64, flat_shape, indices); + + // 3. Construct axis constant, axis = 0 + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + + // 4. Use the Gather operator to collect valid data. The result shape is [21504], type f32 + auto gathered = std::make_shared(input_param, indices_const, axis_const); + + // 5. Convert data types: f32 to f16 + auto converted = std::make_shared(gathered, ov::element::f16); + + // 6. Reshape into a 2D tensor with shape [num_rows, num_cols] = [3072,7]. + // Note: row-first arrangement is used here, that is, the 0th dimension represents rows (3072 rows) and the 1st dimension represents columns (7 consecutive elements) + std::vector new_shape = { static_cast(num_rows), static_cast(num_cols) }; + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {2}, new_shape); + auto reshaped = std::make_shared(converted, reshape_const, false); + + // 7. To keep consistent with the logical shape of dst [7,3072,1,1] (note: the order of ne arrays in ggml may be different from the intuitive), + // Here we finally need to get a flat continuous result with row-first arrangement of [3072,7] (i.e., 7 consecutive elements per row). + // If you need to expand to 4D, you can further reshape, but here we only focus on two-dimensional valid data. + // Let output_shape = [num_rows, num_cols] = [3072,7] + + // 8. Construct model: input is input_param, output is reshaped + auto model = std::make_shared(ov::OutputVector{ reshaped }, ov::ParameterVector{ input_param }); + + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // 9. Construct input Tensor: directly wrap src0->data, shape is flat_shape, type f32 + ov::Tensor input_tensor(ov::element::f32, flat_shape, src0->data); + infer_request.set_input_tensor(0, input_tensor); + + // 10. Since dst is non-contiguous (row spacing is dst->nb[1] = 64 bytes), + // We let the model output to a temporary continuous buffer and then copy it row by row to dst->data. + ov::Shape contig_output_shape = { num_rows, num_cols }; // [3072,7] + // Allocate a temporary buffer (to store f16 data, number of elements = 3072*7) + std::vector temp_output(total_elems); + ov::Tensor output_tensor_contig(ov::element::f16, contig_output_shape, temp_output.data()); + infer_request.set_output_tensor(0, output_tensor_contig); + + // 11. Execute inference, the computation graph will collect, convert, and reshape to obtain a continuous f16 result + infer_request.infer(); + + // 12. Copy temporary output to dst->data by line, considering the non-continuous storage of dst (each line is separated by dst->nb[1] bytes) + // Each line of valid data is num_cols * sizeof(f16) = 7 * 2 = 14 bytes. + uint8_t *dst_ptr = static_cast(dst->data); + size_t dst_row_stride = static_cast(dst->nb[1]); // 64 bytes per row + size_t row_bytes = num_cols * ggml_type_size(dst->type); // 7 * 2 = 14 bytes + for (size_t r = 0; r < num_rows; r++) { + // Temporary output is a continuous two-dimensional array, offset = r * num_cols + uint8_t *src_row_ptr = reinterpret_cast(temp_output.data()) + r * row_bytes; + // Copy row_bytes to the starting address of the dst row + std::memcpy(dst_ptr + r * dst_row_stride, src_row_ptr, row_bytes); + } + + /** // Non-contiguous case: element-wise copy for (int64_t i03 = 0; i03 < dst->ne[3]; ++i03) { for (int64_t i02 = 0; i02 < dst->ne[2]; ++i02) { @@ -774,7 +1026,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { } } } - } + }*/ } } @@ -828,6 +1080,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes && + // std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { i++; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 355a95d978c94..945b5cbf7a676 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -4,6 +4,7 @@ #include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { + m_node_op_name[node->name] = ggml_op_name(node->op); switch (node->op) { // Unary OPs case GGML_OP_UNARY: diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 2afde161ee4d9..f4b91f92513bc 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -65,6 +65,15 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool check_if_continuous() const override { return m_continuous; } + + virtual const std::string& get_node_op_name(const std::string& name) const { + auto it = m_node_op_name.find(name); + if (it != m_node_op_name.end()) { + return it->second; + } + return ""; + } + private: void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); @@ -79,5 +88,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { const std::string m_op_name; mutable std::string m_name; bool m_continuous; + std::map m_node_op_name; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 84c9001c5c611..88d603b4aea93 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -109,6 +109,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto output_names = ggml_decoder->get_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { + // std::string op_name = ggml_decoder->get_node_op_name(output_names[i]); auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); #ifdef GGML_OPENVINO_DEBUG From 4f83f2617481bdf6a8e55130ca8d404b6bc1a230 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 19 Feb 2025 17:51:07 +0800 Subject: [PATCH 027/156] add implementation of CPY when the output tensor is non-contiguous --- ggml/src/ggml-openvino.cpp | 147 ++++++++++++------------------------- 1 file changed, 48 insertions(+), 99 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 99a32b1dfdc9c..dc45f0fe6dbef 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -529,7 +529,7 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { std::vector full_dst_shape = { dst->ne[2], dst->ne[1], dst->ne[0]}; auto final_shape_const = ov::op::v0::Constant::create(ov::element::i64, { full_dst_shape.size() }, full_dst_shape); - auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src1, param_src0}); + auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1}); ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); @@ -538,8 +538,8 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { // Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data); ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data); - infer_request.set_input_tensor(0, tensor_src1); - infer_request.set_input_tensor(1, tensor_src0); + infer_request.set_input_tensor(0, tensor_src0); + infer_request.set_input_tensor(1, tensor_src1); ov::Tensor tensor_dst(ov::element::f32, ov::Shape(full_dst_shape.begin(), full_dst_shape.end()), dst->data); infer_request.set_output_tensor(0, tensor_dst); @@ -548,9 +548,6 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { return ; } - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - // Valid shape std::vector eff_shape_src0 = get_effective_shape(src0); std::vector eff_shape_src1 = get_effective_shape(src1); @@ -604,13 +601,13 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data); std::shared_ptr matmul = std::make_shared(reshape_src1, A_for_mul, false, false); - auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src1, param_flat_src0}); + auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1}); auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src1); - infer_request.set_input_tensor(1, tensor_src0); + infer_request.set_input_tensor(0, tensor_src0); + infer_request.set_input_tensor(1, tensor_src1); infer_request.set_output_tensor(0, tensor_dst); infer_request.infer(); } @@ -922,111 +919,63 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { infer_request.set_output_tensor(0, dst_tensor); infer_request.infer(); } else { - // In this example, the logical shape is [7,3072,1,1]. - // Here we assume that the number of "rows" is 3072 and the number of "columns" is 7. - const size_t num_cols = static_cast(dst->ne[0]); // 7 - const size_t num_rows = static_cast(dst->ne[1]); // 3072 - const size_t total_elems = num_cols * num_rows; // 7 * 3072 = 21504 - - // For src0: - // src0->nb[0] = 12288, so the stride along logical dimension 0 = 12288/4 = 3072 (f32) - // const size_t src_stride0 = 12288 / ggml_type_size(src0->type); // 3072 - const size_t src_stride0 = src0->nb[0] / ggml_type_size(src0->type); // 3072 - - // Construct index array (length 21504), in flat output order (row-first, row length = 7): - // For output flat index n, set: - // r = n / 7, c = n % 7. - // Valid data index corresponding to src0 = c * src_stride0 + r. - std::vector indices; - indices.reserve(total_elems); - for (size_t n = 0; n < total_elems; n++) { - size_t r = n / num_cols; // r in [0,3072) - size_t c = n % num_cols; // c in [0,7) - int64_t idx = static_cast(c * src_stride0 + r); - indices.push_back(idx); + std::vector gather_idx; + for (int row = 0; row < dst->src[0]->ne[1]; row++) { + for (int col = 0; col < dst->src[0]->ne[0]; col++) { + gather_idx.push_back((row*dst->src[0]->nb[1]+col*dst->src[0]->nb[0])/4); + } } + size_t N = gather_idx.size(); + ov::Shape gather_idx_shape = {N, 1}; + std::vector scatter_idx; + for (int row = 0; row < dst->ne[1]; row++) { + for (int col = 0; col < dst->ne[0]; col++) { + scatter_idx.push_back(row * dst->nb[1] / 2 + col); + } + } + ov::Shape scatter_idx_shape = {N, 1}; - // --- Construct OpenVINO calculation graph --- - // 1. Encapsulate src0->data into 1D input Tensor with shape [21504] - ov::Shape flat_shape = { total_elems }; - auto input_param = std::make_shared(ov::element::f32, flat_shape); - - // 2. Constructs an index constant with a shape of [21504] - auto indices_const = ov::op::v0::Constant::create(ov::element::i64, flat_shape, indices); - - // 3. Construct axis constant, axis = 0 - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + // param_src0 shape => 1D, rank=1, size is large enough. For example, row*col= 21504 + some padding, e.g. 80000 + // ov::Shape flat_src0_shape = {80000}; + ov::Shape flat_src0_shape = {dst->src[0]->nb[2]}; + auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); - // 4. Use the Gather operator to collect valid data. The result shape is [21504], type f32 - auto gathered = std::make_shared(input_param, indices_const, axis_const); + auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); + auto gather_axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto gathered = std::make_shared( + param_src0, gather_indices_const, gather_axis_const); - // 5. Convert data types: f32 to f16 auto converted = std::make_shared(gathered, ov::element::f16); - // 6. Reshape into a 2D tensor with shape [num_rows, num_cols] = [3072,7]. - // Note: row-first arrangement is used here, that is, the 0th dimension represents rows (3072 rows) and the 1st dimension represents columns (7 consecutive elements) - std::vector new_shape = { static_cast(num_rows), static_cast(num_cols) }; - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {2}, new_shape); - auto reshaped = std::make_shared(converted, reshape_const, false); + // param_dst_base shape => 1D, rank=1, size够大, e.g. row=3072 => i up to 3071 => offset i*64=196544 + j*2, e.g.200000 + // ov::Shape flat_dst_shape = {200000, 1}; + ov::Shape flat_dst_shape = {dst->nb[2], 1}; + auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); - // 7. To keep consistent with the logical shape of dst [7,3072,1,1] (note: the order of ne arrays in ggml may be different from the intuitive), - // Here we finally need to get a flat continuous result with row-first arrangement of [3072,7] (i.e., 7 consecutive elements per row). - // If you need to expand to 4D, you can further reshape, but here we only focus on two-dimensional valid data. - // Let output_shape = [num_rows, num_cols] = [3072,7] + auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx); - // 8. Construct model: input is input_param, output is reshaped - auto model = std::make_shared(ov::OutputVector{ reshaped }, ov::ParameterVector{ input_param }); + // ScatterNDUpdate( base, scatter_indices, updates ) + // scatter_indices last dimension = 1 => each index is 1D coordinate + auto scatter = std::make_shared( + param_dst_base, scatter_indices_const, converted + ); + + ov::ParameterVector params = { param_src0, param_dst_base }; + auto model = std::make_shared(ov::OutputVector{ scatter }, params); - ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - // 9. Construct input Tensor: directly wrap src0->data, shape is flat_shape, type f32 - ov::Tensor input_tensor(ov::element::f32, flat_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - // 10. Since dst is non-contiguous (row spacing is dst->nb[1] = 64 bytes), - // We let the model output to a temporary continuous buffer and then copy it row by row to dst->data. - ov::Shape contig_output_shape = { num_rows, num_cols }; // [3072,7] - // Allocate a temporary buffer (to store f16 data, number of elements = 3072*7) - std::vector temp_output(total_elems); - ov::Tensor output_tensor_contig(ov::element::f16, contig_output_shape, temp_output.data()); - infer_request.set_output_tensor(0, output_tensor_contig); + ov::Tensor tensor_src0(ov::element::f32, flat_src0_shape, src0->data); + ov::Tensor tensor_dst_base(ov::element::f16, flat_dst_shape, dst->data); - // 11. Execute inference, the computation graph will collect, convert, and reshape to obtain a continuous f16 result - infer_request.infer(); + infer_request.set_input_tensor(0, tensor_src0); + infer_request.set_input_tensor(1, tensor_dst_base); - // 12. Copy temporary output to dst->data by line, considering the non-continuous storage of dst (each line is separated by dst->nb[1] bytes) - // Each line of valid data is num_cols * sizeof(f16) = 7 * 2 = 14 bytes. - uint8_t *dst_ptr = static_cast(dst->data); - size_t dst_row_stride = static_cast(dst->nb[1]); // 64 bytes per row - size_t row_bytes = num_cols * ggml_type_size(dst->type); // 7 * 2 = 14 bytes - for (size_t r = 0; r < num_rows; r++) { - // Temporary output is a continuous two-dimensional array, offset = r * num_cols - uint8_t *src_row_ptr = reinterpret_cast(temp_output.data()) + r * row_bytes; - // Copy row_bytes to the starting address of the dst row - std::memcpy(dst_ptr + r * dst_row_stride, src_row_ptr, row_bytes); - } + ov::Tensor out_tensor(ov::element::f16, flat_dst_shape, dst->data); + infer_request.set_output_tensor(0, out_tensor); - /** - // Non-contiguous case: element-wise copy - for (int64_t i03 = 0; i03 < dst->ne[3]; ++i03) { - for (int64_t i02 = 0; i02 < dst->ne[2]; ++i02) { - for (int64_t i01 = 0; i01 < dst->ne[1]; ++i01) { - for (int64_t i00 = 0; i00 < dst->ne[0]; ++i00) { - const char *src_ptr = static_cast(src0->data) + - i00 * src0->nb[0] + i01 * src0->nb[1] + - i02 * src0->nb[2] + i03 * src0->nb[3]; - - char *dst_ptr = static_cast(dst->data) + - i00 * dst->nb[0] + i01 * dst->nb[1] + - i02 * dst->nb[2] + i03 * dst->nb[3]; - - *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(*(const float *)src_ptr); - } - } - } - }*/ + infer_request.infer(); } } From eb6b03c2b3a67e7568c7c727a28d7d8b4fb10d61 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 25 Feb 2025 12:43:12 +0800 Subject: [PATCH 028/156] add tmp source code files --- examples/simple/simple.cpp | 2 +- ggml/src/ggml-openvino.cpp | 63 ++---- ggml/src/ggml-openvino/decoder.h | 15 ++ ggml/src/ggml-openvino/ggml-decoder.cpp | 284 ++++++++++++++++++------ ggml/src/ggml-openvino/ggml-decoder.h | 17 +- ggml/src/ggml-openvino/utils.cpp | 50 ++++- setup.sh | 2 + 7 files changed, 318 insertions(+), 115 deletions(-) create mode 100755 setup.sh diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index d09771d10457f..9e6c678e830aa 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -140,7 +140,7 @@ int main(int argc, char ** argv) { std::string s(buf, n); printf("%s", s.c_str()); } - + printf("\n"); // prepare a batch for the prompt llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index dc45f0fe6dbef..2e20e8e39b1bb 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -685,8 +685,6 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { // Assume that the data type is f32 and each element is 4 bytes - const size_t element_size = ggml_type_size(src0->type); // 4 bytes - // Logically, the number of valid elements per row is 3072 (src0->ne[0]), and the number of rows is 7 (src0->ne[1]) size_t valid_elems = static_cast(src0->ne[0]); // 3072 size_t num_rows = static_cast(src0->ne[1]); // 7 @@ -740,7 +738,10 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { infer_request.set_input_tensor(0, input_tensor); // Construct output Tensor: dst is continuous storage, and its logical shape is [3072,7,1,1] - ov::Shape output_shape = { valid_elems, num_rows, 1, 1 }; + ov::Shape output_shape = { static_cast(dst->ne[0]), + static_cast(dst->ne[1]), + static_cast(dst->ne[2]), + static_cast(dst->ne[3])}; ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); @@ -811,7 +812,10 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // gathered has a shape of [21504] // 5. Reshape gathered to [3072,7,1,1], because 3072*7 = 21504 - ov::Shape target_shape = { static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }; // [3072,7,1,1] + ov::Shape target_shape = { static_cast(dst->ne[0]), + static_cast(dst->ne[1]), + static_cast(dst->ne[2]), + static_cast(dst->ne[3])}; // [3072,7,1,1] auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{ static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }); auto reshaped = std::make_shared(gathered, reshape_const, false); @@ -834,34 +838,6 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Execute reasoning: The computation graph uses Gather+Reshape to collect each valid element of src0 in a predetermined order and write it directly to dst->data infer_request.infer(); - /* - const size_t rs = ne00 * element_size; // Row size in bytes for dst - - // Create OpenVINO tensors for source and destination - // The tensors are reshaped to a 2D structure (num_rows x ne00) for easier iteration and compatibility with the simplified loop. - ov::Tensor src_tensor(ov::element::f32, ov::Shape{ne03 * ne02 * ne01, ne00}, src0->data); - ov::Tensor dst_tensor(ov::element::f32, ov::Shape{ne03 * ne02 * ne01, ne00}, dst->data); - - // Perform the copy in a single loop - const size_t num_rows = ne03 * ne02 * ne01; - for (size_t row = 0; row < num_rows; ++row) { - // Calculate the source row pointer based on original strides - // The source row pointer is calculated based on the combined index row and the strides nb03, nb02, and nb01. - const char* src0_ptr = (char*)src_tensor.data() + - // Calculates which block of the i03 dimension the current row belongs to - (row / (ne02 * ne01)) * nb03 + // 0 - // Calculates which block of the i02 dimension the current row belongs to within the current i03 block. - ((row / ne01) % ne02) * nb02 + // 0, 0,......, 0,384, 384,......, 384,768,......, 2304 - // Calculates the position within the current i02 block in terms of the i01 index. - (row % ne01) * nb01; // 0,2688,......,83328, 0, 2688,......,83328, 0,......, 83328 - - // Destination row pointer is linear - // Since dst is contiguous, its rows are accessed linearly using a single stride rs, simplifying the destination pointer calculation. - char* dst_ptr = (char*)dst_tensor.data() + row * rs; - - // Copy row - std::memcpy(dst_ptr, src0_ptr, rs); - }*/ return; } std::cout << "Duplication of bytes completed successfully." << std::endl; @@ -939,6 +915,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { // ov::Shape flat_src0_shape = {80000}; ov::Shape flat_src0_shape = {dst->src[0]->nb[2]}; auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); + // auto param_src00 = std::make_shared(ov::element::f32, flat_src0_shape); auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); auto gather_axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); @@ -951,6 +928,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { // ov::Shape flat_dst_shape = {200000, 1}; ov::Shape flat_dst_shape = {dst->nb[2], 1}; auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); + // auto param_dst_base11 = std::make_shared(ov::element::f16, flat_dst_shape); auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx); @@ -961,6 +939,8 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { ); ov::ParameterVector params = { param_src0, param_dst_base }; + // ov::ParameterVector params = { param_src0}; + // ov::ParameterVector params = { param_src00, param_dst_base11}; auto model = std::make_shared(ov::OutputVector{ scatter }, params); auto compiled_model = core.compile_model(model, "CPU"); @@ -1009,16 +989,17 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } + // openvino_frontend_compute(backend, cgraph); // Process nodes in order for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); + ggml_backend_openvino_view(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { ggml_backend_openvino_transpose(cgraph->nodes[i]); } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { @@ -1029,8 +1010,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes && - // std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && - std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && + std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && + //std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { i++; } @@ -1270,7 +1251,7 @@ static const std::set& openvino_ops = []() -> const std::set shape; + std::vector stride; +}; // TODO: Directly include from openvino class GgmlDecoder : public DecoderBase { public: @@ -14,6 +21,8 @@ class GgmlDecoder : public DecoderBase { virtual PartialShape get_input_shape(const std::string& name) const = 0; + virtual std::vector get_input_stride(const std::string& name) const = 0; + virtual element::Type get_input_type(const std::string& name) const = 0; virtual size_t get_input_size() const = 0; @@ -27,6 +36,10 @@ class GgmlDecoder : public DecoderBase { virtual std::vector get_input_names() const = 0; + virtual const std::string& get_node_op_name(const std::string& name) const = 0; + + // virtual const struct tensor_info get_node_op_info(const std::string& name) const = 0; + virtual PartialShape get_output_shape(const std::string& name) const = 0; virtual element::Type get_output_type(const std::string& name) const = 0; @@ -53,6 +66,8 @@ class GgmlDecoder : public DecoderBase { virtual bool check_if_continuous() const = 0; + virtual const std::vector>& get_params() const = 0; + }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 945b5cbf7a676..a412f8b75aa2c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -2,9 +2,13 @@ #include #include #include +#include +#include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { m_node_op_name[node->name] = ggml_op_name(node->op); + std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_input_" + ggml_op_name(node->src[0]->op); + std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); switch (node->op) { // Unary OPs case GGML_OP_UNARY: @@ -16,6 +20,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; outputs[node->name] = node; m_input_names.push_back(node->src[0]->name); + m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); break; } @@ -25,76 +30,73 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; outputs[node->name] = node; m_input_names.push_back(node->src[0]->name); + m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); m_continuous = true; + + ov::Shape flat_shape = { static_cast(ggml_nelements(node)) }; + auto input_param = std::make_shared(ov::element::f32, flat_shape); + m_params.push_back(input_param); + break; } if (node->src[0]->type == node->type && node->src[0]->ne[0] == node->ne[0] && - node->src[0]->nb[0] == ggml_type_size(node->src[0]->type) && node->nb[0] == ggml_type_size(node->src[0]->type)) { + node->src[0]->nb[0] == ggml_type_size(node->src[0]->type) && + node->nb[0] == ggml_type_size(node->src[0]->type)) { - for (size_t i01 = 0; i01 < node->src[0]->ne[1]; ++i01) { - const char *src_row = reinterpret_cast(node->src[0]->data) + i01 * node->src[0]->nb[1]; - char *dst_row = reinterpret_cast(node->data) + i01 * node->nb[1]; - std::memcpy(dst_row, src_row, node->src[0]->ne[0] * ggml_type_size(node->src[0]->type)); - } + // for (size_t i01 = 0; i01 < node->src[0]->ne[1]; ++i01) { + // const char *src_row = reinterpret_cast(node->src[0]->data) + i01 * node->src[0]->nb[1]; + // char *dst_row = reinterpret_cast(node->data) + i01 * node->nb[1]; + // std::memcpy(dst_row, src_row, node->src[0]->ne[0] * ggml_type_size(node->src[0]->type)); + // } - inputs[node->name] = node; + inputs[node->src[0]->name] = node->src[0]; outputs[node->name] = node; - m_input_names.push_back(node->name); + m_input_names.push_back(node->src[0]->name); + m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); + + const size_t element_size = ggml_type_size(node->src[0]->type); + size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 + size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 + size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 + size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + ov::Shape flat_input_shape = { total_phys }; + auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); + m_params.push_back(flat_input_param); + m_continuous = false; break; } - // if (ggml_is_contiguous(node)) { - const size_t rs = node->src[0]->ne[0] * ggml_type_size(node->src[0]->type); // Row size in bytes for dst - - // Create OpenVINO tensors for source and destination - // The tensors are reshaped to a 2D structure (num_rows x ne00) for easier iteration and compatibility with the simplified loop. - ov::Tensor src_tensor(ov::element::f32, - ov::Shape{node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1], node->src[0]->ne[0]}, - node->src[0]->data); - ov::Tensor dst_tensor(ov::element::f32, - ov::Shape{node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1], node->src[0]->ne[0]}, - node->data); - - // Perform the copy in a single loop - const size_t num_rows = node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1]; - for (size_t row = 0; row < num_rows; ++row) { - // Calculate the source row pointer based on original strides - // The source row pointer is calculated based on the combined index row and the strides nb03, nb02, and nb01. - const char* src0_ptr = (char*)src_tensor.data() + - // Calculates which block of the i03 dimension the current row belongs to - (row / (node->src[0]->ne[2] * node->src[0]->ne[1])) * node->src[0]->nb[3] + // 0 - // Calculates which block of the i02 dimension the current row belongs to within the current i03 block. - ((row / node->src[0]->ne[1]) % node->src[0]->ne[2]) * node->src[0]->nb[2] + // 0, 0,......, 0,384, 384,......, 384,768,......, 2304 - // Calculates the position within the current i02 block in terms of the i01 index. - (row % node->src[0]->ne[1]) * node->src[0]->nb[1]; // 0,2688,......,83328, 0, 2688,......,83328, 0,......, 83328 - - // Destination row pointer is linear - // Since dst is contiguous, its rows are accessed linearly using a single stride rs, simplifying the destination pointer calculation. - char* dst_ptr = (char*)dst_tensor.data() + row * rs; - - // Copy row - std::memcpy(dst_ptr, src0_ptr, rs); - } - - inputs[node->name] = node; + if (ggml_is_contiguous(node)) { + inputs[node->src[0]->name] = node->src[0]; outputs[node->name] = node; - m_input_names.push_back(node->name); + m_input_names.push_back(node->src[0]->name); + m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); + + size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 + size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 + size_t valid_k = static_cast(node->src[0]->ne[2]); // 7 + size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 + ov::Shape flat_input_shape = { total_valid }; + auto input_param = std::make_shared(ov::element::f32, flat_input_shape); + m_params.push_back(input_param); + m_continuous = false; break; - //} + } } case GGML_OP_CPY: { if (ggml_is_contiguous(node)) { - inputs[node->src[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->src[0]->name); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); m_continuous = true; break; } else { @@ -108,12 +110,40 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; - inputs[node->name] = node; - outputs[node->name] = node; - m_input_names.push_back(node->name); - m_output_names.push_back(node->name); + inputs[node_name] = node; + outputs[node_name] = node; + m_input_names.push_back(node_name); + m_node_op_name[node_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); m_continuous = false; break; + + // inputs[node->src[0]->name] = node->src[0]; + // std::string temp_name = node->src[0]->name + std::string("_cpy_tmp"); + // inputs[temp_name] = node; + + // outputs[node->name] = node; + // m_input_names.push_back(node->src[0]->name); + // m_input_names.push_back(temp_name); + // m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); + // m_node_op_name[temp_name] = ggml_op_name(node->op); + + // m_output_names.push_back(node->name); + + // ov::Shape flat_src0_shape = {80000}; + // auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); + // m_params.push_back(param_src0); + + // std::cout << "decoder ADDR-0: " << param_src0.get() << std::endl; + + // ov::Shape flat_dst_shape = {200000, 1}; + // auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); + // m_params.push_back(param_dst_base); + + // std::cout << "decoder ADDR-1: " << param_dst_base.get() << std::endl; + + // m_continuous = false; + // break; } } // For view, input is node itself @@ -122,49 +152,76 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname] = node; outputs[node->name] = node; m_input_names.push_back(node->name); + m_node_op_name[node->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); break; } // SCALE case GGML_OP_SCALE: { - inputs[node->src[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->name); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(node_name); + // m_node_op_name[node_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); + break; + } + case GGML_OP_MUL_MAT: + { + std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); + if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { + m_continuous = false; + } else { + m_continuous = true; + } + inputs[src0_name] = node->src[0]; + inputs[src1_name] = node->src[1]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_input_names.push_back(src1_name); + m_node_op_name[src1_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); break; } // OPs with 2 inputs case GGML_OP_ADD: case GGML_OP_DIV: case GGML_OP_MUL: - case GGML_OP_MUL_MAT: case GGML_OP_SUB: case GGML_OP_GET_ROWS: case GGML_OP_SOFT_MAX: { - inputs[node->src[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->src[0]->name); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); if (node->src[1]) { - inputs[node->src[1]->name] = node->src[1]; - m_input_names.push_back(node->src[1]->name); + std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); + inputs[src1_name] = node->src[1]; + m_node_op_name[src1_name] = ggml_op_name(node->op); + m_input_names.push_back(src1_name); } break; } // OPs with 3 inputs: case GGML_OP_ROPE: { + std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); inputs[node->src[0]->name] = node->src[0]; inputs[node->src[1]->name] = node->src[1]; m_input_names.push_back(node->src[0]->name); + m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); m_input_names.push_back(node->src[1]->name); + m_node_op_name[node->src[1]->name] = ggml_op_name(node->op); outputs[node->name] = node; m_output_names.push_back(node->name); if (node->src[2]) { + std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs) + "_input_" + ggml_op_name(node->src[2]->op); inputs[node->src[2]->name] = node->src[2]; m_input_names.push_back(node->src[2]->name); + m_node_op_name[node->src[2]->name] = ggml_op_name(node->op); } break; } @@ -173,6 +230,77 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapn_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + file << " - " << std::setw(3) << i << ": [ " + << std::setw(5) << node->ne[0] << ", " + << std::setw(5) << node->ne[1] << ", " + << std::setw(5) << node->ne[2] << "] " + << std::left << std::setw(16) << ggml_op_name(node->op) << std::right << " " + << " " << node->name + << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ") << "\n"; + + if (node->src[0]) { + file << std::setw(10) << " [ " + << std::setw(5) << node->src[0]->ne[0] << ", " + << std::setw(5) << node->src[0]->ne[1] << ", " + << std::setw(5) << node->src[0]->ne[2] << "] " + << std::setw(12) + << "0: " << ggml_op_name(node->src[0]->op) << " "; + // // Custom logic to handle '\000' + // const char* name_ptr = node->src[0]->name; + // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { + // file << *name_ptr; + // name_ptr++; + // } + file << node->src[0]->name; + file << "\n"; + } + if (node->src[1]) { + file << std::setw(10) << " [ " + << std::setw(5) << node->src[1]->ne[0] << ", " + << std::setw(5) << node->src[1]->ne[1] << ", " + << std::setw(5) << node->src[1]->ne[2] << "] " + << std::setw(12) + << "1: " << ggml_op_name(node->src[1]->op) << " "; + // // Custom logic to handle '\000' + // const char* name_ptr = node->src[1]->name; + // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { + // file << *name_ptr; + // name_ptr++; + // } + file << node->src[1]->name; + file << "\n"; + } + } + + file << "n_leafs = " << cgraph->n_leafs << "\n"; + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_tensor * node = cgraph->leafs[i]; + + file << " - " << std::setw(3) << i << ": [ " + << std::setw(5) << node->ne[0] << ", " + << std::setw(5) << node->ne[1] << "] " + << std::setw(8) << ggml_op_name(node->op) << " " + << std::setw(16) << ggml_get_name(node) << "\n"; + } + + file << "========================================\n"; + + file.close(); +} + GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) :m_cgraph(cgraph), m_node(node), @@ -193,7 +321,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr set_input_output(cur_node, m_inputs, m_outputs); } #ifdef GGML_OPENVINO_DEBUG - ggml_graph_print(m_cgraph); + ggml_graph_op_print(m_cgraph); #endif } } @@ -204,6 +332,13 @@ ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { ggml_tensor * node = m_inputs.at(name); std::vector shape; + // [TODO], 在这里判断如果是MUL_MAT就设置shape为一维 + if(m_node_op_name.at(name) == "MUL_MAT") { + shape.push_back(static_cast(node->ne[0] * node->ne[1] * node->ne[2])); + input_shape = ov::PartialShape(shape); + return input_shape; + } + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { if (node->ne[i] == 0) { return input_shape; @@ -214,6 +349,15 @@ ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { return input_shape; } +std::vector GgmlOvDecoder::get_input_stride(const std::string& name) const { + std::vector stride; + ggml_tensor * node = m_inputs.at(name); + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + stride.push_back(static_cast(node->nb[i])); + } + return stride; +} + ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const { ov::element::Type type = ov::element::dynamic; switch (m_inputs.at(name)->type) { @@ -248,6 +392,18 @@ std::vector GgmlOvDecoder::get_input_names() const { return m_input_names; } +const std::string& GgmlOvDecoder::get_node_op_name(const std::string& name) const { + auto it = m_node_op_name.find(name); + if (it != m_node_op_name.end()) { + return it->second; + } + return ""; +} + +const std::vector>& GgmlOvDecoder::get_params() const { + return m_params; +} + ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { ov::PartialShape output_shape; // Use input_node->ne diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f4b91f92513bc..0921fd8bb5828 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -2,6 +2,7 @@ #include "decoder.h" #include "ggml.h" +#include "openvino/op/parameter.hpp" class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: @@ -16,6 +17,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual ov::PartialShape get_input_shape(const std::string& name) const override; + virtual std::vector get_input_stride(const std::string& name) const override; + virtual ov::element::Type get_input_type(const std::string& name) const override; virtual size_t get_input_size() const override; @@ -66,13 +69,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_continuous; } - virtual const std::string& get_node_op_name(const std::string& name) const { - auto it = m_node_op_name.find(name); - if (it != m_node_op_name.end()) { - return it->second; - } - return ""; - } + virtual const std::string& get_node_op_name(const std::string& name) const override; + // virtual const std::string& get_node_op_info(const std::string& name) const override; + + virtual const std::vector>& get_params() const override; private: void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); @@ -85,9 +85,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { ggml_tensor* m_node; std::vector m_nodes; std::vector> m_decoders; - const std::string m_op_name; + std::string m_op_name; mutable std::string m_name; bool m_continuous; std::map m_node_op_name; + std::vector> m_params; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 88d603b4aea93..8fa1f99a01007 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -13,13 +13,58 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, con std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { std::map input_tensors; auto input_names = ggml_decoder->get_input_names(); + // auto node_name = ggml_decoder->get_op_name(); for (size_t inp = 0; inp < input_names.size(); ++inp) { auto name = input_names[inp]; + auto node_op_name = ggml_decoder->get_node_op_name(name); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif - ov::Tensor input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + ov::Tensor input_tensor; + auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); + // if (node_op_name == "CPY" && (input_shape[0] != 7)) { + // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), {80000}, input_data); + + // } else if (node_op_name == "CONT" || node_op_name == "MUL_MAT") { + // // auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); + // // size_t total_size = 1; + // // for (auto dim : input_shape) { + // // total_size *= dim; + // // } + // // ov::Shape new_shape = {total_size}; + // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), {ggml_decoder->get_input_shape(name).to_shape()[0]}, input_data); + // } else { + if (node_op_name == "CONT" && ggml_decoder->check_if_continuous()) { + ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * + ggml_decoder->get_input_shape(name).to_shape()[1] * + ggml_decoder->get_input_shape(name).to_shape()[2] }; + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); + } else if ( node_op_name == "CONT" && + !ggml_decoder->check_if_continuous() && + input_shape[0] == 1) { + size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 3072 + size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 7 + ov::element::Type input_type = ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); + std::vector strides = ggml_decoder->get_input_stride(name); + size_t phys_stride = static_cast(strides[1]) / element_size; + size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; + ov::Shape flat_input_shape = { total_phys }; + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); + } else if (node_op_name == "CONT") { + size_t valid_i = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 96 + size_t valid_j = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 32 + size_t valid_k = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); // 7 + size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 + ov::Shape flat_input_shape = { total_valid }; + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); + } else { + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + } + // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + // } + input_tensors[name] = input_tensor; } return input_tensors; @@ -80,6 +125,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); + ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); + if (!model) { GGML_LOG_ERROR("Model is not converted \n"); } else { @@ -90,6 +137,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Loading a model to the device ov::CompiledModel compiled_model = core.compile_model(model); + ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); diff --git a/setup.sh b/setup.sh new file mode 100755 index 0000000000000..697639dd143c3 --- /dev/null +++ b/setup.sh @@ -0,0 +1,2 @@ +cmake --build build --parallel $(nproc) + From 40afe75dd974e46d383ad436fb8312779c7b2289 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 25 Feb 2025 17:29:43 +0800 Subject: [PATCH 029/156] Execute singel CONT operator is OK --- ggml/src/ggml-openvino.cpp | 8 +- ggml/src/ggml-openvino/decoder.h | 2 + ggml/src/ggml-openvino/ggml-decoder.cpp | 129 +++++++++++++----------- ggml/src/ggml-openvino/ggml-decoder.h | 2 + 4 files changed, 78 insertions(+), 63 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 2e20e8e39b1bb..e1c294a1d9f19 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -998,8 +998,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { ggml_backend_openvino_view(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); + // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { ggml_backend_openvino_transpose(cgraph->nodes[i]); } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { @@ -1010,8 +1010,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes && - std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && - //std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && + // std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && + // std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { i++; } diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index ef18c12144eba..9a884a33741bb 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -42,6 +42,8 @@ class GgmlDecoder : public DecoderBase { virtual PartialShape get_output_shape(const std::string& name) const = 0; + virtual std::vector get_output_stride(const std::string& name) const = 0; + virtual element::Type get_output_type(const std::string& name) const = 0; virtual int32_t* get_output_op_params(const std::string& name) const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a412f8b75aa2c..6a249c103f065 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -7,8 +7,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { m_node_op_name[node->name] = ggml_op_name(node->op); - std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_input_" + ggml_op_name(node->src[0]->op); - std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); + // Execute singel CONT operator is OK + std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_" + ggml_op_name(node->src[0]->op); + std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_" + ggml_op_name(node->op); + // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs); + // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs); switch (node->op) { // Unary OPs case GGML_OP_UNARY: @@ -17,21 +20,21 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->src[0]->name); - m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); break; } case GGML_OP_CONT: { if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node)) { - inputs[node->src[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->src[0]->name); - m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); m_continuous = true; ov::Shape flat_shape = { static_cast(ggml_nelements(node)) }; @@ -51,11 +54,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne[0] * ggml_type_size(node->src[0]->type)); // } - inputs[node->src[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->src[0]->name); - m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); const size_t element_size = ggml_type_size(node->src[0]->type); size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 @@ -71,11 +74,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->src[0]->name); - m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 @@ -98,6 +101,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_output_names.push_back(node_name); m_continuous = true; + + ov::Shape src_shape(node->src[0]->ne, node->src[0]->ne + 4); + auto input_param = std::make_shared(ov::element::f32, src_shape); + m_params.push_back(input_param); break; } else { for (int64_t i1 = 0; i1 < node->ne[1]; ++i1) { // ne[1] = 3072 @@ -118,57 +125,52 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; - // std::string temp_name = node->src[0]->name + std::string("_cpy_tmp"); + // inputs[src0_name] = node->src[0]; + // std::string temp_name = src0_name + std::string("_cpy_tmp"); // inputs[temp_name] = node; - // outputs[node->name] = node; - // m_input_names.push_back(node->src[0]->name); + // outputs[node_name] = node; + // m_input_names.push_back(src0_name); // m_input_names.push_back(temp_name); - // m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); + // m_node_op_name[src0_name] = ggml_op_name(node->op); // m_node_op_name[temp_name] = ggml_op_name(node->op); + // m_output_names.push_back(node_name); + // m_continuous = false; - // m_output_names.push_back(node->name); - - // ov::Shape flat_src0_shape = {80000}; + // ov::Shape flat_src0_shape = {node->src[0]->nb[2]}; // auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); // m_params.push_back(param_src0); - // std::cout << "decoder ADDR-0: " << param_src0.get() << std::endl; - - // ov::Shape flat_dst_shape = {200000, 1}; + // ov::Shape flat_dst_shape = {node->nb[2], 1}; // auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); // m_params.push_back(param_dst_base); - // std::cout << "decoder ADDR-1: " << param_dst_base.get() << std::endl; - - // m_continuous = false; - // break; + break; } } // For view, input is node itself case GGML_OP_VIEW: { - inputs[node->name] = node; - outputs[node->name] = node; - m_input_names.push_back(node->name); - m_node_op_name[node->name] = ggml_op_name(node->op); - m_output_names.push_back(node->name); + inputs[node_name] = node; + outputs[node_name] = node; + m_input_names.push_back(node_name); + m_node_op_name[node_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); break; } // SCALE case GGML_OP_SCALE: { - inputs[src0_name] = node->src[0]; + inputs[node_name] = node->src[0]; outputs[node_name] = node; m_input_names.push_back(node_name); - // m_node_op_name[node_name] = ggml_op_name(node->op); + m_node_op_name[node_name] = ggml_op_name(node->op); m_output_names.push_back(node_name); break; } case GGML_OP_MUL_MAT: { - std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); + std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { m_continuous = false; } else { @@ -198,7 +200,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_output_names.push_back(node_name); if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); + std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); inputs[src1_name] = node->src[1]; m_node_op_name[src1_name] = ggml_op_name(node->op); m_input_names.push_back(src1_name); @@ -208,20 +210,20 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); - inputs[node->src[0]->name] = node->src[0]; - inputs[node->src[1]->name] = node->src[1]; - m_input_names.push_back(node->src[0]->name); - m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); - m_input_names.push_back(node->src[1]->name); - m_node_op_name[node->src[1]->name] = ggml_op_name(node->op); - outputs[node->name] = node; - m_output_names.push_back(node->name); + std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + inputs[src0_name] = node->src[0]; + inputs[src1_name] = node->src[1]; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_input_names.push_back(src1_name); + m_node_op_name[src1_name] = ggml_op_name(node->op); + outputs[node_name] = node; + m_output_names.push_back(node_name); if (node->src[2]) { - std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs) + "_input_" + ggml_op_name(node->src[2]->op); - inputs[node->src[2]->name] = node->src[2]; - m_input_names.push_back(node->src[2]->name); - m_node_op_name[node->src[2]->name] = ggml_op_name(node->op); + std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); + inputs[src2_name] = node->src[2]; + m_input_names.push_back(src2_name); + m_node_op_name[src2_name] = ggml_op_name(node->op); } break; } @@ -358,6 +360,15 @@ std::vector GgmlOvDecoder::get_input_stride(const std::string& name) con return stride; } +std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { + std::vector stride; + ggml_tensor * node = m_outputs.at(name); + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + stride.push_back(static_cast(node->nb[i])); + } + return stride; +} + ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const { ov::element::Type type = ov::element::dynamic; switch (m_inputs.at(name)->type) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 0921fd8bb5828..98c418dd6adab 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -39,6 +39,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual ov::PartialShape get_output_shape(const std::string& name) const override; + virtual std::vector get_output_stride(const std::string& name) const override; + virtual ov::element::Type get_output_type(const std::string& name) const override; virtual int32_t* get_output_op_params(const std::string& name) const override; From 7abd6f0cb579d3bf937066354c15878178bca73d Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Sat, 1 Mar 2025 22:18:43 +0800 Subject: [PATCH 030/156] Execute CONT & VIEW operators in OV Frontend is OK --- ggml/src/ggml-openvino.cpp | 69 ++++++++++++++++++------- ggml/src/ggml-openvino/ggml-decoder.cpp | 53 +++++++++++-------- ggml/src/ggml-openvino/utils.cpp | 20 +++---- 3 files changed, 91 insertions(+), 51 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index e1c294a1d9f19..35f04f32c3586 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -482,6 +482,9 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { // flat shapes: ov::Shape flat_shape_src0 = { total_src0 }; ov::Shape flat_shape_src1 = { total_src1 }; + // Same as above + // ov::Shape flat_shape_src0 = { ggml_nelements(src0) }; + // ov::Shape flat_shape_src1 = { ggml_nelements(src1) }; // Create a Parameter node for collecting non-continuous data auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); @@ -526,9 +529,6 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { auto batched_matmul = std::make_shared(B, A, false, false); // batched_matmul output: shape = [32,7,32] - std::vector full_dst_shape = { dst->ne[2], dst->ne[1], dst->ne[0]}; - auto final_shape_const = ov::op::v0::Constant::create(ov::element::i64, { full_dst_shape.size() }, full_dst_shape); - auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1}); ov::Core core; @@ -541,7 +541,7 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { infer_request.set_input_tensor(0, tensor_src0); infer_request.set_input_tensor(1, tensor_src1); - ov::Tensor tensor_dst(ov::element::f32, ov::Shape(full_dst_shape.begin(), full_dst_shape.end()), dst->data); + ov::Tensor tensor_dst(ov::element::f32, { dst->ne[0], dst->ne[1], dst->ne[2]}, dst->data); infer_request.set_output_tensor(0, tensor_dst); infer_request.infer(); @@ -564,6 +564,9 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { ov::Shape flat_shape_src0 = { total_src0 }; ov::Shape flat_shape_src1 = { total_src1 }; + // Same as above + // ov::Shape flat_shape_src0 = { ggml_nelements(src0) }; + // ov::Shape flat_shape_src1 = { ggml_nelements(src1) }; auto param_flat_src0 = std::make_shared(ov::element::f16, flat_shape_src0); auto param_flat_src1 = std::make_shared(ov::element::f32, flat_shape_src1); @@ -602,6 +605,7 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { std::shared_ptr matmul = std::make_shared(reshape_src1, A_for_mul, false, false); auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1}); + // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/002_backend_mulmat_model.xml"); auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); @@ -618,8 +622,35 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) { } void ggml_backend_openvino_view(ggml_tensor *dst) { + ov::Core core; + ov::Shape tensor_shape{static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - GGML_UNUSED(dst); + // auto param = std::make_shared(ov::element::f32, tensor_shape); + auto param = std::make_shared(ov::element::f16, tensor_shape); + + auto reshaped = std::make_shared(param, + ov::op::v0::Constant::create(ov::element::i64, { tensor_shape.size() }, tensor_shape), + false); + + auto model = std::make_shared(ov::NodeVector{reshaped}, ov::ParameterVector{param}); + // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/003_backend_view_model.xml"); + + auto compiled_model = core.compile_model(model, "CPU"); + + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // ov::Tensor input_tensor(ov::element::f32, tensor_shape, dst->data); + ov::Tensor input_tensor(ov::element::f16, tensor_shape, dst->data); + // infer_request.set_tensor(param, input_tensor); + infer_request.set_input_tensor(0, input_tensor); + + // ov::Tensor output_tensor(ov::element::f32, tensor_shape, dst->data); + ov::Tensor output_tensor(ov::element::f16, tensor_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + infer_request.infer(); + // auto output_tensor = infer_request.get_output_tensor(0); + // dst->data = output_tensor.data(); } void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { @@ -992,31 +1023,33 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // openvino_frontend_compute(backend, cgraph); // Process nodes in order for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - ggml_backend_openvino_reshape(cgraph->nodes[i]); + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { // ggml_backend_openvino_cpy(cgraph->nodes[i]); } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { ggml_backend_openvino_transpose(cgraph->nodes[i]); - } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; - while (i < cgraph->n_nodes && - // std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && - // std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && - std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { + while (i < cgraph->n_nodes + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + ) { i++; } if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); + openvino_frontend_compute(backend, cgraph, start_index, --i); } } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 6a249c103f065..fab8d4aed6527 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -6,12 +6,20 @@ #include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { - m_node_op_name[node->name] = ggml_op_name(node->op); + // m_node_op_name[node->name] = ggml_op_name(node->op); + + // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_input_" + ggml_op_name(node->src[0]->op); + // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); + // Execute singel CONT operator is OK - std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_" + ggml_op_name(node->src[0]->op); - std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_" + ggml_op_name(node->op); + // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_" + ggml_op_name(node->src[0]->op); + // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_" + ggml_op_name(node->op); + // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs); // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs); + + std::string src0_name = std::string(node->src[0]->name); + std::string node_name = std::string(node->name); switch (node->op) { // Unary OPs case GGML_OP_UNARY: @@ -151,6 +159,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); inputs[node_name] = node; outputs[node_name] = node; m_input_names.push_back(node_name); @@ -161,21 +170,29 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; + inputs[src0_name] = node->src[0]; outputs[node_name] = node; - m_input_names.push_back(node_name); - m_node_op_name[node_name] = ggml_op_name(node->op); + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); m_output_names.push_back(node_name); break; } case GGML_OP_MUL_MAT: { - std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + ov::Shape flat_shape_src0 = { node->src[0]->ne[0]*node->src[0]->ne[1]*node->src[0]->ne[2] }; + ov::Shape flat_shape_src1 = { node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2] }; + auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); + auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + m_params.push_back(param_src0); + m_params.push_back(param_src1); if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { m_continuous = false; } else { m_continuous = true; } + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; inputs[src1_name] = node->src[1]; outputs[node_name] = node; @@ -200,7 +217,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_output_names.push_back(node_name); if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + std::string src1_name = std::string(node->src[1]->name); inputs[src1_name] = node->src[1]; m_node_op_name[src1_name] = ggml_op_name(node->op); m_input_names.push_back(src1_name); @@ -210,7 +228,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; inputs[src1_name] = node->src[1]; m_input_names.push_back(src0_name); @@ -220,7 +239,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[2]) { - std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); + // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); + std::string src2_name = std::string(node->src[2]->name); inputs[src2_name] = node->src[2]; m_input_names.push_back(src2_name); m_node_op_name[src2_name] = ggml_op_name(node->op); @@ -334,13 +354,6 @@ ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { ggml_tensor * node = m_inputs.at(name); std::vector shape; - // [TODO], 在这里判断如果是MUL_MAT就设置shape为一维 - if(m_node_op_name.at(name) == "MUL_MAT") { - shape.push_back(static_cast(node->ne[0] * node->ne[1] * node->ne[2])); - input_shape = ov::PartialShape(shape); - return input_shape; - } - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { if (node->ne[i] == 0) { return input_shape; @@ -405,10 +418,8 @@ std::vector GgmlOvDecoder::get_input_names() const { const std::string& GgmlOvDecoder::get_node_op_name(const std::string& name) const { auto it = m_node_op_name.find(name); - if (it != m_node_op_name.end()) { - return it->second; - } - return ""; + static const std::string empty_str; + return (it != m_node_op_name.end()) ? it->second : empty_str; } const std::vector>& GgmlOvDecoder::get_params() const { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 8fa1f99a01007..21edad596b4e0 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -26,18 +26,9 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), {80000}, input_data); - // } else if (node_op_name == "CONT" || node_op_name == "MUL_MAT") { - // // auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); - // // size_t total_size = 1; - // // for (auto dim : input_shape) { - // // total_size *= dim; - // // } - // // ov::Shape new_shape = {total_size}; - // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), {ggml_decoder->get_input_shape(name).to_shape()[0]}, input_data); - // } else { if (node_op_name == "CONT" && ggml_decoder->check_if_continuous()) { - ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * - ggml_decoder->get_input_shape(name).to_shape()[1] * + ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * + ggml_decoder->get_input_shape(name).to_shape()[1] * ggml_decoder->get_input_shape(name).to_shape()[2] }; input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); } else if ( node_op_name == "CONT" && @@ -59,6 +50,11 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), flat_input_shape, input_data); + } else if (node_op_name == "MUL_MAT") { + ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * + ggml_decoder->get_input_shape(name).to_shape()[1] * + ggml_decoder->get_input_shape(name).to_shape()[2] }; + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); } @@ -125,7 +121,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); - ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); + // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); if (!model) { GGML_LOG_ERROR("Model is not converted \n"); From 3928535afe911d5ab1312a92ede96adc6beb8a12 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 4 Mar 2025 00:05:00 +0800 Subject: [PATCH 031/156] OV Frontend supports GET_ROWS/RMS_NORM/MUL/MUL_MAT graph conversion of consecutive OPs --- ggml/src/ggml-openvino.cpp | 64 +++++++++++++------------ ggml/src/ggml-openvino/ggml-decoder.cpp | 46 +++++++++++++----- ggml/src/ggml-openvino/utils.cpp | 11 ++++- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 35f04f32c3586..883e43365fe85 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1020,39 +1020,41 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } + int end_node = cgraph->n_nodes - 1; + openvino_frontend_compute(backend, cgraph, 0, end_node); // openvino_frontend_compute(backend, cgraph); // Process nodes in order - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); - } - } - } + // for (int i = 0; i < cgraph->n_nodes; i++) { + // if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // ggml_backend_openvino_permute(cgraph->nodes[i]); + // // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // // ggml_backend_openvino_transpose(cgraph->nodes[i]); + // // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + // // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i); + // } + // } + // } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index fab8d4aed6527..90755ec9a6a84 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -20,6 +20,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name); std::string node_name = std::string(node->name); + switch (node->op) { // Unary OPs case GGML_OP_UNARY: @@ -110,7 +111,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne, node->src[0]->ne + 4); + ov::Shape src_shape(node->src[0]->ne, node->src[0]->ne + 3); auto input_param = std::make_shared(ov::element::f32, src_shape); m_params.push_back(input_param); break; @@ -217,6 +218,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_output_names.push_back(node_name); if (node->src[1]) { + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src1_name] = node->src[1]; @@ -228,6 +230,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; @@ -239,6 +242,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[2]) { + // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs) + "_input_" + ggml_op_name(node->src[2]->op); // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); std::string src2_name = std::string(node->src[2]->name); inputs[src2_name] = node->src[2]; @@ -253,7 +257,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapn_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; @@ -269,9 +280,14 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << ", " << std::setw(5) << node->ne[2] << "] " - << std::left << std::setw(16) << ggml_op_name(node->op) << std::right << " " - << " " << node->name - << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ") << "\n"; + << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " " + << std::left << std::setw(44) << node->name << std::right + << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ") + << std::setw(2) << "[ " + << std::setw(0) << node->nb[0] << ", " + << std::setw(5) << node->nb[1] << ", " + << std::setw(5) << node->nb[2] << "] " + << "\n"; if (node->src[0]) { file << std::setw(10) << " [ " @@ -279,15 +295,19 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[0]->ne[1] << ", " << std::setw(5) << node->src[0]->ne[2] << "] " << std::setw(12) - << "0: " << ggml_op_name(node->src[0]->op) << " "; + << "0: " << std::left << std::setw(12) << ggml_op_name(node->src[0]->op) << std::right; // // Custom logic to handle '\000' // const char* name_ptr = node->src[0]->name; // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { // file << *name_ptr; // name_ptr++; // } - file << node->src[0]->name; - file << "\n"; + file << std::left << std::setw(30) << node->src[0]->name << std::right + << std::setw(16) << "[ " + << std::setw(0) << node->src[0]->nb[0] << ", " + << std::setw(5) << node->src[0]->nb[1] << ", " + << std::setw(5) << node->src[0]->nb[2] << "] " + << "\n"; } if (node->src[1]) { file << std::setw(10) << " [ " @@ -295,15 +315,19 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[1]->ne[1] << ", " << std::setw(5) << node->src[1]->ne[2] << "] " << std::setw(12) - << "1: " << ggml_op_name(node->src[1]->op) << " "; + << "1: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; // // Custom logic to handle '\000' // const char* name_ptr = node->src[1]->name; // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { // file << *name_ptr; // name_ptr++; // } - file << node->src[1]->name; - file << "\n"; + file << std::left << std::setw(30) << node->src[1]->name << std::right + << std::setw(16) << "[ " + << std::setw(0) << node->src[1]->nb[0] << ", " + << std::setw(5) << node->src[1]->nb[1] << ", " + << std::setw(5) << node->src[1]->nb[2] << "] " + << "\n"; } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 21edad596b4e0..4b25c136896fd 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -121,7 +121,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); - // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); + ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); if (!model) { GGML_LOG_ERROR("Model is not converted \n"); @@ -145,6 +145,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + + // auto input_tensor = infer_request.get_input_tensor(i); + // auto input_shape = input_tensor.get_shape(); + // std::cout << "Input tensor " << i << " shape: "; + // for (const auto& dim : input_shape) { + // std::cout << dim << " "; + // } + // std::cout << std::endl; } infer_request.infer(); @@ -155,6 +163,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c for (size_t i = 0; i < output_names.size(); i++) { // std::string op_name = ggml_decoder->get_node_op_name(output_names[i]); auto output_tensor = infer_request.get_output_tensor(i); + // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); From 54146087b1bf419a04da973237561ff15bb44cd0 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 5 Mar 2025 18:50:18 +0800 Subject: [PATCH 032/156] OV Frontend supports GET_ROWS/RMS_NORM/MUL/MUL_MAT/ROPE/SCALE/SOFTMAX/ADD adjacent op graph conversion --- ggml/src/ggml-openvino.cpp | 1 - ggml/src/ggml-openvino/decoder.h | 2 ++ ggml/src/ggml-openvino/ggml-decoder.cpp | 38 +++++++++++++++++++++++-- ggml/src/ggml-openvino/ggml-decoder.h | 3 +- ggml/src/ggml-openvino/utils.cpp | 17 ++++++----- 5 files changed, 49 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 883e43365fe85..8cc4de05b1db8 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1279,7 +1279,6 @@ static const std::set& openvino_ops = []() -> const std::setop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; } @@ -43,6 +44,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); m_continuous = true; @@ -67,13 +69,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); const size_t element_size = ggml_type_size(node->src[0]->type); size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 - size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 ov::Shape flat_input_shape = { total_phys }; auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); m_params.push_back(flat_input_param); @@ -87,6 +91,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 @@ -108,6 +113,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); m_continuous = true; @@ -130,6 +136,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); m_continuous = false; break; @@ -161,10 +168,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); + // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs); inputs[node_name] = node; outputs[node_name] = node; m_input_names.push_back(node_name); m_node_op_name[node_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(node_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; } @@ -175,6 +184,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; } @@ -199,8 +209,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); m_node_op_name[src1_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; } @@ -216,6 +228,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); if (node->src[1]) { // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); @@ -223,6 +236,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name); inputs[src1_name] = node->src[1]; m_node_op_name[src1_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); } break; @@ -237,8 +251,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]; m_input_names.push_back(src0_name); m_node_op_name[src0_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); m_node_op_name[src1_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); outputs[node_name] = node; m_output_names.push_back(node_name); if (node->src[2]) { @@ -248,6 +264,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[2]; m_input_names.push_back(src2_name); m_node_op_name[src2_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); } break; } @@ -359,8 +376,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr if (m_node) { set_input_output(m_node, m_inputs, m_outputs); } else { - // for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { - for (int node_n = start_index; node_n <= end_index; node_n++) { + for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + // for (int node_n = start_index; node_n <= end_index; node_n++) { auto cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); // Init model input and output @@ -446,6 +463,21 @@ const std::string& GgmlOvDecoder::get_node_op_name(const std::string& name) cons return (it != m_node_op_name.end()) ? it->second : empty_str; } +std::string& GgmlOvDecoder::get_op_node_name(const std::string& key_name, const int index) { + if (index == -1) { + for (size_t i = 0; i < m_op_node_name.size(); ++i) { + if (m_op_node_name[i].first == key_name) { + return m_op_node_name[i].second; + } + } + } else { + return m_op_node_name[index].second; + } + + static std::string empty_string = ""; + return empty_string; // empty string +} + const std::vector>& GgmlOvDecoder::get_params() const { return m_params; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 98c418dd6adab..238f1d79b4257 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -72,7 +72,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { } virtual const std::string& get_node_op_name(const std::string& name) const override; - // virtual const std::string& get_node_op_info(const std::string& name) const override; + std::string& get_op_node_name(const std::string& key_name, const int index) override; virtual const std::vector>& get_params() const override; @@ -92,5 +92,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { bool m_continuous; std::map m_node_op_name; std::vector> m_params; + std::vector> m_op_node_name; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 4b25c136896fd..8f27bbc97db8f 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -14,9 +14,11 @@ std::map get_ggml_graph_input_tensors(std::shared_ptr input_tensors; auto input_names = ggml_decoder->get_input_names(); // auto node_name = ggml_decoder->get_op_name(); + size_t iter = 0; for (size_t inp = 0; inp < input_names.size(); ++inp) { auto name = input_names[inp]; - auto node_op_name = ggml_decoder->get_node_op_name(name); + std::string op_node_name = ggml_decoder->get_op_node_name(name, iter++); + // auto node_op_name = ggml_decoder->get_node_op_name(name); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); @@ -26,12 +28,12 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), {80000}, input_data); - if (node_op_name == "CONT" && ggml_decoder->check_if_continuous()) { + if (op_node_name == "CONT" && ggml_decoder->check_if_continuous()) { ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * ggml_decoder->get_input_shape(name).to_shape()[1] * ggml_decoder->get_input_shape(name).to_shape()[2] }; input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); - } else if ( node_op_name == "CONT" && + } else if ( op_node_name == "CONT" && !ggml_decoder->check_if_continuous() && input_shape[0] == 1) { size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 3072 @@ -40,17 +42,18 @@ std::map get_ggml_graph_input_tensors(std::shared_ptr strides = ggml_decoder->get_input_stride(name); size_t phys_stride = static_cast(strides[1]) / element_size; - size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; + // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; + size_t total_phys = num_rows* phys_stride; ov::Shape flat_input_shape = { total_phys }; input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); - } else if (node_op_name == "CONT") { + } else if (op_node_name == "CONT") { size_t valid_i = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 96 size_t valid_j = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 32 size_t valid_k = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); // 7 size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 ov::Shape flat_input_shape = { total_valid }; input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); - } else if (node_op_name == "MUL_MAT") { + } else if (op_node_name == "MUL_MAT") { ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * ggml_decoder->get_input_shape(name).to_shape()[1] * ggml_decoder->get_input_shape(name).to_shape()[2] }; @@ -144,7 +147,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { - infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + infer_request.set_input_tensor(i, input_tensors[input_names[i]]); // auto input_tensor = infer_request.get_input_tensor(i); // auto input_shape = input_tensor.get_shape(); From c8692b8a8aac3aa8079dae7624226f373190e45f Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 5 Mar 2025 23:07:22 +0800 Subject: [PATCH 033/156] Change the input parameter shape of CONT operator --- ggml/src/ggml-openvino.cpp | 234 +++++++++++++++---------------------- 1 file changed, 92 insertions(+), 142 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 8cc4de05b1db8..034bd698c3d2f 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -665,44 +665,46 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Case 1: Both tensors are contiguous if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { - ov::Shape flat_shape = { static_cast(ggml_nelements(dst)) }; + ov::Shape input_shape = { + static_cast(src0->ne[0]), + static_cast(src0->ne[1]), + static_cast(src0->ne[2]), + static_cast(src0->ne[3]) + }; + size_t num_elements = 1; + for (auto d : input_shape) { + num_elements *= d; + } + ov::Shape flat_shape = { num_elements }; - // Construct the logical shape of the target tensor ov::Shape dst_shape = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0]) }; - // --- Construct the OpenVINO computation graph --- - // 1. Define input parameter, type f32, shape flat_shape: [8192] - auto input_param = std::make_shared(ov::element::f32, flat_shape); + auto input_param = std::make_shared(ov::element::f32, input_shape); - // 2. Create a Constant node to represent the new shape of the target Reshape(dst_shape) - // Note: dst_shape needs to be converted to an int64_t array - std::vector dst_shape_vec(dst_shape.begin(), dst_shape.end()); - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, { dst_shape_vec.size() }, dst_shape_vec); + std::vector flat_shape_vec(flat_shape.begin(), flat_shape.end()); + auto flat_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { flat_shape_vec.size() }, flat_shape_vec); + auto flat_reshape = std::make_shared(input_param, flat_reshape_const, false); - // 3. Use the Reshape operator to reshape the input tensor to the target shape(dst_shape) - auto reshape_op = std::make_shared(input_param, reshape_const, false); + std::vector dst_shape_vec(dst_shape.begin(), dst_shape.end()); + auto dst_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { dst_shape_vec.size() }, dst_shape_vec); + auto final_reshape = std::make_shared(flat_reshape, dst_reshape_const, false); - // 4. Construct the model, whose output is the result of reshape_op - auto model = std::make_shared(ov::OutputVector{ reshape_op }, ov::ParameterVector{ input_param }); + auto model = std::make_shared(ov::OutputVector{ final_reshape }, ov::ParameterVector{ input_param }); - // --- Compile and execute --- ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - // Construct input Tensor: directly wrap src0->data, shape is flat_shape[8192] - ov::Tensor input_tensor(ov::element::f32, flat_shape, src0->data); + ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); infer_request.set_input_tensor(0, input_tensor); - // Construct output Tensor: dst->data, shape is dst_shape: [1,1,8192] ov::Tensor output_tensor(ov::element::f32, dst_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); - // Execute inference, the computation graph flattens the data of src0 and reshapes it to the shape of dst->ne, and writes it directly to dst->data infer_request.infer(); return; } @@ -715,69 +717,42 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t nb0 = dst->nb[0]; if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { - // Assume that the data type is f32 and each element is 4 bytes - // Logically, the number of valid elements per row is 3072 (src0->ne[0]), and the number of rows is 7 (src0->ne[1]) - size_t valid_elems = static_cast(src0->ne[0]); // 3072 - size_t num_rows = static_cast(src0->ne[1]); // 7 - - // Number of floats physically stored per row = nb[1] / element_size = 36864/4 = 9216 - size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216 - - // Total number of physical elements = (num_rows - 1)*phys_stride + valid_elems - size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 - // size_t total_phys = num_rows * phys_stride; - - // 1. Wrap src0->data into a 1D tensor with shape [58368] - ov::Shape flat_input_shape = { total_phys }; - auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); - - // 2. Construct index tensor idx with shape [3072,7] - // For each logical position (i,j) (i in [0,3072), j in [0,7)), calculate index = j*phys_stride + i. - std::vector indices; - indices.reserve(valid_elems * num_rows); + const size_t valid_elems = static_cast(src0->ne[0]); + const size_t num_rows = static_cast(src0->ne[1]); + const size_t dim2 = static_cast(src0->ne[2]); + const size_t dim3 = static_cast(src0->ne[3]); + + size_t phys_stride = static_cast(src0->nb[1]) / element_size; + size_t total_logical = valid_elems * num_rows * dim2 * dim3; + + std::vector contiguous_data(total_logical); + for (size_t j = 0; j < num_rows; j++) { - for (size_t i = 0; i < valid_elems; i++) { - indices.push_back(static_cast(j * phys_stride + i)); - } + const float *src_row = reinterpret_cast(src0->data) + j * phys_stride; + float *dst_row = contiguous_data.data() + j * valid_elems; + std::copy(src_row, src_row + valid_elems, dst_row); } - ov::Shape indices_shape = { valid_elems, num_rows }; // [3072,7] - auto indices_const = ov::op::v0::Constant::create(ov::element::i64, indices_shape, indices); - - // 3. Use the Gather operator (axis=0) to collect valid data - // Note: The third parameter is axis, and a value of 0 means collecting data from the 1D input according to the index - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto gathered = std::make_shared(flat_input_param, indices_const, axis_const); - // The shape of gathered should be [3072,7] - // 4. Reshape gathered into a 4D tensor [3072,7,1,1] - auto reshape_const = ov::op::v0::Constant::create( - ov::element::i64, {4}, std::vector{ static_cast(valid_elems), static_cast(num_rows), 1, 1 } - ); - auto reshaped = std::make_shared(gathered, reshape_const, false); - // The reshaped shape is [3072,7,1,1] + ov::Shape logical_shape = { valid_elems, num_rows, dim2, dim3 }; + auto input_param = std::make_shared(ov::element::f32, logical_shape); + auto identity_const = ov::op::v0::Constant::create(ov::element::i64, + { logical_shape.size() }, + std::vector(logical_shape.begin(), logical_shape.end())); + auto identity_op = std::make_shared(input_param, identity_const, false); - // 5. Construct the model and output it as reshaped - auto model = std::make_shared(ov::OutputVector{reshaped}, ov::ParameterVector{flat_input_param}); + auto model = std::make_shared(ov::OutputVector{identity_op}, + ov::ParameterVector{input_param}); - // --- Compile and execute --- ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - // Construct input Tensor: directly wrap src0->data, shape is flat_input_shape = [58368] - ov::Tensor input_tensor(ov::element::f32, flat_input_shape, src0->data); + ov::Tensor input_tensor(ov::element::f32, logical_shape, contiguous_data.data()); infer_request.set_input_tensor(0, input_tensor); - // Construct output Tensor: dst is continuous storage, and its logical shape is [3072,7,1,1] - ov::Shape output_shape = { static_cast(dst->ne[0]), - static_cast(dst->ne[1]), - static_cast(dst->ne[2]), - static_cast(dst->ne[3])}; - ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); + ov::Tensor output_tensor(ov::element::f32, logical_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); - // Execute inference. The computation graph uses Gather to collect the first 3072 valid elements of each row of src0, - // and reshape them to [3072,7,1,1] and write them directly to dst->data infer_request.infer(); /* for (size_t i01 = 0; i01 < ne01; ++i01) { @@ -804,74 +779,48 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { size_t valid_i = static_cast(src0->ne[0]); // 96 size_t valid_j = static_cast(src0->ne[1]); // 32 size_t valid_k = static_cast(src0->ne[2]); // 7 + size_t valid_l = static_cast(src0->ne[3]); // 1 - // Output the logical shape of dst: dst->ne = [3072, 7, 1, 1] - // 3072 = 32 * 96, 7 is consistent with src0->ne[2] size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 + size_t stride_j = static_cast(src0->nb[1]) / element_size; // 672 + size_t stride_k = static_cast(src0->nb[2]) / element_size; // 96 - // Physics step length: - size_t stride_j = static_cast(src0->nb[1]) / ggml_type_size(src0->type); // 2688/4 = 672 - size_t stride_k = static_cast(src0->nb[2]) / ggml_type_size(src0->type); // 384/4 = 96 - - // Construct index array, output order: for k in [0,6], for j in [0,31], for i in [0,95]: - // desired input index = j * stride_j + k * stride_k + i - std::vector indices; - indices.reserve(total_valid); + std::vector contiguous_data(total_valid); + const float *src_data = reinterpret_cast(src0->data); for (size_t k = 0; k < valid_k; k++) { for (size_t j = 0; j < valid_j; j++) { for (size_t i = 0; i < valid_i; i++) { - int64_t idx = static_cast(j * stride_j + k * stride_k + i); - indices.push_back(idx); + size_t out_index = k * (valid_i * valid_j) + j * valid_i + i; + size_t src_index = j * stride_j + k * stride_k + i; + contiguous_data[out_index] = src_data[src_index]; } } } - // The size of indices should be 21504 - - // 1. Construct input: treat src0->data as a 1D tensor. The valid range is 0~21503. - ov::Shape flat_input_shape = { total_valid }; - auto input_param = std::make_shared(ov::element::f32, flat_input_shape); - // 2. Construct index constant: 1D tensor, shape [21504] - ov::Shape indices_shape = { total_valid }; - auto indices_const = ov::op::v0::Constant::create(ov::element::i64, indices_shape, indices); - - // 3. Set axis=0 (collect data from 1D input) - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + ov::Shape input_shape = { dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2] }; + auto input_param = std::make_shared(ov::element::f32, input_shape); - // 4. Use the Gather operator (OpenVINO v8 Gather is used here) to collect valid data - auto gathered = std::make_shared(input_param, indices_const, axis_const); - // gathered has a shape of [21504] + ov::Shape target_shape = { dst->ne[0], dst->ne[1], dst->ne[2] }; + std::vector target_shape_vec = { static_cast(dst->ne[0]), + static_cast(dst->ne[1]), dst->ne[2]}; + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, target_shape_vec); + auto reshaped = std::make_shared(input_param, reshape_const, false); - // 5. Reshape gathered to [3072,7,1,1], because 3072*7 = 21504 - ov::Shape target_shape = { static_cast(dst->ne[0]), - static_cast(dst->ne[1]), - static_cast(dst->ne[2]), - static_cast(dst->ne[3])}; // [3072,7,1,1] - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {4}, - std::vector{ static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }); - auto reshaped = std::make_shared(gathered, reshape_const, false); - - // 6. Construct model auto model = std::make_shared(ov::OutputVector{reshaped}, ov::ParameterVector{input_param}); - // --- Compile and execute --- ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - // Construct input Tensor: directly wrap src0->data. Note: src0->data is regarded as a one-dimensional array according to the physical valid area, flat_input_shape: [21504] - ov::Tensor input_tensor(ov::element::f32, flat_input_shape, src0->data); + ov::Tensor input_tensor(ov::element::f32, input_shape, contiguous_data.data()); infer_request.set_input_tensor(0, input_tensor); - // Construct output Tensor: dst->data is stored continuously, with shape target_shape: [3072,7,1,1] ov::Tensor output_tensor(ov::element::f32, target_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); - // Execute reasoning: The computation graph uses Gather+Reshape to collect each valid element of src0 in a predetermined order and write it directly to dst->data infer_request.infer(); return; } - std::cout << "Duplication of bytes completed successfully." << std::endl; } static void ggml_backend_openvino_transpose(ggml_tensor *dst) { @@ -1021,40 +970,40 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); + // openvino_frontend_compute(backend, cgraph, 0, end_node); // openvino_frontend_compute(backend, cgraph); // Process nodes in order - // for (int i = 0; i < cgraph->n_nodes; i++) { - // if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // ggml_backend_openvino_permute(cgraph->nodes[i]); - // // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // // ggml_backend_openvino_transpose(cgraph->nodes[i]); - // // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else { - // // Process a range of nodes with openvino_frontend_compute - // int start_index = i; - // while (i < cgraph->n_nodes - // // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - // // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // ) { - // i++; - // } - // if (start_index < i) { - // openvino_frontend_compute(backend, cgraph, start_index, --i); - // } - // } - // } + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); + } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i); + } + } + } return GGML_STATUS_SUCCESS; @@ -1522,3 +1471,4 @@ GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { return ® } + From ff53b14352baceb8fc5b68a1f087780bac5570c8 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 6 Mar 2025 01:38:01 +0800 Subject: [PATCH 034/156] Change the input and ouput node shape of MUL_MAT operator --- ggml/src/ggml-openvino.cpp | 201 ++++++++++++++++++++----------------- 1 file changed, 111 insertions(+), 90 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 034bd698c3d2f..afd616a33840d 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -458,68 +458,72 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { const ggml_tensor * src1 = dst->src[1]; // src1 type F32 if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) { - int valid_cols_src0 = dst->src[0]->ne[0]; - int num_rows_src0 = dst->src[0]->ne[1]; - int batch_src0 = dst->src[0]->ne[2]; - int valid_cols_src1 = dst->src[1]->ne[0]; - int num_rows_src1 = dst->src[1]->ne[1]; - int batch_src1 = dst->src[1]->ne[2]; - int row_stride_src0 = dst->src[0]->nb[1] / dst->src[0]->nb[0]; - int batch_stride_src0 = dst->src[0]->nb[2] / dst->src[0]->nb[0]; - - int row_stride_src1 = dst->src[1]->nb[1] / dst->src[1]->nb[0]; - int batch_stride_src1 = dst->src[1]->nb[2] / dst->src[1]->nb[0]; + int valid_cols_src0 = src0->ne[0]; // 96 + int num_rows_src0 = src0->ne[1]; // 32 + int batch_src0 = src0->ne[2]; // 32 + + int valid_cols_src1 = src1->ne[0]; // 96 + int num_rows_src1 = src1->ne[1]; // 7 + int batch_src1 = src1->ne[2]; // 32 + + // 对 src0:row_stride = nb[1] / nb[0] + int row_stride_src0 = src0->nb[1] / src0->nb[0]; // 6144 / 2 = 3072 + int batch_stride_src0 = src0->nb[2] / src0->nb[0]; // 192 / 2 = 96 + + // 对 src1:row_stride = nb[1] / nb[0] + int row_stride_src1 = src1->nb[1] / src1->nb[0]; // 12288 / 4 = 3072 + int batch_stride_src1 = src1->nb[2] / src1->nb[0]; // 384 / 4 = 96 std::vector indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0, row_stride_src0, batch_stride_src0); std::vector indices_src1 = build_indices(valid_cols_src1, num_rows_src1, batch_src1, row_stride_src1, batch_stride_src1); - // Total number of elements size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32 size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32 - // Treat src0->data and src1->data as 1D tensors - // Note: The total length of physical data should be enough to cover the last valid element index + 1. - // flat shapes: + ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), + static_cast(src0->ne[1]), + static_cast(src0->ne[2]), + static_cast(src0->ne[3]) }; + ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), + static_cast(src1->ne[1]), + static_cast(src1->ne[2]), + static_cast(src1->ne[3]) }; + + auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); + auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); + ov::Shape flat_shape_src0 = { total_src0 }; ov::Shape flat_shape_src1 = { total_src1 }; - // Same as above - // ov::Shape flat_shape_src0 = { ggml_nelements(src0) }; - // ov::Shape flat_shape_src1 = { ggml_nelements(src1) }; - // Create a Parameter node for collecting non-continuous data - auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + auto flatten_src0 = std::make_shared( + param_src0, + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src0) }), + false); + auto flatten_src1 = std::make_shared( + param_src1, + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src1) }), + false); - // Create an index Constant node auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0); auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1); - - // Use the Gather operator to collect valid data - // axis = 0 auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto gathered_src0 = std::make_shared(param_src0, indices_const_src0, axis_const); - auto gathered_src1 = std::make_shared(param_src1, indices_const_src1, axis_const); - // Reshape to batched form: - // For src0: valid matrix size for each batch [num_rows_src0, valid_cols_src0] = [32,96], total batches = 32, - // Therefore, reshape to 3D Tensor: shape = [32, 32, 96] where first dimension is batch. + auto gathered_src0 = std::make_shared(flatten_src0, indices_const_src0, axis_const); + auto gathered_src1 = std::make_shared(flatten_src1, indices_const_src1, axis_const); + std::vector shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 }; auto reshape_src0 = std::make_shared( gathered_src0, ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont), false); - // For src1: valid matrix size for each batch [num_rows_src1, valid_cols_src1] = [7,96], batch = 32, - // Reshape to 3D Tensor: shape = [32, 7, 96]. + std::vector shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 }; auto reshape_src1 = std::make_shared( gathered_src1, ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont), false); - // For src0, first Convert from F16 to F32 auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - - // Use Batched Transpose: swap the last two dimensions, dimension order [0, 2, 1] auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); auto src0_transposed = std::make_shared(src0_f32, transpose_order); @@ -527,89 +531,105 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { auto B = reshape_src1; auto batched_matmul = std::make_shared(B, A, false, false); - // batched_matmul output: shape = [32,7,32] + auto model = std::make_shared(ov::NodeVector{ batched_matmul }, + ov::ParameterVector{ param_src0, param_src1 }); - auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1}); + ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data }; + ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data }; + ov::Shape output_shape = { static_cast(dst->ne[0]), + static_cast(dst->ne[1]), + static_cast(dst->ne[2]) }; + ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - - // Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively - ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data); - ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data); infer_request.set_input_tensor(0, tensor_src0); infer_request.set_input_tensor(1, tensor_src1); - - ov::Tensor tensor_dst(ov::element::f32, { dst->ne[0], dst->ne[1], dst->ne[2]}, dst->data); infer_request.set_output_tensor(0, tensor_dst); - infer_request.infer(); return ; } - // Valid shape + int rank = 0; + if (dst->ne[2] == 1 && dst->ne[3] == 1) { + rank = 2; + } else if (dst->ne[3] == 1) { + rank = 3; + } else { + throw std::runtime_error("Only rank 2 or rank 3 are supported in this implementation."); + } + std::vector eff_shape_src0 = get_effective_shape(src0); std::vector eff_shape_src1 = get_effective_shape(src1); std::vector eff_shape_dst = get_effective_shape(dst); - // Determine whether it is batched (effective rank==3) or two-dimensional (rank==2) or one-dimensional (rank==1) - int rank = static_cast(eff_shape_dst.size()); - if (rank != 1 && rank != 2 && rank != 3) - throw std::runtime_error("Only rank 1, 2 or 3 supported"); - - // Total number of flattened elements - size_t total_src0 = 1; for (auto d : eff_shape_src0) total_src0 *= d; - size_t total_src1 = 1; for (auto d : eff_shape_src1) total_src1 *= d; + ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), + static_cast(src0->ne[1]), + static_cast(src0->ne[2]), + static_cast(src0->ne[3]) }; + ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), + static_cast(src1->ne[1]), + static_cast(src1->ne[2]), + static_cast(src1->ne[3]) }; - ov::Shape flat_shape_src0 = { total_src0 }; - ov::Shape flat_shape_src1 = { total_src1 }; - // Same as above - // ov::Shape flat_shape_src0 = { ggml_nelements(src0) }; - // ov::Shape flat_shape_src1 = { ggml_nelements(src1) }; - - auto param_flat_src0 = std::make_shared(ov::element::f16, flat_shape_src0); - auto param_flat_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); + auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); auto reshape_src0 = std::make_shared( - param_flat_src0, + param_src0, ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0), false); auto reshape_src1 = std::make_shared( - param_flat_src1, + param_src1, ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1), false); - // Convert src0: F16 -> F32 auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - // Transpose src0_f32: - // For the 2D case, the shape of reshape_src0 is [3072,9216], and after transposition, it is [9216,3072]. - // For the batched case, assuming the shape is [M, K, Batch], batch-wise transposition is required: use order [0, 2, 1]. ov::Output A_for_mul; - if (rank == 1) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{1, 0}); - A_for_mul = std::make_shared(src0_f32, trans_order); - } else if (rank == 2) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{1, 0}); + if (rank == 2) { + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 2 }, std::vector{1, 0}); A_for_mul = std::make_shared(src0_f32, trans_order); - } else { // rank == 3 - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); + } else if (rank == 3) { + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 3 }, std::vector{0, 2, 1}); A_for_mul = std::make_shared(src0_f32, trans_order); + } else { + A_for_mul = src0_f32; } - ov::Core core; - ov::Tensor tensor_src0{ov::element::f16, flat_shape_src0, (void *)src0->data}; - ov::Tensor tensor_src1{ov::element::f32, flat_shape_src1, (void *)src1->data}; - ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data); + auto matmul = std::make_shared(reshape_src1, A_for_mul, false, false); + + auto matmul_output_shape = matmul->get_output_shape(0); + std::vector final_output_shape; + if (matmul_output_shape.size() == 1) { + final_output_shape = { 1, 1, static_cast(matmul_output_shape[0]) }; + } else if (matmul_output_shape.size() == 2) { + final_output_shape = { 1, static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]) }; + } else { + final_output_shape = { static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]), static_cast(matmul_output_shape[2]) }; + } + + auto reshape_output = std::make_shared( + matmul, + ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), + false + ); + + auto model = std::make_shared(ov::NodeVector{ reshape_output }, + ov::ParameterVector{ param_src0, param_src1 }); - std::shared_ptr matmul = std::make_shared(reshape_src1, A_for_mul, false, false); - auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1}); - // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/002_backend_mulmat_model.xml"); + ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, (void *)src0->data }; + ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, (void *)src1->data }; + ov::Shape output_shape = { static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0]) }; + ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); + + ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src0); infer_request.set_input_tensor(1, tensor_src1); infer_request.set_output_tensor(0, tensor_dst); @@ -980,22 +1000,22 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + ggml_backend_openvino_transpose(cgraph->nodes[i]); } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + ggml_backend_openvino_mul_mat(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() ) { i++; } @@ -1228,6 +1248,7 @@ static const std::set& openvino_ops = []() -> const std::set Date: Thu, 6 Mar 2025 01:49:14 +0800 Subject: [PATCH 035/156] Change the input and ouput node shape of MUL_MAT operator --- ggml/src/ggml-openvino.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index afd616a33840d..c45f778e804fb 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -531,14 +531,25 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { auto B = reshape_src1; auto batched_matmul = std::make_shared(B, A, false, false); - auto model = std::make_shared(ov::NodeVector{ batched_matmul }, + + std::vector final_output_shape = {static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0])}; + + auto reshape_output = std::make_shared( + batched_matmul, + ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), + false + ); + + auto model = std::make_shared(ov::NodeVector{ reshape_output }, ov::ParameterVector{ param_src0, param_src1 }); ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data }; ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data }; - ov::Shape output_shape = { static_cast(dst->ne[0]), + ov::Shape output_shape = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), - static_cast(dst->ne[2]) }; + static_cast(dst->ne[0]) }; ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); ov::Core core; From 4d78d8036d4456de9f6abc96a4b06c545b827afd Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 6 Mar 2025 10:22:20 +0800 Subject: [PATCH 036/156] change CONT and MULMAT input node shape --- ggml/src/ggml-openvino.cpp | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index c45f778e804fb..109003d686e63 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -482,12 +482,10 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), static_cast(src0->ne[1]), - static_cast(src0->ne[2]), - static_cast(src0->ne[3]) }; + static_cast(src0->ne[2])}; ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), static_cast(src1->ne[1]), - static_cast(src1->ne[2]), - static_cast(src1->ne[3]) }; + static_cast(src1->ne[2])}; auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); @@ -577,13 +575,10 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), static_cast(src0->ne[1]), - static_cast(src0->ne[2]), - static_cast(src0->ne[3]) }; + static_cast(src0->ne[2])}; ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), static_cast(src1->ne[1]), - static_cast(src1->ne[2]), - static_cast(src1->ne[3]) }; - + static_cast(src1->ne[2])}; auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); @@ -697,10 +692,9 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Case 1: Both tensors are contiguous if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { ov::Shape input_shape = { - static_cast(src0->ne[0]), - static_cast(src0->ne[1]), static_cast(src0->ne[2]), - static_cast(src0->ne[3]) + static_cast(src0->ne[1]), + static_cast(src0->ne[0]) }; size_t num_elements = 1; for (auto d : input_shape) { @@ -764,7 +758,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { std::copy(src_row, src_row + valid_elems, dst_row); } - ov::Shape logical_shape = { valid_elems, num_rows, dim2, dim3 }; + ov::Shape logical_shape = { dim2, num_rows, valid_elems}; auto input_param = std::make_shared(ov::element::f32, logical_shape); auto identity_const = ov::op::v0::Constant::create(ov::element::i64, { logical_shape.size() }, @@ -828,12 +822,16 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { } } - ov::Shape input_shape = { dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2] }; + // ov::Shape input_shape = { dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2] }; + ov::Shape input_shape = { dst->src[0]->ne[2], dst->src[0]->ne[1], dst->src[0]->ne[0]}; auto input_param = std::make_shared(ov::element::f32, input_shape); - ov::Shape target_shape = { dst->ne[0], dst->ne[1], dst->ne[2] }; - std::vector target_shape_vec = { static_cast(dst->ne[0]), - static_cast(dst->ne[1]), dst->ne[2]}; + // ov::Shape target_shape = { dst->ne[0], dst->ne[1], dst->ne[2] }; + // std::vector target_shape_vec = { static_cast(dst->ne[0]), + // static_cast(dst->ne[1]), dst->ne[2]}; + ov::Shape target_shape = { dst->ne[2], dst->ne[1], dst->ne[0] }; + std::vector target_shape_vec = { static_cast(dst->ne[2]), + static_cast(dst->ne[1]), dst->ne[0]}; auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, target_shape_vec); auto reshaped = std::make_shared(input_param, reshape_const, false); From 5b72d6c7ad874468058a89699256dc2f8156704f Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 6 Mar 2025 13:51:34 +0800 Subject: [PATCH 037/156] All adjacent ops can conversion but calculation result is wrong and need debugging --- ggml/src/ggml-openvino.cpp | 87 ++++++++++++------------- ggml/src/ggml-openvino/ggml-decoder.cpp | 50 +++++++------- ggml/src/ggml-openvino/utils.cpp | 74 +++++++++++++-------- 3 files changed, 114 insertions(+), 97 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 109003d686e63..230edded11095 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -480,12 +480,12 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32 size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32 - ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), + ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), static_cast(src0->ne[1]), - static_cast(src0->ne[2])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), + static_cast(src0->ne[0])}; + ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), static_cast(src1->ne[1]), - static_cast(src1->ne[2])}; + static_cast(src1->ne[0])}; auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); @@ -573,12 +573,12 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { std::vector eff_shape_src1 = get_effective_shape(src1); std::vector eff_shape_dst = get_effective_shape(dst); - ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), - static_cast(src0->ne[1]), - static_cast(src0->ne[2])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), - static_cast(src1->ne[1]), - static_cast(src1->ne[2])}; + ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), + static_cast(src0->ne[1]), + static_cast(src0->ne[0])}; + ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), + static_cast(src1->ne[1]), + static_cast(src1->ne[0])}; auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); @@ -999,40 +999,40 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node); + openvino_frontend_compute(backend, cgraph, 0, end_node); // openvino_frontend_compute(backend, cgraph); // Process nodes in order - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); - } - } - } + // for (int i = 0; i < cgraph->n_nodes; i++) { + // if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // ggml_backend_openvino_transpose(cgraph->nodes[i]); + // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i); + // } + // } + // } return GGML_STATUS_SUCCESS; @@ -1257,14 +1257,13 @@ static const std::set& openvino_ops = []() -> const std::set(ggml_nelements(node)) }; - auto input_param = std::make_shared(ov::element::f32, flat_shape); - m_params.push_back(input_param); + // ov::Shape flat_shape = { static_cast(ggml_nelements(node)) }; + // auto input_param = std::make_shared(ov::element::f32, flat_shape); + // m_params.push_back(input_param); break; } @@ -72,15 +72,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - const size_t element_size = ggml_type_size(node->src[0]->type); - size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 - size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 - size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 - // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 - size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 - ov::Shape flat_input_shape = { total_phys }; - auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); - m_params.push_back(flat_input_param); + // const size_t element_size = ggml_type_size(node->src[0]->type); + // size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 + // size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 + // size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 + // // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + // size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 + // ov::Shape flat_input_shape = { total_phys }; + // auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); + // m_params.push_back(flat_input_param); m_continuous = false; break; @@ -94,13 +94,13 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 - size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 - size_t valid_k = static_cast(node->src[0]->ne[2]); // 7 - size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - ov::Shape flat_input_shape = { total_valid }; - auto input_param = std::make_shared(ov::element::f32, flat_input_shape); - m_params.push_back(input_param); + // size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 + // size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 + // size_t valid_k = static_cast(node->src[0]->ne[2]); // 7 + // size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 + // ov::Shape flat_input_shape = { total_valid }; + // auto input_param = std::make_shared(ov::element::f32, flat_input_shape); + // m_params.push_back(input_param); m_continuous = false; break; @@ -190,12 +190,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne[0]*node->src[0]->ne[1]*node->src[0]->ne[2] }; - ov::Shape flat_shape_src1 = { node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2] }; - auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); - m_params.push_back(param_src0); - m_params.push_back(param_src1); + // ov::Shape flat_shape_src0 = { node->src[0]->ne[0]*node->src[0]->ne[1]*node->src[0]->ne[2] }; + // ov::Shape flat_shape_src1 = { node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2] }; + // auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); + // auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + // m_params.push_back(param_src0); + // m_params.push_back(param_src1); if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { m_continuous = false; } else { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 8f27bbc97db8f..a0234ebd3033e 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -14,12 +14,15 @@ std::map get_ggml_graph_input_tensors(std::shared_ptr input_tensors; auto input_names = ggml_decoder->get_input_names(); // auto node_name = ggml_decoder->get_op_name(); - size_t iter = 0; + size_t op_iter = 0; for (size_t inp = 0; inp < input_names.size(); ++inp) { auto name = input_names[inp]; - std::string op_node_name = ggml_decoder->get_op_node_name(name, iter++); + std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); // auto node_op_name = ggml_decoder->get_node_op_name(name); + ov::element::Type input_type = ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + std::vector input_stride = ggml_decoder->get_input_stride(name); #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif @@ -28,36 +31,51 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), {80000}, input_data); - if (op_node_name == "CONT" && ggml_decoder->check_if_continuous()) { - ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * - ggml_decoder->get_input_shape(name).to_shape()[1] * - ggml_decoder->get_input_shape(name).to_shape()[2] }; - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); - } else if ( op_node_name == "CONT" && - !ggml_decoder->check_if_continuous() && - input_shape[0] == 1) { - size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 3072 - size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 7 - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); - std::vector strides = ggml_decoder->get_input_stride(name); - size_t phys_stride = static_cast(strides[1]) / element_size; - // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; - size_t total_phys = num_rows* phys_stride; - ov::Shape flat_input_shape = { total_phys }; - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); - } else if (op_node_name == "CONT") { + if (op_node_name == "CONT" && !ggml_decoder->check_if_continuous() && input_shape[0] == 1) { + const size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); + const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); + const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); + size_t phys_stride = static_cast(input_stride[1]) / element_size; + size_t total_logical = valid_elems * num_rows * dim2; + + std::vector contiguous_data(total_logical); + + for (size_t j = 0; j < num_rows; j++) { + const float *src_row = reinterpret_cast(input_data) + j * phys_stride; + float *dst_row = contiguous_data.data() + j * valid_elems; + std::copy(src_row, src_row + valid_elems, dst_row); + } + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), + ggml_decoder->get_input_shape(name).to_shape(), + contiguous_data.data()); + } else if (op_node_name == "CONT" && !ggml_decoder->check_if_continuous()){ size_t valid_i = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 96 size_t valid_j = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 32 size_t valid_k = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); // 7 + size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - ov::Shape flat_input_shape = { total_valid }; - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); - } else if (op_node_name == "MUL_MAT") { - ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * - ggml_decoder->get_input_shape(name).to_shape()[1] * - ggml_decoder->get_input_shape(name).to_shape()[2] }; - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); + size_t stride_j = static_cast(input_stride[1]) / element_size; // 672 + size_t stride_k = static_cast(input_stride[0]) / element_size; // 96 + + std::vector contiguous_data(total_valid); + const float *src_data = reinterpret_cast(input_data); + for (size_t k = 0; k < valid_k; k++) { + for (size_t j = 0; j < valid_j; j++) { + for (size_t i = 0; i < valid_i; i++) { + size_t out_index = k * (valid_i * valid_j) + j * valid_i + i; + size_t src_index = j * stride_j + k * stride_k + i; + contiguous_data[out_index] = src_data[src_index]; + } + } + } + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), + ggml_decoder->get_input_shape(name).to_shape(), + contiguous_data.data()); + // } else if (op_node_name == "MUL_MAT") { + // ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * + // ggml_decoder->get_input_shape(name).to_shape()[1] * + // ggml_decoder->get_input_shape(name).to_shape()[2] }; + // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); } From 4b10c9f5b61c3a3546321ca48df36844aaff5992 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Sun, 9 Mar 2025 23:35:18 +0800 Subject: [PATCH 038/156] 1. All operators implemented using OpenVINO can be successfully executed individually. 2. VIEW op output tensor shape is not same with CONT(non-contiguous) input tensor shape 3. CPY(non-contiguous) can't be implemented with original input/output tensor shape and data(need change the original shape when create input/output tensor) Currently. VIEW op executed in the ggml backend and others executed in the OpenVINO Frontend. --- ggml/src/ggml-openvino.cpp | 195 ++++++++++++------------ ggml/src/ggml-openvino/ggml-decoder.cpp | 86 ++++------- ggml/src/ggml-openvino/utils.cpp | 76 +++------ ggml/src/ggml-openvino/utils.h | 2 +- 4 files changed, 141 insertions(+), 218 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 230edded11095..082ab274585f1 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -537,8 +537,7 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { auto reshape_output = std::make_shared( batched_matmul, ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), - false - ); + false); auto model = std::make_shared(ov::NodeVector{ reshape_output }, ov::ParameterVector{ param_src0, param_src1 }); @@ -659,6 +658,7 @@ void ggml_backend_openvino_view(ggml_tensor *dst) { false); auto model = std::make_shared(ov::NodeVector{reshaped}, ov::ParameterVector{param}); + // auto model = std::make_shared(ov::NodeVector{param}, ov::ParameterVector{param}); // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/003_backend_view_model.xml"); auto compiled_model = core.compile_model(model, "CPU"); @@ -742,106 +742,91 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t nb0 = dst->nb[0]; if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { - const size_t valid_elems = static_cast(src0->ne[0]); - const size_t num_rows = static_cast(src0->ne[1]); - const size_t dim2 = static_cast(src0->ne[2]); - const size_t dim3 = static_cast(src0->ne[3]); + const size_t valid_elems = static_cast(src0->ne[0]); // 3072 + const size_t num_rows = static_cast(src0->ne[1]); // 7 + const size_t dim2 = static_cast(src0->ne[2]); // 1 - size_t phys_stride = static_cast(src0->nb[1]) / element_size; - size_t total_logical = valid_elems * num_rows * dim2 * dim3; + size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216 - std::vector contiguous_data(total_logical); + ov::Shape input_shape = { dim2, num_rows, phys_stride }; // 如 {1, 7, 9216 } + ov::Shape logical_shape = { dim2, num_rows, valid_elems }; // {1, 7, 3072} - for (size_t j = 0; j < num_rows; j++) { - const float *src_row = reinterpret_cast(src0->data) + j * phys_stride; - float *dst_row = contiguous_data.data() + j * valid_elems; - std::copy(src_row, src_row + valid_elems, dst_row); - } + auto input_param = std::make_shared(ov::element::f32, input_shape); - ov::Shape logical_shape = { dim2, num_rows, valid_elems}; - auto input_param = std::make_shared(ov::element::f32, logical_shape); - auto identity_const = ov::op::v0::Constant::create(ov::element::i64, - { logical_shape.size() }, - std::vector(logical_shape.begin(), logical_shape.end())); - auto identity_op = std::make_shared(input_param, identity_const, false); + std::vector begin = { 0, 0, 0 }; + std::vector end = { static_cast(dim2), + static_cast(num_rows), + static_cast(valid_elems) }; + std::vector strides = { 1, 1, 1 }; + + auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); + auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); + auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); + + std::vector begin_mask = {0, 0, 0}; + std::vector end_mask = {0, 0, 0}; + auto slice = std::make_shared( + input_param, + begin_const, + end_const, + strides_const, + begin_mask, + end_mask + ); - auto model = std::make_shared(ov::OutputVector{identity_op}, - ov::ParameterVector{input_param}); + auto model = std::make_shared(ov::OutputVector{ slice }, + ov::ParameterVector{ input_param }); ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - ov::Tensor input_tensor(ov::element::f32, logical_shape, contiguous_data.data()); + //[NOTE]: input_shape should be {1, 7, 9216} not the original shap of src0. + ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); infer_request.set_input_tensor(0, input_tensor); ov::Tensor output_tensor(ov::element::f32, logical_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); infer_request.infer(); - /* - for (size_t i01 = 0; i01 < ne01; ++i01) { - const char *src_row = reinterpret_cast(src0->data) + i01 * nb01; - char *dst_row = reinterpret_cast(dst->data) + i01 * dst->nb[1]; - - ov::Tensor src_row_tensor(ov::element::f32, {ne00}, const_cast(reinterpret_cast(src_row))); - ov::Tensor dst_row_tensor(ov::element::f32, {ne00}, reinterpret_cast(dst_row)); - - std::memcpy(dst_row_tensor.data(), src_row_tensor.data(), ne00 * sizeof(float)); - }*/ return; } // Case 3: Non-contiguous source, contiguous destination - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - const int64_t nb02 = src0->nb[2]; - const int64_t nb03 = src0->nb[3]; - // dst->ne =[3072,7,1,1], dst->nb =[4,12288,86016,86016], dst->type=GGML_TYPE_F32 // dst->src[0]->ne=[96,32,7,1], dst->src[0]->nb=[4,2688,384,86016], dst->src[0]->type=GGML_TYPE_F32 if (ggml_is_contiguous(dst)) { size_t valid_i = static_cast(src0->ne[0]); // 96 size_t valid_j = static_cast(src0->ne[1]); // 32 size_t valid_k = static_cast(src0->ne[2]); // 7 - size_t valid_l = static_cast(src0->ne[3]); // 1 - - size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - size_t stride_j = static_cast(src0->nb[1]) / element_size; // 672 - size_t stride_k = static_cast(src0->nb[2]) / element_size; // 96 - - std::vector contiguous_data(total_valid); - const float *src_data = reinterpret_cast(src0->data); - for (size_t k = 0; k < valid_k; k++) { - for (size_t j = 0; j < valid_j; j++) { - for (size_t i = 0; i < valid_i; i++) { - size_t out_index = k * (valid_i * valid_j) + j * valid_i + i; - size_t src_index = j * stride_j + k * stride_k + i; - contiguous_data[out_index] = src_data[src_index]; - } - } - } - // ov::Shape input_shape = { dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2] }; - ov::Shape input_shape = { dst->src[0]->ne[2], dst->src[0]->ne[1], dst->src[0]->ne[0]}; - auto input_param = std::make_shared(ov::element::f32, input_shape); + ov::Shape src_shape = { valid_k, valid_j, valid_i }; // {7, 32, 96}; + auto src_param = std::make_shared(ov::element::f32, src_shape); - // ov::Shape target_shape = { dst->ne[0], dst->ne[1], dst->ne[2] }; - // std::vector target_shape_vec = { static_cast(dst->ne[0]), - // static_cast(dst->ne[1]), dst->ne[2]}; - ov::Shape target_shape = { dst->ne[2], dst->ne[1], dst->ne[0] }; - std::vector target_shape_vec = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), dst->ne[0]}; - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, target_shape_vec); - auto reshaped = std::make_shared(input_param, reshape_const, false); + ov::Shape input_shape = { valid_j, valid_k, valid_i }; // {32, 7, 96} + auto tmp_param = ov::op::v0::Constant::create(ov::element::i64, { input_shape.size() }, input_shape); + auto input_param = std::make_shared(src_param, tmp_param, false); + + // 添加 Transpose 节点,将 {32,7,96} 变换为 {7,32,96},恢复逻辑顺序 + // 这里交换第 0 与第 1 维,即 permutation = {1, 0, 2} + std::vector order = {1, 0, 2}; + auto order_const = ov::op::v0::Constant::create(ov::element::i64, {order.size()}, order); + auto transpose = std::make_shared(input_param, order_const); - auto model = std::make_shared(ov::OutputVector{reshaped}, ov::ParameterVector{input_param}); + ov::Shape target_shape = { dst->ne[2], dst->ne[1], dst->ne[0] }; // {1, 7, 3072} + std::vector target_shape_vec = { static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0]) }; + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, { target_shape_vec.size() }, target_shape_vec); + auto reshaped = std::make_shared(transpose, reshape_const, false); + auto model = std::make_shared(ov::OutputVector{ reshaped }, + ov::ParameterVector{ src_param }); ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - ov::Tensor input_tensor(ov::element::f32, input_shape, contiguous_data.data()); + ov::Tensor input_tensor(ov::element::f32, src_shape, src0->data); infer_request.set_input_tensor(0, input_tensor); ov::Tensor output_tensor(ov::element::f32, target_shape, dst->data); @@ -998,40 +983,48 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } - int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); - // openvino_frontend_compute(backend, cgraph); + // Process nodes in order - // for (int i = 0; i < cgraph->n_nodes; i++) { - // if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // ggml_backend_openvino_transpose(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else { - // // Process a range of nodes with openvino_frontend_compute - // int start_index = i; - // while (i < cgraph->n_nodes - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // ) { - // i++; - // } - // if (start_index < i) { - // openvino_frontend_compute(backend, cgraph, start_index, --i); - // } - // } + + // if (cgraph->nodes[0]->ne[1] == 1) { + // bool prompt_process_flag = false; + // int end_node = cgraph->n_nodes - 1; + // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + // } else { + + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // ggml_backend_openvino_transpose(cgraph->nodes[i]); + // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i); + } + } + } + // } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 90bfdcd103873..2b04cd632a8a8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -46,12 +46,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); - m_continuous = true; - // ov::Shape flat_shape = { static_cast(ggml_nelements(node)) }; - // auto input_param = std::make_shared(ov::element::f32, flat_shape); - // m_params.push_back(input_param); + ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input_param = std::make_shared(ov::element::f32, input_shape); + m_params.push_back(input_param); + m_continuous = true; break; } @@ -59,12 +61,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->nb[0] == ggml_type_size(node->src[0]->type) && node->nb[0] == ggml_type_size(node->src[0]->type)) { - // for (size_t i01 = 0; i01 < node->src[0]->ne[1]; ++i01) { - // const char *src_row = reinterpret_cast(node->src[0]->data) + i01 * node->src[0]->nb[1]; - // char *dst_row = reinterpret_cast(node->data) + i01 * node->nb[1]; - // std::memcpy(dst_row, src_row, node->src[0]->ne[0] * ggml_type_size(node->src[0]->type)); - // } - inputs[src0_name] = node->src[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); @@ -72,15 +68,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - // const size_t element_size = ggml_type_size(node->src[0]->type); - // size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 - // size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 - // size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 - // // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 - // size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 - // ov::Shape flat_input_shape = { total_phys }; - // auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); - // m_params.push_back(flat_input_param); + const size_t element_size = ggml_type_size(node->src[0]->type); + size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 + size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 + size_t dim2 = static_cast(node->src[0]->ne[2]); // 1 + size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 + // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 + ov::Shape input_shape = { dim2, num_rows, phys_stride }; + auto input_param = std::make_shared(ov::element::f32, input_shape); + m_params.push_back(input_param); m_continuous = false; break; @@ -94,13 +91,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - // size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 - // size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 - // size_t valid_k = static_cast(node->src[0]->ne[2]); // 7 - // size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - // ov::Shape flat_input_shape = { total_valid }; - // auto input_param = std::make_shared(ov::element::f32, flat_input_shape); - // m_params.push_back(input_param); + ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input_param = std::make_shared(ov::element::f32, input_shape); + m_params.push_back(input_param); m_continuous = false; break; @@ -117,9 +112,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne, node->src[0]->ne + 3); - auto input_param = std::make_shared(ov::element::f32, src_shape); - m_params.push_back(input_param); + // ov::Shape src_shape(node->src[0]->ne, node->src[0]->ne + 3); + // auto input_param = std::make_shared(ov::element::f32, src_shape); + // m_params.push_back(input_param); break; } else { for (int64_t i1 = 0; i1 < node->ne[1]; ++i1) { // ne[1] = 3072 @@ -139,27 +134,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); m_continuous = false; - break; - - // inputs[src0_name] = node->src[0]; - // std::string temp_name = src0_name + std::string("_cpy_tmp"); - // inputs[temp_name] = node; - - // outputs[node_name] = node; - // m_input_names.push_back(src0_name); - // m_input_names.push_back(temp_name); - // m_node_op_name[src0_name] = ggml_op_name(node->op); - // m_node_op_name[temp_name] = ggml_op_name(node->op); - // m_output_names.push_back(node_name); - // m_continuous = false; - - // ov::Shape flat_src0_shape = {node->src[0]->nb[2]}; - // auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); - // m_params.push_back(param_src0); - - // ov::Shape flat_dst_shape = {node->nb[2], 1}; - // auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); - // m_params.push_back(param_dst_base); break; } @@ -167,8 +141,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); - // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs); inputs[node_name] = node; outputs[node_name] = node; m_input_names.push_back(node_name); @@ -190,12 +162,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne[0]*node->src[0]->ne[1]*node->src[0]->ne[2] }; - // ov::Shape flat_shape_src1 = { node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2] }; - // auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); - // auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); - // m_params.push_back(param_src0); - // m_params.push_back(param_src1); if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { m_continuous = false; } else { @@ -376,8 +342,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr if (m_node) { set_input_output(m_node, m_inputs, m_outputs); } else { - for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { - // for (int node_n = start_index; node_n <= end_index; node_n++) { + // for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + for (int node_n = start_index; node_n <= end_index; node_n++) { auto cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); // Init model input and output diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a0234ebd3033e..c44aa2568bfda 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -10,8 +10,10 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, con return std::make_shared(nullptr, cgraph, start_index, end_index); } -std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { - std::map input_tensors; +// std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { +std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder, bool flag) { + // std::map input_tensors; + std::vector> input_tensors; auto input_names = ggml_decoder->get_input_names(); // auto node_name = ggml_decoder->get_op_name(); size_t op_iter = 0; @@ -19,10 +21,7 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_op_node_name(name, op_iter++); // auto node_op_name = ggml_decoder->get_node_op_name(name); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - std::vector input_stride = ggml_decoder->get_input_stride(name); #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif @@ -31,58 +30,22 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), {80000}, input_data); - if (op_node_name == "CONT" && !ggml_decoder->check_if_continuous() && input_shape[0] == 1) { - const size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); + if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) { + std::vector input_stride = ggml_decoder->get_input_stride(name); + ov::element::Type input_type = ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); + // const size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; - size_t total_logical = valid_elems * num_rows * dim2; - - std::vector contiguous_data(total_logical); - - for (size_t j = 0; j < num_rows; j++) { - const float *src_row = reinterpret_cast(input_data) + j * phys_stride; - float *dst_row = contiguous_data.data() + j * valid_elems; - std::copy(src_row, src_row + valid_elems, dst_row); - } - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), - ggml_decoder->get_input_shape(name).to_shape(), - contiguous_data.data()); - } else if (op_node_name == "CONT" && !ggml_decoder->check_if_continuous()){ - size_t valid_i = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 96 - size_t valid_j = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 32 - size_t valid_k = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); // 7 - - size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - size_t stride_j = static_cast(input_stride[1]) / element_size; // 672 - size_t stride_k = static_cast(input_stride[0]) / element_size; // 96 - - std::vector contiguous_data(total_valid); - const float *src_data = reinterpret_cast(input_data); - for (size_t k = 0; k < valid_k; k++) { - for (size_t j = 0; j < valid_j; j++) { - for (size_t i = 0; i < valid_i; i++) { - size_t out_index = k * (valid_i * valid_j) + j * valid_i + i; - size_t src_index = j * stride_j + k * stride_k + i; - contiguous_data[out_index] = src_data[src_index]; - } - } - } - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), - ggml_decoder->get_input_shape(name).to_shape(), - contiguous_data.data()); - // } else if (op_node_name == "MUL_MAT") { - // ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * - // ggml_decoder->get_input_shape(name).to_shape()[1] * - // ggml_decoder->get_input_shape(name).to_shape()[2] }; - // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); + ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); } - // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); - // } - input_tensors[name] = input_tensor; + // input_tensors[name] = input_tensor; + input_tensors.emplace_back(name, input_tensor); } return input_tensors; } @@ -114,11 +77,11 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) { - ov::Core core; - auto devices = core.get_available_devices(); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index, bool flag) { + static ov::Core core; + // auto devices = core.get_available_devices(); // Get GGML Frontend - auto front_end = get_ggml_frontend(); + static auto front_end = get_ggml_frontend(); if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); return GGML_STATUS_FAILED; @@ -161,11 +124,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Get input tensor auto input_names = ggml_decoder->get_input_names(); - auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder); + auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder, flag); // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { - infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + // infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + infer_request.set_input_tensor(i, input_tensors.at(i).second); // auto input_tensor = infer_request.get_input_tensor(i); // auto input_shape = input_tensor.get_shape(); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index fc5268d98a993..7806c418cb62b 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,4 @@ #include "ggml-decoder.h" #include "ggml-backend-impl.h" -enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0, bool flag = true); From 4f5b1b6d48f956b8d7c5db3142e22de06a612b76 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 11 Mar 2025 10:32:50 +0800 Subject: [PATCH 039/156] 1. Update the implementation of CPY node when it's non-contiguous 2. Remove duplicate get node operation function --- ggml/src/ggml-openvino.cpp | 108 ++++++++++++++---------- ggml/src/ggml-openvino/decoder.h | 2 - ggml/src/ggml-openvino/ggml-decoder.cpp | 86 +++++++------------ ggml/src/ggml-openvino/ggml-decoder.h | 2 - ggml/src/ggml-openvino/utils.cpp | 21 +++-- 5 files changed, 110 insertions(+), 109 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 082ab274585f1..679b030dfaaf8 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -849,6 +849,7 @@ static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { const struct ggml_tensor *src0 = dst->src[0]; + const struct ggml_tensor *src1 = dst->src[1]; assert(src0 != nullptr); assert(ggml_nelements(dst) == ggml_nelements(src0)); @@ -889,64 +890,81 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { infer_request.set_output_tensor(0, dst_tensor); infer_request.infer(); } else { + int src0_elem_size = ggml_type_size(src0->type); + int src1_elem_size = ggml_type_size(src1->type); + + int src0_logical_cols = src0->ne[0]; + int src0_logical_rows = src0->ne[1]; + int src1_logical_cols = src1->ne[0]; + int src1_logical_rows = src1->ne[1]; + + int src0_phys_cols = src0->nb[0] / src0_elem_size; + int src0_phys_rows = src0_logical_rows; + + int src1_phys_cols = src1->nb[1] / src1_elem_size; + int src1_phys_rows = src1_logical_rows; + + ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; + ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; + + size_t logical_elems = static_cast(src0_logical_cols * src0_logical_rows); + size_t src_flat_size = 1 * src0_phys_cols * src0_phys_rows; + size_t dst_flat_size = 1 * src1_phys_rows * src1_phys_cols; + + ov::Core core; + std::vector gather_idx; - for (int row = 0; row < dst->src[0]->ne[1]; row++) { - for (int col = 0; col < dst->src[0]->ne[0]; col++) { - gather_idx.push_back((row*dst->src[0]->nb[1]+col*dst->src[0]->nb[0])/4); + gather_idx.reserve(logical_elems); + for (int row = 0; row < src0_logical_rows; row++) { + for (int col = 0; col < src0_logical_cols; col++) { + gather_idx.push_back(static_cast(row + col * src0_phys_rows)); } } - size_t N = gather_idx.size(); - ov::Shape gather_idx_shape = {N, 1}; + ov::Shape gather_idx_shape = { logical_elems }; + std::vector scatter_idx; - for (int row = 0; row < dst->ne[1]; row++) { - for (int col = 0; col < dst->ne[0]; col++) { - scatter_idx.push_back(row * dst->nb[1] / 2 + col); + scatter_idx.reserve(logical_elems); + for (int row = 0; row < src1_logical_rows; row++) { + for (int col = 0; col < src1_logical_cols; col++) { + scatter_idx.push_back(static_cast(row * src1_phys_cols + col)); } } - ov::Shape scatter_idx_shape = {N, 1}; + ov::Shape scatter_idx_shape = { logical_elems, 1 }; - // param_src0 shape => 1D, rank=1, size is large enough. For example, row*col= 21504 + some padding, e.g. 80000 - // ov::Shape flat_src0_shape = {80000}; - ov::Shape flat_src0_shape = {dst->src[0]->nb[2]}; - auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); - // auto param_src00 = std::make_shared(ov::element::f32, flat_src0_shape); + auto param_src0 = std::make_shared(ov::element::f32, src0_phys_shape); + auto param_src1 = std::make_shared(ov::element::f16, src1_phys_shape); - auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); - auto gather_axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto gathered = std::make_shared( - param_src0, gather_indices_const, gather_axis_const); + auto src_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, + { static_cast(src_flat_size) }); + auto reshape_src = std::make_shared(param_src0, src_flat_shape_const, false); + auto dst_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, + { static_cast(dst_flat_size) }); + auto reshape_dst = std::make_shared(param_src1, dst_flat_shape_const, false); + auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto gathered = std::make_shared(reshape_src, gather_indices_const, axis_const); auto converted = std::make_shared(gathered, ov::element::f16); - // param_dst_base shape => 1D, rank=1, size够大, e.g. row=3072 => i up to 3071 => offset i*64=196544 + j*2, e.g.200000 - // ov::Shape flat_dst_shape = {200000, 1}; - ov::Shape flat_dst_shape = {dst->nb[2], 1}; - auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); - // auto param_dst_base11 = std::make_shared(ov::element::f16, flat_dst_shape); - auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx); + auto scatter = std::make_shared(reshape_dst, scatter_indices_const, converted); - // ScatterNDUpdate( base, scatter_indices, updates ) - // scatter_indices last dimension = 1 => each index is 1D coordinate - auto scatter = std::make_shared( - param_dst_base, scatter_indices_const, converted - ); - - ov::ParameterVector params = { param_src0, param_dst_base }; - // ov::ParameterVector params = { param_src0}; - // ov::ParameterVector params = { param_src00, param_dst_base11}; - auto model = std::make_shared(ov::OutputVector{ scatter }, params); + std::vector dst_phys_shape_vec = {1, static_cast(src1_phys_rows), + static_cast(src1_phys_cols) }; + auto dst_phys_shape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, dst_phys_shape_vec); + auto final_output = std::make_shared(scatter, dst_phys_shape_const, false); + ov::ParameterVector params = { param_src0, param_src1 }; + auto model = std::make_shared(ov::OutputVector{ final_output }, params); auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - ov::Tensor tensor_src0(ov::element::f32, flat_src0_shape, src0->data); - ov::Tensor tensor_dst_base(ov::element::f16, flat_dst_shape, dst->data); - - infer_request.set_input_tensor(0, tensor_src0); - infer_request.set_input_tensor(1, tensor_dst_base); + ov::Tensor tensor_src(ov::element::f32, src0_phys_shape, src0->data); + ov::Tensor tensor_dst(ov::element::f16, src1_phys_shape, src1->data); + infer_request.set_input_tensor(0, tensor_src); + infer_request.set_input_tensor(1, tensor_dst); - ov::Tensor out_tensor(ov::element::f16, flat_dst_shape, dst->data); + ov::Tensor out_tensor(ov::element::f16, src1_phys_shape, dst->data); infer_request.set_output_tensor(0, out_tensor); infer_request.infer(); @@ -986,15 +1004,17 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process nodes in order - // if (cgraph->nodes[0]->ne[1] == 1) { - // bool prompt_process_flag = false; + bool prompt_process_flag = true; + if (cgraph->nodes[0]->ne[1] == 1) { + prompt_process_flag = false; + } // int end_node = cgraph->n_nodes - 1; // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); // } else { for (int i = 0; i < cgraph->n_nodes; i++) { if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // ggml_backend_openvino_permute(cgraph->nodes[i]); + ggml_backend_openvino_permute(cgraph->nodes[i]); // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { @@ -1020,7 +1040,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe i++; } if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); + openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); } } } diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index 729946ac39179..584f16986c1f2 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -36,8 +36,6 @@ class GgmlDecoder : public DecoderBase { virtual std::vector get_input_names() const = 0; - virtual const std::string& get_node_op_name(const std::string& name) const = 0; - virtual std::string& get_op_node_name(const std::string& name, const int index = -1) = 0; // virtual const struct tensor_info get_node_op_info(const std::string& name) const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2b04cd632a8a8..218c53f09f753 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -6,18 +6,6 @@ #include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { - // m_node_op_name[node->name] = ggml_op_name(node->op); - - // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_input_" + ggml_op_name(node->src[0]->op); - // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); - - // Execute singel CONT operator is OK - // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_" + ggml_op_name(node->src[0]->op); - // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_" + ggml_op_name(node->op); - - // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs); - // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs); - std::string src0_name = std::string(node->src[0]->name); std::string node_name = std::string(node->name); @@ -32,7 +20,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; @@ -43,7 +30,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); @@ -64,7 +50,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); @@ -87,7 +72,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); @@ -107,32 +91,45 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); m_continuous = true; - // ov::Shape src_shape(node->src[0]->ne, node->src[0]->ne + 3); - // auto input_param = std::make_shared(ov::element::f32, src_shape); - // m_params.push_back(input_param); + ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input_param = std::make_shared(ov::element::f32, input_shape); + m_params.push_back(input_param); break; } else { - for (int64_t i1 = 0; i1 < node->ne[1]; ++i1) { // ne[1] = 3072 - for (int64_t i0 = 0; i0 < node->ne[0]; ++i0) { // ne[0] = 7 - int64_t src_index = i0 * node->src[0]->nb[0] / sizeof(float) + // stride in nb[0] - i1 * node->src[0]->nb[1] / sizeof(float); // stride in nb[1] - char *dst_ptr = static_cast(node->data) + - i0 * node->nb[0] + i1 * node->nb[1]; - *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(((float*)node->src[0]->data)[src_index]); - } - } - // inputs[node->src[0]->name] = node->src[0]; - inputs[node_name] = node; + std::string src1_name = std::string(node->src[1]->name); + inputs[src0_name] = node->src[0]; + inputs[src1_name] = node->src[1]; outputs[node_name] = node; - m_input_names.push_back(node_name); - m_node_op_name[node_name] = ggml_op_name(node->op); + m_input_names.push_back(src0_name); + m_input_names.push_back(src1_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); + + int src0_elem_size = ggml_type_size(node->src[0]->type); + int src1_elem_size = ggml_type_size(node->src[1]->type); + + int src0_logical_rows = node->src[0]->ne[1]; + int src1_logical_rows = node->src[1]->ne[1]; + + int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; + int src0_phys_rows = src0_logical_rows; + + int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; + int src1_phys_rows = src1_logical_rows; + ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; + ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; + auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); + auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); + m_params.push_back(input0_param); + m_params.push_back(input1_param); + m_continuous = false; break; @@ -144,7 +141,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_op_node_name.emplace_back(node_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; @@ -155,7 +151,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; @@ -167,17 +162,13 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); - // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; inputs[src1_name] = node->src[1]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); - m_node_op_name[src1_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; @@ -193,15 +184,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); if (node->src[1]) { - // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); - // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src1_name] = node->src[1]; - m_node_op_name[src1_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); } @@ -210,26 +197,19 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); - // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; inputs[src1_name] = node->src[1]; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); - m_node_op_name[src1_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); outputs[node_name] = node; m_output_names.push_back(node_name); if (node->src[2]) { - // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs) + "_input_" + ggml_op_name(node->src[2]->op); - // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); std::string src2_name = std::string(node->src[2]->name); inputs[src2_name] = node->src[2]; m_input_names.push_back(src2_name); - m_node_op_name[src2_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); } break; @@ -423,12 +403,6 @@ std::vector GgmlOvDecoder::get_input_names() const { return m_input_names; } -const std::string& GgmlOvDecoder::get_node_op_name(const std::string& name) const { - auto it = m_node_op_name.find(name); - static const std::string empty_str; - return (it != m_node_op_name.end()) ? it->second : empty_str; -} - std::string& GgmlOvDecoder::get_op_node_name(const std::string& key_name, const int index) { if (index == -1) { for (size_t i = 0; i < m_op_node_name.size(); ++i) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 238f1d79b4257..fc1d87840976a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -71,7 +71,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_continuous; } - virtual const std::string& get_node_op_name(const std::string& name) const override; std::string& get_op_node_name(const std::string& key_name, const int index) override; virtual const std::vector>& get_params() const override; @@ -90,7 +89,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::string m_op_name; mutable std::string m_name; bool m_continuous; - std::map m_node_op_name; std::vector> m_params; std::vector> m_op_node_name; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index c44aa2568bfda..a0adc917e72cc 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -22,24 +22,35 @@ std::vector> get_ggml_graph_input_tensors(std std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); // auto node_op_name = ggml_decoder->get_node_op_name(name); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + auto check_if_contiguous = ggml_is_contiguous(ggml_decoder->get_input_ggml_tensor(name)); #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif ov::Tensor input_tensor; auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); - // if (node_op_name == "CPY" && (input_shape[0] != 7)) { - // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), {80000}, input_data); if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) { std::vector input_stride = ggml_decoder->get_input_stride(name); ov::element::Type input_type = ggml_decoder->get_input_type(name); size_t element_size = input_type.size(); - // const size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous + std::vector input_stride = ggml_decoder->get_input_stride(name); + ov::element::Type input_type = ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); + ov::Shape phys_shape; + static int iter = 0; + if (iter++ % 2 == 0) { + phys_shape = {1, input_shape[1], input_stride[2] / element_size}; + input_tensor = ov::Tensor(ov::element::f32, phys_shape, input_data); + } else { + phys_shape = {1, input_shape[1], input_stride[1] / element_size}; + input_tensor = ov::Tensor(ov::element::f16, phys_shape, input_data); + } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); } @@ -105,7 +116,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); - ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); + // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); if (!model) { GGML_LOG_ERROR("Model is not converted \n"); @@ -117,7 +128,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Loading a model to the device ov::CompiledModel compiled_model = core.compile_model(model); - ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); + // ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); From bb3834b2e611f8b46e5ececc520d4417d0a959a0 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 11 Mar 2025 15:16:40 +0800 Subject: [PATCH 040/156] Minor Update --- ggml/src/ggml-openvino.cpp | 12 ++++++------ ggml/src/ggml-openvino/ggml-decoder.cpp | 20 +++++++++++++------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 679b030dfaaf8..4608019d9f1e0 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -813,7 +813,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { auto order_const = ov::op::v0::Constant::create(ov::element::i64, {order.size()}, order); auto transpose = std::make_shared(input_param, order_const); - ov::Shape target_shape = { dst->ne[2], dst->ne[1], dst->ne[0] }; // {1, 7, 3072} + ov::Shape target_shape = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0]) }; // {1, 7, 3072} std::vector target_shape_vec = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0]) }; @@ -866,7 +866,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { std::shared_ptr model; if (ggml_is_contiguous(dst)) { // Contiguous Case: Flatten src and reshape to dst shape - ov::Shape flattened_shape = {ggml_nelements(src0)}; + ov::Shape flattened_shape = {static_cast(ggml_nelements(src0))}; auto flatten = std::make_shared( src_input, ov::op::v0::Constant::create(ov::element::i64, {1}, flattened_shape), false); @@ -1013,12 +1013,12 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // } else { for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); + if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // ggml_backend_openvino_permute(cgraph->nodes[i]); // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 218c53f09f753..55a82b0580ce4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -231,7 +231,7 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { file << "n_nodes = " << cgraph->n_nodes << "\n"; file << " " << std::setw(3) << "nodes" << std::setw(15) << "shape" - << std::setw(16) << "op" + << std::setw(20) << "op" << std::setw(20) << "name" << std::setw(3) << " " << std::setw(50) << "stride" @@ -242,21 +242,24 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << ", " - << std::setw(5) << node->ne[2] << "] " + << std::setw(5) << node->ne[2] << ", " + << std::setw(5) << node->ne[3] << "] " << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " " << std::left << std::setw(44) << node->name << std::right << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ") << std::setw(2) << "[ " << std::setw(0) << node->nb[0] << ", " << std::setw(5) << node->nb[1] << ", " - << std::setw(5) << node->nb[2] << "] " + << std::setw(5) << node->nb[2] << ", " + << std::setw(5) << node->nb[3] << "] " << "\n"; if (node->src[0]) { file << std::setw(10) << " [ " << std::setw(5) << node->src[0]->ne[0] << ", " << std::setw(5) << node->src[0]->ne[1] << ", " - << std::setw(5) << node->src[0]->ne[2] << "] " + << std::setw(5) << node->src[0]->ne[2] << ", " + << std::setw(5) << node->src[0]->ne[3] << "] " << std::setw(12) << "0: " << std::left << std::setw(12) << ggml_op_name(node->src[0]->op) << std::right; // // Custom logic to handle '\000' @@ -269,14 +272,16 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(16) << "[ " << std::setw(0) << node->src[0]->nb[0] << ", " << std::setw(5) << node->src[0]->nb[1] << ", " - << std::setw(5) << node->src[0]->nb[2] << "] " + << std::setw(5) << node->src[0]->nb[2] << ", " + << std::setw(5) << node->src[0]->nb[3] << "] " << "\n"; } if (node->src[1]) { file << std::setw(10) << " [ " << std::setw(5) << node->src[1]->ne[0] << ", " << std::setw(5) << node->src[1]->ne[1] << ", " - << std::setw(5) << node->src[1]->ne[2] << "] " + << std::setw(5) << node->src[1]->ne[2] << ", " + << std::setw(5) << node->src[1]->ne[3] << "] " << std::setw(12) << "1: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; // // Custom logic to handle '\000' @@ -289,7 +294,8 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(16) << "[ " << std::setw(0) << node->src[1]->nb[0] << ", " << std::setw(5) << node->src[1]->nb[1] << ", " - << std::setw(5) << node->src[1]->nb[2] << "] " + << std::setw(5) << node->src[1]->nb[2] << ", " + << std::setw(5) << node->src[1]->nb[3] << "] " << "\n"; } } From a9a4feda6fd8eb75a1168a5d466e865b9011075a Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 12 Mar 2025 21:43:23 +0800 Subject: [PATCH 041/156] Try to add VIEW node to OV Frontend and have some issues that need to be dealt with --- ggml/src/ggml-openvino.cpp | 232 +++++++++++++++++++++--- ggml/src/ggml-openvino/decoder.h | 2 + ggml/src/ggml-openvino/ggml-decoder.cpp | 27 ++- ggml/src/ggml-openvino/ggml-decoder.h | 2 + 4 files changed, 230 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 4608019d9f1e0..d2a21511ddfbf 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -647,36 +647,169 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) { } void ggml_backend_openvino_view(ggml_tensor *dst) { - ov::Core core; - ov::Shape tensor_shape{static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - // auto param = std::make_shared(ov::element::f32, tensor_shape); - auto param = std::make_shared(ov::element::f16, tensor_shape); + /* + // Case 1: Set the output tensor shape as the same shape of the input tensor [1, 7, 9216], for next CONT node operator + if (dst->ne[0] > dst->ne[1] && (dst->ne[0] * dst->nb[0] != dst->nb[1]) && dst->ne[2] == 1) { + // if (dst->view_offs == 0) { + // return; + // } + ov::Core core; + ov::Shape input_shape{ static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; + ov::Shape out_shape{ static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - auto reshaped = std::make_shared(param, - ov::op::v0::Constant::create(ov::element::i64, { tensor_shape.size() }, tensor_shape), - false); + auto input_param = std::make_shared(ov::element::f32, input_shape); - auto model = std::make_shared(ov::NodeVector{reshaped}, ov::ParameterVector{param}); - // auto model = std::make_shared(ov::NodeVector{param}, ov::ParameterVector{param}); - // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/003_backend_view_model.xml"); + // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, + // ov::Shape{input_shape.size()}, + // std::vector(input_shape.begin(), input_shape.end())); + // auto res = std::make_shared(input_param, new_shape_node, false); - auto compiled_model = core.compile_model(model, "CPU"); + int64_t split_addr = dst->view_offs / dst->nb[0]; + std::vector begin = { 0, 0, split_addr }; + std::vector end = { static_cast(dst->src[0]->ne[2]), + static_cast(dst->src[0]->ne[1]), + split_addr + static_cast(dst->ne[0]) }; + std::vector strides = { 1, 1, 1 }; + + auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); + auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); + auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); + + std::vector begin_mask = {0, 0, 0}; + std::vector end_mask = {0, 0, 0}; + auto slice = std::make_shared( + input_param, + begin_const, + end_const, + strides_const, + begin_mask, + end_mask + ); + + auto model = std::make_shared(ov::OutputVector{ slice }, + ov::ParameterVector{ input_param }); + + auto compiled_model = core.compile_model(model, "CPU"); + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); + infer_request.set_input_tensor(0, input_tensor); + + ov::Tensor output_tensor(ov::element::f32, out_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + infer_request.infer(); + } + */ + + + /* + // Case 2: Slice contiguous input tensor [98304, 1, 1] to contiguout output tensor [ 21504, 1, 1] + if (ggml_is_contiguous(dst) && dst->ne[1] == 1 && (dst->ne[0] * dst->nb[0] == dst->nb[1])) { + ov::Core core; + ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]), + static_cast(dst->src[0]->ne[1]), + static_cast(dst->src[0]->ne[0])}; + ov::Shape output_shape = { static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0])}; + auto input_param = std::make_shared(ov::element::f16, input_shape); + + + std::vector begin = { 0, 0, 0 }; + std::vector end = { static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0]) }; + std::vector strides = { 1, 1, 1 }; + + auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); + auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); + auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); + + std::vector begin_mask = {0, 0, 0}; + std::vector end_mask = {0, 0, 0}; + auto slice = std::make_shared( + input_param, + begin_const, + end_const, + strides_const, + begin_mask, + end_mask + ); + + std::shared_ptr model = std::make_shared(ov::OutputVector{ slice }, + ov::ParameterVector{ input_param }); + + auto compiled_model = core.compile_model(model, "CPU"); + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data); + ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data); + infer_request.set_input_tensor(0, input_tensor); + infer_request.set_output_tensor(0, output_tensor); + + infer_request.infer(); + } + */ + + /* + // Case 3: Reshape the input tensor [1, 1, 98304] to output tensor [1, 3072, 32](Physical shape) + if (dst->ne[0] < dst->ne[1] && dst->ne[2] == 1) { + ov::Core core; + ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]), + static_cast(dst->src[0]->ne[1]), + static_cast(dst->src[0]->ne[0])}; + ov::Shape output_shape = { static_cast(dst->nb[2]), + static_cast(dst->ne[1]), + static_cast(dst->nb[1] / dst->nb[0])}; + auto input_param = std::make_shared(ov::element::f16, input_shape); + + auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, + ov::Shape{output_shape.size()}, + std::vector(output_shape.begin(), output_shape.end())); + auto res = std::make_shared(input_param, new_shape_node, false); + + std::shared_ptr model = std::make_shared(ov::OutputVector{res}, + ov::ParameterVector{input_param}); + auto compiled_model = core.compile_model(model, "CPU"); + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data); + ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data); + infer_request.set_input_tensor(0, input_tensor); + infer_request.set_output_tensor(0, output_tensor); + + infer_request.infer(); + } + */ + + /* + // Case 4: + if (dst->ne[0] != 1 && dst->ne[1] != 1 && dst->ne[2] !=1) { + + } + */ + + ov::Core core; + ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; + // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; + auto input_param = std::make_shared(ov::element::f32, input_shape); + + std::shared_ptr model = std::make_shared(ov::OutputVector{input_param}, + ov::ParameterVector{input_param}); + auto compiled_model = core.compile_model(model, "CPU"); ov::InferRequest infer_request = compiled_model.create_infer_request(); - // ov::Tensor input_tensor(ov::element::f32, tensor_shape, dst->data); - ov::Tensor input_tensor(ov::element::f16, tensor_shape, dst->data); - // infer_request.set_tensor(param, input_tensor); + ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); + // ov::Tensor output_tensor(ov::element::f32, input_shape, dst->data); infer_request.set_input_tensor(0, input_tensor); - - // ov::Tensor output_tensor(ov::element::f32, tensor_shape, dst->data); - ov::Tensor output_tensor(ov::element::f16, tensor_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); + // infer_request.set_output_tensor(0, output_tensor); infer_request.infer(); - // auto output_tensor = infer_request.get_output_tensor(0); - // dst->data = output_tensor.data(); + + GGML_UNUSED(dst); } void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { @@ -747,12 +880,20 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t dim2 = static_cast(src0->ne[2]); // 1 size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216 + // size_t phys_stride = static_cast(src0->ne[0]); // 3072 ov::Shape input_shape = { dim2, num_rows, phys_stride }; // 如 {1, 7, 9216 } ov::Shape logical_shape = { dim2, num_rows, valid_elems }; // {1, 7, 3072} + // std::cout << "CONT input shape: " << input_shape << std::endl; auto input_param = std::make_shared(ov::element::f32, input_shape); + // int64_t split_addr = dst->src[0]->view_offs / dst->src[0]->nb[0]; + // std::vector begin = { 0, 0, split_addr }; + // std::vector end = { static_cast(dim2), + // static_cast(num_rows), + // split_addr + static_cast(valid_elems) }; + std::vector begin = { 0, 0, 0 }; std::vector end = { static_cast(dim2), static_cast(num_rows), @@ -838,6 +979,35 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { } static void ggml_backend_openvino_transpose(ggml_tensor *dst) { + ov::Core core; + ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; + ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; + auto input_param = std::make_shared(ov::element::f32, input_shape); + + //auto res = std::make_shared(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); + + + + auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, + ov::Shape{output_shape.size()}, + std::vector(output_shape.begin(), output_shape.end())); + auto res = std::make_shared(input_param, new_shape_node, false); + + + + + std::shared_ptr model = std::make_shared(ov::OutputVector{res}, + ov::ParameterVector{input_param}); + auto compiled_model = core.compile_model(model, "CPU"); + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); + ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); + infer_request.set_input_tensor(0, input_tensor); + infer_request.set_output_tensor(0, output_tensor); + + infer_request.infer(); + // NOP GGML_UNUSED(dst); } @@ -1013,29 +1183,31 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // } else { for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + ggml_backend_openvino_transpose(cgraph->nodes[i]); // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes + && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() + && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() ) { i++; } diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index 584f16986c1f2..e287f31e23c8c 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -46,6 +46,8 @@ class GgmlDecoder : public DecoderBase { virtual element::Type get_output_type(const std::string& name) const = 0; + virtual int32_t* get_input_op_params(const std::string& name) const = 0; + virtual int32_t* get_output_op_params(const std::string& name) const = 0; virtual std::string& get_output_name(size_t index) const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 55a82b0580ce4..4483241481554 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -138,11 +138,28 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; - m_input_names.push_back(node_name); - m_op_node_name.emplace_back(node_name, ggml_op_name(node->op)); + m_input_names.push_back(src0_name); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); + + // ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), + // static_cast(node->src[0]->ne[1]), + // static_cast(node->src[0]->ne[0])}; + // auto input_param = std::make_shared(ov::element::f32, input_shape); + // m_params.push_back(input_param); + + // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) { + // m_continuous = false; + // } else { + // m_continuous = true; + + // } + // m_continuous = false; + + // [TODO]: multiple cases + break; } // SCALE @@ -467,6 +484,10 @@ ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const return type; } +int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const{ + return m_inputs.at(name)->op_params; +} + int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const{ return m_outputs.at(name)->op_params; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index fc1d87840976a..eac045d158300 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -43,6 +43,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual ov::element::Type get_output_type(const std::string& name) const override; + virtual int32_t* get_input_op_params(const std::string& name) const override; + virtual int32_t* get_output_op_params(const std::string& name) const override; virtual std::string& get_output_name(size_t index) const override; From b3cf661a0664e28c1344a9115bde40b0df0544fc Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Sat, 15 Mar 2025 19:32:40 +0800 Subject: [PATCH 042/156] 1. In the Prompt process and predict first token stage, the PERMUTE node needs to be integrated into the OV Frontend 2. In the predict latest token stage, the VIEW, CONT, Reshape need to be integrated into the OV Frontend. --- ggml/src/ggml-openvino.cpp | 242 ++++-------------------- ggml/src/ggml-openvino/ggml-decoder.cpp | 40 ++-- ggml/src/ggml-openvino/utils.cpp | 25 ++- 3 files changed, 83 insertions(+), 224 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index d2a21511ddfbf..fd24356412218 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -647,168 +647,6 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) { } void ggml_backend_openvino_view(ggml_tensor *dst) { - - /* - // Case 1: Set the output tensor shape as the same shape of the input tensor [1, 7, 9216], for next CONT node operator - if (dst->ne[0] > dst->ne[1] && (dst->ne[0] * dst->nb[0] != dst->nb[1]) && dst->ne[2] == 1) { - // if (dst->view_offs == 0) { - // return; - // } - ov::Core core; - ov::Shape input_shape{ static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; - ov::Shape out_shape{ static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - - auto input_param = std::make_shared(ov::element::f32, input_shape); - - // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, - // ov::Shape{input_shape.size()}, - // std::vector(input_shape.begin(), input_shape.end())); - // auto res = std::make_shared(input_param, new_shape_node, false); - - int64_t split_addr = dst->view_offs / dst->nb[0]; - std::vector begin = { 0, 0, split_addr }; - std::vector end = { static_cast(dst->src[0]->ne[2]), - static_cast(dst->src[0]->ne[1]), - split_addr + static_cast(dst->ne[0]) }; - std::vector strides = { 1, 1, 1 }; - - auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); - - std::vector begin_mask = {0, 0, 0}; - std::vector end_mask = {0, 0, 0}; - auto slice = std::make_shared( - input_param, - begin_const, - end_const, - strides_const, - begin_mask, - end_mask - ); - - auto model = std::make_shared(ov::OutputVector{ slice }, - ov::ParameterVector{ input_param }); - - auto compiled_model = core.compile_model(model, "CPU"); - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, out_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - } - */ - - - /* - // Case 2: Slice contiguous input tensor [98304, 1, 1] to contiguout output tensor [ 21504, 1, 1] - if (ggml_is_contiguous(dst) && dst->ne[1] == 1 && (dst->ne[0] * dst->nb[0] == dst->nb[1])) { - ov::Core core; - ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]), - static_cast(dst->src[0]->ne[1]), - static_cast(dst->src[0]->ne[0])}; - ov::Shape output_shape = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0])}; - auto input_param = std::make_shared(ov::element::f16, input_shape); - - - std::vector begin = { 0, 0, 0 }; - std::vector end = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - std::vector strides = { 1, 1, 1 }; - - auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); - - std::vector begin_mask = {0, 0, 0}; - std::vector end_mask = {0, 0, 0}; - auto slice = std::make_shared( - input_param, - begin_const, - end_const, - strides_const, - begin_mask, - end_mask - ); - - std::shared_ptr model = std::make_shared(ov::OutputVector{ slice }, - ov::ParameterVector{ input_param }); - - auto compiled_model = core.compile_model(model, "CPU"); - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data); - ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data); - infer_request.set_input_tensor(0, input_tensor); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - } - */ - - /* - // Case 3: Reshape the input tensor [1, 1, 98304] to output tensor [1, 3072, 32](Physical shape) - if (dst->ne[0] < dst->ne[1] && dst->ne[2] == 1) { - ov::Core core; - ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]), - static_cast(dst->src[0]->ne[1]), - static_cast(dst->src[0]->ne[0])}; - ov::Shape output_shape = { static_cast(dst->nb[2]), - static_cast(dst->ne[1]), - static_cast(dst->nb[1] / dst->nb[0])}; - auto input_param = std::make_shared(ov::element::f16, input_shape); - - auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, - ov::Shape{output_shape.size()}, - std::vector(output_shape.begin(), output_shape.end())); - auto res = std::make_shared(input_param, new_shape_node, false); - - std::shared_ptr model = std::make_shared(ov::OutputVector{res}, - ov::ParameterVector{input_param}); - auto compiled_model = core.compile_model(model, "CPU"); - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data); - ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data); - infer_request.set_input_tensor(0, input_tensor); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - } - */ - - /* - // Case 4: - if (dst->ne[0] != 1 && dst->ne[1] != 1 && dst->ne[2] !=1) { - - } - */ - - ov::Core core; - ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; - // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - - std::shared_ptr model = std::make_shared(ov::OutputVector{input_param}, - ov::ParameterVector{input_param}); - auto compiled_model = core.compile_model(model, "CPU"); - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); - // ov::Tensor output_tensor(ov::element::f32, input_shape, dst->data); - infer_request.set_input_tensor(0, input_tensor); - // infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - GGML_UNUSED(dst); } @@ -823,7 +661,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t element_size = ggml_type_size(src0->type); // Case 1: Both tensors are contiguous - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && (src0->ne[0] * element_size == src0->nb[1])) { ov::Shape input_shape = { static_cast(src0->ne[2]), static_cast(src0->ne[1]), @@ -1152,6 +990,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe std::vector permute_indices; std::vector mul_mat_indices; + std::vector add_indices; for (int i = 0; i < cgraph->n_nodes; i++) { if (cgraph->nodes[i]->op == GGML_OP_CONT) { @@ -1168,6 +1007,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe permute_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) { mul_mat_indices.push_back(i); + } else if (cgraph->nodes[i]->op == GGML_OP_ADD) { + add_indices.push_back(i); } } @@ -1177,48 +1018,49 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe bool prompt_process_flag = true; if (cgraph->nodes[0]->ne[1] == 1) { prompt_process_flag = false; - } - // int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - // } else { - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() - && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - ) { - i++; + // int end_node = cgraph->n_nodes - 1; + // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); + } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + ggml_backend_openvino_reshape(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + } } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + } + } else { + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + } } } } - // } - return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4483241481554..d91338127a131 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -26,7 +26,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]) && ggml_is_contiguous(node)) { + if (ggml_is_contiguous(node->src[0]) + && ggml_is_contiguous(node) + && (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { inputs[src0_name] = node->src[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); @@ -112,22 +114,31 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - int src0_elem_size = ggml_type_size(node->src[0]->type); - int src1_elem_size = ggml_type_size(node->src[1]->type); + // int src0_elem_size = ggml_type_size(node->src[0]->type); + // int src1_elem_size = ggml_type_size(node->src[1]->type); - int src0_logical_rows = node->src[0]->ne[1]; - int src1_logical_rows = node->src[1]->ne[1]; + // int src0_logical_rows = node->src[0]->ne[1]; + // int src1_logical_rows = node->src[1]->ne[1]; - int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; - int src0_phys_rows = src0_logical_rows; + // int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; + // int src0_phys_rows = src0_logical_rows; - int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; - int src1_phys_rows = src1_logical_rows; - ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; - ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; - auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); - auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); + // int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; + // int src1_phys_rows = src1_logical_rows; + // ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; + // ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; + // auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); + // auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); + // m_params.push_back(input0_param); + // m_params.push_back(input1_param); + + ov::Shape input0_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input0_param = std::make_shared(ov::element::f32, input0_shape); m_params.push_back(input0_param); + ov::Shape input1_shape = { 1, 1, static_cast(node->src[1]->nb[2] / node->src[1]->nb[0])}; + auto input1_param = std::make_shared(ov::element::f16, input1_shape); m_params.push_back(input1_param); m_continuous = false; @@ -147,7 +158,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map(node->src[0]->ne[2]), // static_cast(node->src[0]->ne[1]), // static_cast(node->src[0]->ne[0])}; - // auto input_param = std::make_shared(ov::element::f32, input_shape); + // auto type = get_input_type(src0_name); + // auto input_param = std::make_shared(type, input_shape); // m_params.push_back(input_param); // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a0adc917e72cc..b8315a0013489 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -27,12 +27,12 @@ std::vector> get_ggml_graph_input_tensors(std printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif ov::Tensor input_tensor; - auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) { - std::vector input_stride = ggml_decoder->get_input_stride(name); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); + ov::element::Type input_type = ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + if (op_node_name == "CONT" && input_shape[0] == 1 && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1])) { const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; @@ -42,14 +42,14 @@ std::vector> get_ggml_graph_input_tensors(std std::vector input_stride = ggml_decoder->get_input_stride(name); ov::element::Type input_type = ggml_decoder->get_input_type(name); size_t element_size = input_type.size(); - ov::Shape phys_shape; + // ov::Shape phys_shape; static int iter = 0; if (iter++ % 2 == 0) { - phys_shape = {1, input_shape[1], input_stride[2] / element_size}; - input_tensor = ov::Tensor(ov::element::f32, phys_shape, input_data); + // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; + input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); } else { - phys_shape = {1, input_shape[1], input_stride[1] / element_size}; - input_tensor = ov::Tensor(ov::element::f16, phys_shape, input_data); + ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; + input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); @@ -161,6 +161,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto output_tensor = infer_request.get_output_tensor(i); // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " + // << "output_names: " << std::setw(20) << output_names[i] + // << " output data: " << std::setw(15) << ((float*)output_tensor.data())[0] + // << std::setw(15) << ((float*)output_tensor.data())[1] << std::right + // << std::endl; #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif From d183de84783c50d8346e01ae8141f1b40eeecbc3 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 17 Mar 2025 17:00:43 +0800 Subject: [PATCH 043/156] add debug info --- ggml/src/ggml-openvino.cpp | 35 ++++++++++++++++++++++++++------ ggml/src/ggml-openvino/utils.cpp | 12 +++++++++-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index fd24356412218..2c83edaeb590b 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -419,6 +419,11 @@ void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { } } +static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { + // NOP + GGML_UNUSED(dst); +} + // Extracting valid shapes std::vector get_effective_shape(const ggml_tensor * t) { std::vector shape; @@ -850,11 +855,6 @@ static void ggml_backend_openvino_transpose(ggml_tensor *dst) { GGML_UNUSED(dst); } -static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { - // NOP - GGML_UNUSED(dst); -} - void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { const struct ggml_tensor *src0 = dst->src[0]; const struct ggml_tensor *src1 = dst->src[1]; @@ -984,6 +984,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe std::vector cont_indices; std::vector reshape_indices; std::vector view_indices; + std::vector view_indices_prompt; std::vector cpy_indices; std::vector transpose_indices; @@ -997,8 +998,12 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe cont_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_RESHAPE) { reshape_indices.push_back(i); + // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { view_indices.push_back(i); + if (cgraph->nodes[i]->ne[0] == 96) { + view_indices_prompt.push_back(i); + } } else if (cgraph->nodes[i]->op == GGML_OP_CPY) { cpy_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) { @@ -1043,14 +1048,32 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } } else { + // int end_node = cgraph->n_nodes - 1; + // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { + ggml_backend_openvino_add_forward(cgraph->nodes[i]); + } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // ggml_backend_openvino_reshape(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes + && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() ) { i++; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index b8315a0013489..3909afbe2d382 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -161,10 +161,18 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto output_tensor = infer_request.get_output_tensor(i); // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " // << "output_names: " << std::setw(20) << output_names[i] - // << " output data: " << std::setw(15) << ((float*)output_tensor.data())[0] - // << std::setw(15) << ((float*)output_tensor.data())[1] << std::right + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << ((float*)output_tensor.data())[0] + // << std::setw(15) << ((float*)output_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0]] << std::right + // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0] + 1] << std::right + // << std::right // << std::endl; #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); From d5e5a2447b770ddc7d9a965777b4a2432597f1b7 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 26 Mar 2025 16:31:52 +0800 Subject: [PATCH 044/156] Process Prompt and predict first token is OK --- ggml/src/ggml-openvino.cpp | 146 ++++++++++++----- ggml/src/ggml-openvino/ggml-decoder.cpp | 68 +++++--- ggml/src/ggml-openvino/utils.cpp | 208 ++++++++++++++++++++---- 3 files changed, 330 insertions(+), 92 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 2c83edaeb590b..a508aeea40a8d 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -652,6 +652,7 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) { } void ggml_backend_openvino_view(ggml_tensor *dst) { + GGML_UNUSED(dst); } @@ -985,8 +986,11 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe std::vector reshape_indices; std::vector view_indices; std::vector view_indices_prompt; + std::vector view_split; std::vector cpy_indices; + std::vector cpy_split_16; + std::vector cpy_split_19; std::vector transpose_indices; std::vector permute_indices; @@ -1000,12 +1004,23 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe reshape_indices.push_back(i); // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { + // if (cgraph->nodes[i]->src[0]->ne[0] == 98304) + // continue; view_indices.push_back(i); - if (cgraph->nodes[i]->ne[0] == 96) { + if (cgraph->nodes[i]->ne[0] == 32) { view_indices_prompt.push_back(i); } + if (i == 18) { + view_split.push_back(i); + } } else if (cgraph->nodes[i]->op == GGML_OP_CPY) { cpy_indices.push_back(i); + if (i == 16) { + cpy_split_16.push_back(i); + } + if (i == 19) { + cpy_split_19.push_back(i); + } } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) { transpose_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_PERMUTE) { @@ -1023,10 +1038,18 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe bool prompt_process_flag = true; if (cgraph->nodes[0]->ne[1] == 1) { prompt_process_flag = false; - // int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { + ggml_backend_openvino_add_forward(cgraph->nodes[i]); + } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { ggml_backend_openvino_view(cgraph->nodes[i]); } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); @@ -1036,6 +1059,11 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes + && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() + && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() + && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() @@ -1047,41 +1075,85 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } } - } else { // int end_node = cgraph->n_nodes - 1; // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { - ggml_backend_openvino_add_forward(cgraph->nodes[i]); - } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() - && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); - } - } - } + // for (int i = 0; i < cgraph->n_nodes; i++) { + // // if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + // } + // } + // } + } else { + int end_node = cgraph->n_nodes - 1; + openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + // for (int i = 0; i < cgraph->n_nodes; i++) { + // if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { + // ggml_backend_openvino_add_forward(cgraph->nodes[i]); + // // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // // ggml_backend_openvino_transpose(cgraph->nodes[i]); + // // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // // ggml_backend_openvino_permute(cgraph->nodes[i]); + // // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() + // // && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() + // // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + // // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + // // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) + // // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + // // && std::find(view_split.begin(), view_split.end(), i) == view_split.end() + // // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end() + // // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + // } + // } + // } } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d91338127a131..4ec1be7b4d62f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -90,23 +90,43 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name); inputs[src0_name] = node->src[0]; + // inputs[src1_name] = node->src[1]; + // outputs[node_name] = node; + src1_name = std::string(node->src[1]->view_src->name); + inputs[src1_name] = node->src[1]; + node_name = std::string(node->view_src->name); outputs[node_name] = node; m_input_names.push_back(src0_name); + m_input_names.push_back(src1_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); m_continuous = true; - ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), + ov::Shape input1_shape = { static_cast(node->src[0]->ne[2]), static_cast(node->src[0]->ne[1]), static_cast(node->src[0]->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); + auto input1_param = std::make_shared(ov::element::f32, input1_shape); + m_params.push_back(input1_param); + // ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), + // static_cast(node->src[1]->ne[1]), + // static_cast(node->src[1]->ne[0])}; + ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), + static_cast(node->src[1]->ne[1]), + static_cast(node->src[1]->view_src->ne[0])}; + auto input2_param = std::make_shared(ov::element::f16, input2_shape); + m_params.push_back(input2_param); break; } else { std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; + // inputs[src1_name] = node->src[1]; + // outputs[node_name] = node; + src1_name = std::string(node->src[1]->view_src->name); inputs[src1_name] = node->src[1]; + node_name = std::string(node->view_src->name); outputs[node_name] = node; m_input_names.push_back(src0_name); m_input_names.push_back(src1_name); @@ -114,24 +134,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - // int src0_elem_size = ggml_type_size(node->src[0]->type); - // int src1_elem_size = ggml_type_size(node->src[1]->type); - - // int src0_logical_rows = node->src[0]->ne[1]; - // int src1_logical_rows = node->src[1]->ne[1]; - - // int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; - // int src0_phys_rows = src0_logical_rows; - - // int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; - // int src1_phys_rows = src1_logical_rows; - // ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; - // ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; - // auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); - // auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); - // m_params.push_back(input0_param); - // m_params.push_back(input1_param); - ov::Shape input0_shape = { static_cast(node->src[0]->ne[2]), static_cast(node->src[0]->ne[1]), static_cast(node->src[0]->ne[0])}; @@ -150,6 +152,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; + // if (node->ne[0] == 21504 || node->ne[0] == 7 + // || node->ne[0] == 3072 && node->src[0]->ne[0] == 98304 + // || node->ne[0] == 1 && node->src[0]->ne[0] == 98304) { + // // if (node->ne[0] == 21504 || node->ne[0] == 7) { + // node_name = std::string(node->view_src->name); + // outputs[node_name] = node; + // } else { + // outputs[node_name] = node; + // } outputs[node_name] = node; m_input_names.push_back(src0_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); @@ -193,6 +204,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name); inputs[src0_name] = node->src[0]; + // if (node->ne[0] == 32 &&node->src[0]->type == GGML_TYPE_I32) { + // static_cast(inputs[src0_name]->data)[0] = 1; + // } else if (node->ne[0] == 32 && node->src[0]->type == GGML_TYPE_F16) { + // static_cast(inputs[src0_name]->data)[0] = static_cast(1); + // } inputs[src1_name] = node->src[1]; outputs[node_name] = node; m_input_names.push_back(src0_name); @@ -346,13 +362,17 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { } GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) - :m_cgraph(cgraph), - m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + :m_cgraph(cgraph), + m_node(node), + m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { m_inputs.clear(); m_outputs.clear(); m_input_names.clear(); m_output_names.clear(); + m_params.clear(); + m_op_node_name.clear(); + m_decoders.clear(); + // If first init if (m_node) { set_input_output(m_node, m_inputs, m_outputs); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 3909afbe2d382..53fecd3b23b6e 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -3,6 +3,7 @@ #include "ggml-backend-impl.h" #include #include +#include using ov::frontend::ggml::GgmlDecoder; @@ -32,32 +33,70 @@ std::vector> get_ggml_graph_input_tensors(std ov::element::Type input_type = ggml_decoder->get_input_type(name); size_t element_size = input_type.size(); std::vector input_stride = ggml_decoder->get_input_stride(name); - if (op_node_name == "CONT" && input_shape[0] == 1 && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1])) { + if (op_node_name == "CONT" && input_shape[0] == 1 // Except for the kqv_merge node + && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1]) + ) { const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } + // if (!flag) { + // std::cout << "CONT input shape: " << input_shape << std::endl; + // } input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous - std::vector input_stride = ggml_decoder->get_input_stride(name); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); - // ov::Shape phys_shape; - static int iter = 0; - if (iter++ % 2 == 0) { - // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; - input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); - } else { - ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; - input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); - } + // } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous + // } else if (op_node_name == "CPY") { + // std::vector input_stride = ggml_decoder->get_input_stride(name); + // ov::element::Type input_type = ggml_decoder->get_input_type(name); + // size_t element_size = input_type.size(); + // // ov::Shape phys_shape; + // static int iter = 0; + // if (iter++ % 2 == 0) { + // // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; + // input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); + // } else { + // ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; + // input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); + // } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + // if(!flag) { + // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " + // << "Input Name: " << std::setw(20) << name + // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) + // << "OP: " << std::setw(10) << op_node_name + // << "CONT: " << check_if_contiguous + // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor.data() << " " + // << std::setw(15) << ((float*)input_tensor.data())[0] + // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + // } + // if (op_node_name == "MUL_MAT") { + // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " + // << "Input MUL_MAT name: " << std::setw(20) << name + // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor.data() << " " + // << std::setw(15) << ((float*)input_tensor.data())[0] + // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + // } } // input_tensors[name] = input_tensor; input_tensors.emplace_back(name, input_tensor); } + // std::cout << "input_names.size(): " << input_names.size() << std::endl; return input_tensors; } @@ -117,7 +156,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); - + + // auto cloned_model = model->clone(); + // std::string model_dir = "/home/user/zhan/merge_git_commits/llama.cpp-ov"; + // auto path_base = model_dir + "/" + cloned_model->get_name(); + // // ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); + // ov::serialize(cloned_model, path_base + ".xml", path_base + ".bin"); + if (!model) { GGML_LOG_ERROR("Model is not converted \n"); } else { @@ -126,9 +171,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } + // model = core.read_model("/home/user/zhan/merge_git_commits/llama.cpp-ov/replaceWithInputLayer_000_model.xml"); // Loading a model to the device + // std::cout << "Compile ..." << std::endl; ov::CompiledModel compiled_model = core.compile_model(model); // ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); + // std::ofstream output_file("/home/user/zhan/merge_git_commits/llama.cpp-ov/000_compile_model.xml"); + // compiled_model.export_model(output_file); + // output_file.close(); // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); @@ -151,34 +201,130 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // std::cout << std::endl; } + // std::cout << "Infer ..." << std::endl; infer_request.infer(); // Set dst data for outputs auto output_names = ggml_decoder->get_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { - // std::string op_name = ggml_decoder->get_node_op_name(output_names[i]); auto output_tensor = infer_request.get_output_tensor(i); - // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); - auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); - // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " - // << "output_names: " << std::setw(20) << output_names[i] - // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] - // << ", address: " - // << std::setw(15) << tensor->data << " " - // << std::setw(15) << ((float*)output_tensor.data())[0] - // << std::setw(15) << ((float*)output_tensor.data())[1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0]] << std::right - // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0] + 1] << std::right - // << std::right - // << std::endl; + // if(!flag) { + // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); + // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " + // << "output_names: " << std::setw(20) << output_names[i] + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << static_cast(((float*)output_tensor.data())[0]) + // << std::setw(15) << static_cast(((float*)output_tensor.data())[1]) + // << ", ne[0]: " + // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0]]) << std::right + // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] + 1]) << std::right + // << std::right + // << std::endl; + // if (i == 19) { + // auto output_tensor_18 = infer_request.get_output_tensor(18); + // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[18]); + // std::cout << std::left << " " << std::setw(2) << 18 << " : " + // << "output_names: " << std::setw(20) << output_names[18] + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[0]) + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[1]) + // << ", ne[0]: " + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0]]) << std::right + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] + 1]) << std::right + // << std::right + // << std::endl; + // } + // if(i == 23) { + // auto output_tensor_15 = infer_request.get_output_tensor(15); + // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[15]); + // std::cout << std::left << " " << std::setw(2) << 15 << " : " + // << "output_names: " << std::setw(20) << output_names[15] + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[0]) + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[1]) + // << ", ne[0]: " + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0]]) << std::right + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] + 1]) << std::right + // << std::right + // << std::endl; + // auto cache_k_l0_20 = ggml_decoder->get_input_names()[20]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor = input_tensors.at(20).second; + // std::cout << std::left << " " << std::setw(2) << 20 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_20 + // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor.data() << " " + // << std::setw(15) << ((float*)input_tensor.data())[0] + // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + + // auto cache_k_l0_27 = ggml_decoder->get_input_names()[27]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor_27 = input_tensors.at(27).second; + // std::cout << std::left << " " << std::setw(2) << 27 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_27 + // << ", shape: " << std::setw(4) << input_tensor_27.get_shape()[0] << " " << std::setw(4) << input_tensor_27.get_shape()[1] << " " << input_tensor_27.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor_27.data() << " " + // << std::setw(15) << ((float*)input_tensor_27.data())[0] + // << std::setw(15) << ((float*)input_tensor_27.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + + // auto cache_k_l0_29 = ggml_decoder->get_input_names()[29]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor_29 = input_tensors.at(29).second; + // std::cout << std::left << " " << std::setw(2) << 29 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_29 + // << ", shape: " << std::setw(4) << input_tensor_29.get_shape()[0] << " " << std::setw(4) << input_tensor_29.get_shape()[1] << " " << input_tensor_29.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor_29.data() << " " + // << std::setw(15) << ((float*)input_tensor_29.data())[0] + // << std::setw(15) << ((float*)input_tensor_29.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + + // auto cache_k_l0_30 = ggml_decoder->get_input_names()[30]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor_30 = input_tensors.at(30).second; + // std::cout << std::left << " " << std::setw(2) << 30 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_30 + // << ", shape: " << std::setw(4) << input_tensor_30.get_shape()[0] << " " << std::setw(4) << input_tensor_30.get_shape()[1] << " " << input_tensor_30.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor_30.data() << " " + // << std::setw(15) << ((float*)input_tensor_30.data())[0] + // << std::setw(15) << ((float*)input_tensor_30.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + // } + // } #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif } - + return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } From 9a2d72352b0055756bf0c881c7d4b7673cf03345 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 31 Mar 2025 10:41:04 +0800 Subject: [PATCH 045/156] =?UTF-8?q?1.=20Solve=20the=20AC=20issue=20of=20Pe?= =?UTF-8?q?rmute+VIEW=20and=20MULMAL=20issue=20in=20the=20phase=20of=20?= =?UTF-8?q?=E2=80=9C1.=20Process=20Prompt=20and=20predict=20the=20first=20?= =?UTF-8?q?token=E2=80=9D.=202.=20There=20is=20still=20an=20AC=20issue=20i?= =?UTF-8?q?n=20the=20"2.=20Predict=20the=20subsequent=20tokens=20phase"=20?= =?UTF-8?q?and=20it=20is=20being=20debugged.=20=20=20=20A=20deviation=20ha?= =?UTF-8?q?s=20been=20detected=20in=20the=20computation=20of=20OpenVINO's?= =?UTF-8?q?=20CPY=20Node=20at=20stage=202,=20and=20it=20is=20currently=20b?= =?UTF-8?q?eing=20fixed.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ggml/src/ggml-openvino.cpp | 140 +++++++----------------- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 ++ ggml/src/ggml-openvino/utils.cpp | 43 ++++---- 3 files changed, 70 insertions(+), 120 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index a508aeea40a8d..2279df1d6d3c0 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -823,34 +823,34 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { } static void ggml_backend_openvino_transpose(ggml_tensor *dst) { - ov::Core core; - ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; - ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); + // ov::Core core; + // ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; + // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; + // auto input_param = std::make_shared(ov::element::f32, input_shape); - //auto res = std::make_shared(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); + // //auto res = std::make_shared(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, - ov::Shape{output_shape.size()}, - std::vector(output_shape.begin(), output_shape.end())); - auto res = std::make_shared(input_param, new_shape_node, false); + // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, + // ov::Shape{output_shape.size()}, + // std::vector(output_shape.begin(), output_shape.end())); + // auto res = std::make_shared(input_param, new_shape_node, false); - std::shared_ptr model = std::make_shared(ov::OutputVector{res}, - ov::ParameterVector{input_param}); - auto compiled_model = core.compile_model(model, "CPU"); - ov::InferRequest infer_request = compiled_model.create_infer_request(); + // std::shared_ptr model = std::make_shared(ov::OutputVector{res}, + // ov::ParameterVector{input_param}); + // auto compiled_model = core.compile_model(model, "CPU"); + // ov::InferRequest infer_request = compiled_model.create_infer_request(); - ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); - ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); - infer_request.set_input_tensor(0, input_tensor); - infer_request.set_output_tensor(0, output_tensor); + // ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); + // ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); + // infer_request.set_input_tensor(0, input_tensor); + // infer_request.set_output_tensor(0, output_tensor); - infer_request.infer(); + // infer_request.infer(); // NOP GGML_UNUSED(dst); @@ -1004,7 +1004,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe reshape_indices.push_back(i); // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { - // if (cgraph->nodes[i]->src[0]->ne[0] == 98304) + // if (cgraph->nodes[i]->src[0]->ne[0] == 98304 && (cgraph->nodes[i]->ne[0] == 3072 || cgraph->nodes[i]->ne[0] == 1)) // continue; view_indices.push_back(i); if (cgraph->nodes[i]->ne[0] == 32) { @@ -1045,16 +1045,25 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe ggml_backend_openvino_transpose(cgraph->nodes[i]); } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // ggml_backend_openvino_permute(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { ggml_backend_openvino_view(cgraph->nodes[i]); } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { ggml_backend_openvino_reshape(cgraph->nodes[i]); + + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); + // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); + // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; @@ -1062,11 +1071,16 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) + // && std::find(view_split.begin(), view_split.end(), i) == view_split.end() + // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end() + // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end() ) { i++; } @@ -1075,85 +1089,9 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } } - // int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - // for (int i = 0; i < cgraph->n_nodes; i++) { - // // if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else { - // // Process a range of nodes with openvino_frontend_compute - // int start_index = i; - // while (i < cgraph->n_nodes - // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - // ) { - // i++; - // } - // if (start_index < i) { - // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); - // } - // } - // } } else { int end_node = cgraph->n_nodes - 1; openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - // for (int i = 0; i < cgraph->n_nodes; i++) { - // if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { - // ggml_backend_openvino_add_forward(cgraph->nodes[i]); - // // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // // ggml_backend_openvino_transpose(cgraph->nodes[i]); - // // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // // ggml_backend_openvino_permute(cgraph->nodes[i]); - // // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) { - // // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) { - // // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // } else { - // // Process a range of nodes with openvino_frontend_compute - // int start_index = i; - // while (i < cgraph->n_nodes - // && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() - // // && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() - // // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - // // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) - // // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - // // && std::find(view_split.begin(), view_split.end(), i) == view_split.end() - // // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end() - // // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end() - // ) { - // i++; - // } - // if (start_index < i) { - // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); - // } - // } - // } } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4ec1be7b4d62f..ec827e80069ec 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -161,6 +161,13 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapne[0] == 3072 && node->ne[1] == 1 && node->ne[2] == 1) { + // outputs[src0_name] = node; + // m_output_names.push_back(src0_name); + // } else { + // outputs[node_name] = node; + // m_output_names.push_back(node_name); + // } outputs[node_name] = node; m_input_names.push_back(src0_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 53fecd3b23b6e..642f2b666233a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -44,24 +44,8 @@ std::vector> get_ggml_graph_input_tensors(std // std::cout << "CONT input shape: " << input_shape << std::endl; // } input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - // } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous - // } else if (op_node_name == "CPY") { - // std::vector input_stride = ggml_decoder->get_input_stride(name); - // ov::element::Type input_type = ggml_decoder->get_input_type(name); - // size_t element_size = input_type.size(); - // // ov::Shape phys_shape; - // static int iter = 0; - // if (iter++ % 2 == 0) { - // // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; - // input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); - // } else { - // ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; - // input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); - // } - } else { - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); // if(!flag) { - // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " + // std::cout << std::left << "*[" << std::setw(2) << inp << "]*: " // << "Input Name: " << std::setw(20) << name // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) // << "OP: " << std::setw(10) << op_node_name @@ -77,14 +61,21 @@ std::vector> get_ggml_graph_input_tensors(std // << std::right // << std::endl; // } - // if (op_node_name == "MUL_MAT") { + } else { + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + // if(!flag) { // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " - // << "Input MUL_MAT name: " << std::setw(20) << name + // << "Input Name: " << std::setw(20) << name + // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) + // << "OP: " << std::setw(10) << op_node_name + // << "CONT: " << check_if_contiguous // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] // << ", address: " // << std::setw(15) << input_tensor.data() << " " // << std::setw(15) << ((float*)input_tensor.data())[0] // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]-1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right @@ -219,6 +210,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << tensor->data << " " // << std::setw(15) << static_cast(((float*)output_tensor.data())[0]) // << std::setw(15) << static_cast(((float*)output_tensor.data())[1]) + // << ", ne[0]-1: " + // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] - 1]) // << ", ne[0]: " // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0]]) << std::right // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] + 1]) << std::right @@ -234,6 +227,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << tensor->data << " " // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[0]) // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[1]) + // << ", ne[0]-1: " + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] - 1]) // << ", ne[0]: " // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0]]) << std::right // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] + 1]) << std::right @@ -250,6 +245,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << tensor->data << " " // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[0]) // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[1]) + // << ", ne[0]-1: " + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] - 1]) // << ", ne[0]: " // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0]]) << std::right // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] + 1]) << std::right @@ -265,6 +262,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << input_tensor.data() << " " // << std::setw(15) << ((float*)input_tensor.data())[0] // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] - 1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right @@ -281,6 +280,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << input_tensor_27.data() << " " // << std::setw(15) << ((float*)input_tensor_27.data())[0] // << std::setw(15) << ((float*)input_tensor_27.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] - 1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0]] << std::right // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] + 1] << std::right @@ -297,6 +298,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << input_tensor_29.data() << " " // << std::setw(15) << ((float*)input_tensor_29.data())[0] // << std::setw(15) << ((float*)input_tensor_29.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] - 1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0]] << std::right // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] + 1] << std::right @@ -313,6 +316,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << input_tensor_30.data() << " " // << std::setw(15) << ((float*)input_tensor_30.data())[0] // << std::setw(15) << ((float*)input_tensor_30.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] - 1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0]] << std::right // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] + 1] << std::right From 2d3e49d8aa801d02d5cc6b69ff7573e7337c13c0 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 31 Mar 2025 20:09:40 +0800 Subject: [PATCH 046/156] 1. Delete some comments 2. Process Prompt and predict first token is OK --- ggml/src/ggml-openvino.cpp | 20 --- ggml/src/ggml-openvino/ggml-decoder.cpp | 46 ------ ggml/src/ggml-openvino/utils.cpp | 190 ------------------------ 3 files changed, 256 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 2279df1d6d3c0..b9f1b89722607 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1045,25 +1045,12 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe ggml_backend_openvino_transpose(cgraph->nodes[i]); } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { ggml_backend_openvino_cpy(cgraph->nodes[i]); - // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // ggml_backend_openvino_permute(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { ggml_backend_openvino_view(cgraph->nodes[i]); } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { ggml_backend_openvino_reshape(cgraph->nodes[i]); - - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; @@ -1071,16 +1058,9 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) - // && std::find(view_split.begin(), view_split.end(), i) == view_split.end() - // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end() - // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end() ) { i++; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index ec827e80069ec..3b396c05f71f1 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -92,8 +92,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name); inputs[src0_name] = node->src[0]; - // inputs[src1_name] = node->src[1]; - // outputs[node_name] = node; src1_name = std::string(node->src[1]->view_src->name); inputs[src1_name] = node->src[1]; node_name = std::string(node->view_src->name); @@ -110,9 +108,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map(node->src[0]->ne[0])}; auto input1_param = std::make_shared(ov::element::f32, input1_shape); m_params.push_back(input1_param); - // ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), - // static_cast(node->src[1]->ne[1]), - // static_cast(node->src[1]->ne[0])}; ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), static_cast(node->src[1]->ne[1]), static_cast(node->src[1]->view_src->ne[0])}; @@ -122,8 +117,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name); inputs[src0_name] = node->src[0]; - // inputs[src1_name] = node->src[1]; - // outputs[node_name] = node; src1_name = std::string(node->src[1]->view_src->name); inputs[src1_name] = node->src[1]; node_name = std::string(node->view_src->name); @@ -152,44 +145,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; - // if (node->ne[0] == 21504 || node->ne[0] == 7 - // || node->ne[0] == 3072 && node->src[0]->ne[0] == 98304 - // || node->ne[0] == 1 && node->src[0]->ne[0] == 98304) { - // // if (node->ne[0] == 21504 || node->ne[0] == 7) { - // node_name = std::string(node->view_src->name); - // outputs[node_name] = node; - // } else { - // outputs[node_name] = node; - // } - // if (node->ne[0] == 3072 && node->ne[1] == 1 && node->ne[2] == 1) { - // outputs[src0_name] = node; - // m_output_names.push_back(src0_name); - // } else { - // outputs[node_name] = node; - // m_output_names.push_back(node_name); - // } outputs[node_name] = node; m_input_names.push_back(src0_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); - - // ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), - // static_cast(node->src[0]->ne[1]), - // static_cast(node->src[0]->ne[0])}; - // auto type = get_input_type(src0_name); - // auto input_param = std::make_shared(type, input_shape); - // m_params.push_back(input_param); - - // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) { - // m_continuous = false; - // } else { - // m_continuous = true; - - // } - // m_continuous = false; - - // [TODO]: multiple cases - break; } // SCALE @@ -211,11 +170,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name); inputs[src0_name] = node->src[0]; - // if (node->ne[0] == 32 &&node->src[0]->type == GGML_TYPE_I32) { - // static_cast(inputs[src0_name]->data)[0] = 1; - // } else if (node->ne[0] == 32 && node->src[0]->type == GGML_TYPE_F16) { - // static_cast(inputs[src0_name]->data)[0] = static_cast(1); - // } inputs[src1_name] = node->src[1]; outputs[node_name] = node; m_input_names.push_back(src0_name); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 642f2b666233a..736c7f690b974 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -11,12 +11,9 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, con return std::make_shared(nullptr, cgraph, start_index, end_index); } -// std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder, bool flag) { - // std::map input_tensors; std::vector> input_tensors; auto input_names = ggml_decoder->get_input_names(); - // auto node_name = ggml_decoder->get_op_name(); size_t op_iter = 0; for (size_t inp = 0; inp < input_names.size(); ++inp) { auto name = input_names[inp]; @@ -40,48 +37,9 @@ std::vector> get_ggml_graph_input_tensors(std const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } - // if (!flag) { - // std::cout << "CONT input shape: " << input_shape << std::endl; - // } input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - // if(!flag) { - // std::cout << std::left << "*[" << std::setw(2) << inp << "]*: " - // << "Input Name: " << std::setw(20) << name - // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) - // << "OP: " << std::setw(10) << op_node_name - // << "CONT: " << check_if_contiguous - // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor.data() << " " - // << std::setw(15) << ((float*)input_tensor.data())[0] - // << std::setw(15) << ((float*)input_tensor.data())[1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - // } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); - // if(!flag) { - // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " - // << "Input Name: " << std::setw(20) << name - // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) - // << "OP: " << std::setw(10) << op_node_name - // << "CONT: " << check_if_contiguous - // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor.data() << " " - // << std::setw(15) << ((float*)input_tensor.data())[0] - // << std::setw(15) << ((float*)input_tensor.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]-1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - // } } // input_tensors[name] = input_tensor; @@ -146,13 +104,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); - // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); - - // auto cloned_model = model->clone(); - // std::string model_dir = "/home/user/zhan/merge_git_commits/llama.cpp-ov"; - // auto path_base = model_dir + "/" + cloned_model->get_name(); - // // ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); - // ov::serialize(cloned_model, path_base + ".xml", path_base + ".bin"); if (!model) { GGML_LOG_ERROR("Model is not converted \n"); @@ -162,14 +113,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } - // model = core.read_model("/home/user/zhan/merge_git_commits/llama.cpp-ov/replaceWithInputLayer_000_model.xml"); - // Loading a model to the device - // std::cout << "Compile ..." << std::endl; ov::CompiledModel compiled_model = core.compile_model(model); - // ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); - // std::ofstream output_file("/home/user/zhan/merge_git_commits/llama.cpp-ov/000_compile_model.xml"); - // compiled_model.export_model(output_file); - // output_file.close(); // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); @@ -180,19 +124,9 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { - // infer_request.set_input_tensor(i, input_tensors[input_names[i]]); infer_request.set_input_tensor(i, input_tensors.at(i).second); - - // auto input_tensor = infer_request.get_input_tensor(i); - // auto input_shape = input_tensor.get_shape(); - // std::cout << "Input tensor " << i << " shape: "; - // for (const auto& dim : input_shape) { - // std::cout << dim << " "; - // } - // std::cout << std::endl; } - // std::cout << "Infer ..." << std::endl; infer_request.infer(); // Set dst data for outputs @@ -201,130 +135,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c for (size_t i = 0; i < output_names.size(); i++) { auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); - // if(!flag) { - // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); - // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " - // << "output_names: " << std::setw(20) << output_names[i] - // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] - // << ", address: " - // << std::setw(15) << tensor->data << " " - // << std::setw(15) << static_cast(((float*)output_tensor.data())[0]) - // << std::setw(15) << static_cast(((float*)output_tensor.data())[1]) - // << ", ne[0]-1: " - // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] - 1]) - // << ", ne[0]: " - // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0]]) << std::right - // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] + 1]) << std::right - // << std::right - // << std::endl; - // if (i == 19) { - // auto output_tensor_18 = infer_request.get_output_tensor(18); - // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[18]); - // std::cout << std::left << " " << std::setw(2) << 18 << " : " - // << "output_names: " << std::setw(20) << output_names[18] - // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] - // << ", address: " - // << std::setw(15) << tensor->data << " " - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[0]) - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[1]) - // << ", ne[0]-1: " - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] - 1]) - // << ", ne[0]: " - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0]]) << std::right - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] + 1]) << std::right - // << std::right - // << std::endl; - // } - // if(i == 23) { - // auto output_tensor_15 = infer_request.get_output_tensor(15); - // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[15]); - // std::cout << std::left << " " << std::setw(2) << 15 << " : " - // << "output_names: " << std::setw(20) << output_names[15] - // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] - // << ", address: " - // << std::setw(15) << tensor->data << " " - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[0]) - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[1]) - // << ", ne[0]-1: " - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] - 1]) - // << ", ne[0]: " - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0]]) << std::right - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] + 1]) << std::right - // << std::right - // << std::endl; - // auto cache_k_l0_20 = ggml_decoder->get_input_names()[20]; - // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; - // auto input_tensor = input_tensors.at(20).second; - // std::cout << std::left << " " << std::setw(2) << 20 << " : " - // << "Input Name: " << std::setw(20) << cache_k_l0_20 - // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor.data() << " " - // << std::setw(15) << ((float*)input_tensor.data())[0] - // << std::setw(15) << ((float*)input_tensor.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] - 1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - - // auto cache_k_l0_27 = ggml_decoder->get_input_names()[27]; - // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; - // auto input_tensor_27 = input_tensors.at(27).second; - // std::cout << std::left << " " << std::setw(2) << 27 << " : " - // << "Input Name: " << std::setw(20) << cache_k_l0_27 - // << ", shape: " << std::setw(4) << input_tensor_27.get_shape()[0] << " " << std::setw(4) << input_tensor_27.get_shape()[1] << " " << input_tensor_27.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor_27.data() << " " - // << std::setw(15) << ((float*)input_tensor_27.data())[0] - // << std::setw(15) << ((float*)input_tensor_27.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] - 1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - - // auto cache_k_l0_29 = ggml_decoder->get_input_names()[29]; - // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; - // auto input_tensor_29 = input_tensors.at(29).second; - // std::cout << std::left << " " << std::setw(2) << 29 << " : " - // << "Input Name: " << std::setw(20) << cache_k_l0_29 - // << ", shape: " << std::setw(4) << input_tensor_29.get_shape()[0] << " " << std::setw(4) << input_tensor_29.get_shape()[1] << " " << input_tensor_29.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor_29.data() << " " - // << std::setw(15) << ((float*)input_tensor_29.data())[0] - // << std::setw(15) << ((float*)input_tensor_29.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] - 1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - - // auto cache_k_l0_30 = ggml_decoder->get_input_names()[30]; - // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; - // auto input_tensor_30 = input_tensors.at(30).second; - // std::cout << std::left << " " << std::setw(2) << 30 << " : " - // << "Input Name: " << std::setw(20) << cache_k_l0_30 - // << ", shape: " << std::setw(4) << input_tensor_30.get_shape()[0] << " " << std::setw(4) << input_tensor_30.get_shape()[1] << " " << input_tensor_30.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor_30.data() << " " - // << std::setw(15) << ((float*)input_tensor_30.data())[0] - // << std::setw(15) << ((float*)input_tensor_30.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] - 1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - // } - // } #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif From 69a3aa94868210aa666a632cef7688a32bb0f02c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 14 Apr 2025 18:04:03 +0800 Subject: [PATCH 047/156] * Use find_package in CMake to configure OpenVINO * Remove OPENVINO_OP_DEBUG * Simplify set_input_output in decoder * Fix CPY in set_input_output * Use params from converted ov model in setting input --- ggml/src/ggml-openvino.cpp | 28 ++- ggml/src/ggml-openvino/ggml-decoder.cpp | 274 +++++------------------- ggml/src/ggml-openvino/utils.cpp | 55 +++-- 3 files changed, 114 insertions(+), 243 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index b9f1b89722607..762ed786a94e9 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml-openvino.h" #include "ggml-openvino/utils.h" +#include "ggml.h" #include #include @@ -1367,7 +1368,7 @@ static const std::set& openvino_ops = []() -> const std::set& openvino_ops = []() -> const std::setop); - if (it == op_mapping.end()) { - return false; + static const std::map> op_mapping_unary = { + {GGML_UNARY_OP_SILU, {"Sigmoid", "Multiply"}}, + }; + + std::vector mapped_ops; + if (op->op == GGML_OP_UNARY) { + auto it = op_mapping_unary.find(ggml_get_unary_op(op)); + if (it == op_mapping_unary.end()) { + return false; + } + mapped_ops = it->second; + } else { + auto it = op_mapping.find(op->op); + if (it == op_mapping.end()) { + return false; + } + mapped_ops = it->second; } - for (const std::string& op_name : it->second) { + for (const std::string& op_name : mapped_ops) { if (openvino_ops.count(op_name) == 0) { return false; } } return true; -#endif } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3b396c05f71f1..d7895c3d7f93a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -6,222 +6,66 @@ #include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { - std::string src0_name = std::string(node->src[0]->name); - std::string node_name = std::string(node->name); + std::string node_name; + if (node->op == GGML_OP_CPY) { + // CPY updates the input tensor in place. For later ov op that uses the + // input tensor of CPY, we need to make sure they get the updated tensor + // by putting the src tensor name in the tensor_map in + // /src/frontends/ggml/src/translate_session.cpp + node_name = std::string(node->view_src->name); + } else { + node_name = std::string(node->name); + } - switch (node->op) { - // Unary OPs - case GGML_OP_UNARY: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - case GGML_OP_RMS_NORM: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; - } - case GGML_OP_CONT: - { - if (ggml_is_contiguous(node->src[0]) - && ggml_is_contiguous(node) - && (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - - m_continuous = true; - break; - } + std::string src0_name = std::string(node->src[0]->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); + if (node->op == GGML_OP_CPY && node->view_src) { + m_output_names.push_back(node->view_src->name); + } else { + m_output_names.push_back(node_name); + } - if (node->src[0]->type == node->type && node->src[0]->ne[0] == node->ne[0] && - node->src[0]->nb[0] == ggml_type_size(node->src[0]->type) && - node->nb[0] == ggml_type_size(node->src[0]->type)) { - - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - const size_t element_size = ggml_type_size(node->src[0]->type); - size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 - size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 - size_t dim2 = static_cast(node->src[0]->ne[2]); // 1 - size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 - // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 - size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 - ov::Shape input_shape = { dim2, num_rows, phys_stride }; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - - m_continuous = false; - break; - } + if (node->src[1]) { + std::string src1_name = std::string(node->src[1]->name); + inputs[src1_name] = node->src[1]; + m_input_names.push_back(src1_name); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); + } + if (node->src[2]) { + std::string src2_name = std::string(node->src[2]->name); + inputs[src2_name] = node->src[2]; + m_input_names.push_back(src2_name); + m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); + } - if (ggml_is_contiguous(node)) { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - - m_continuous = false; - break; - } - } - case GGML_OP_CPY: - { - if (ggml_is_contiguous(node)) { - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - src1_name = std::string(node->src[1]->view_src->name); - inputs[src1_name] = node->src[1]; - node_name = std::string(node->view_src->name); - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - m_continuous = true; - - ov::Shape input1_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input1_param = std::make_shared(ov::element::f32, input1_shape); - m_params.push_back(input1_param); - ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), - static_cast(node->src[1]->ne[1]), - static_cast(node->src[1]->view_src->ne[0])}; - auto input2_param = std::make_shared(ov::element::f16, input2_shape); - m_params.push_back(input2_param); - break; - } else { - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - src1_name = std::string(node->src[1]->view_src->name); - inputs[src1_name] = node->src[1]; - node_name = std::string(node->view_src->name); - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - ov::Shape input0_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input0_param = std::make_shared(ov::element::f32, input0_shape); - m_params.push_back(input0_param); - ov::Shape input1_shape = { 1, 1, static_cast(node->src[1]->nb[2] / node->src[1]->nb[0])}; - auto input1_param = std::make_shared(ov::element::f16, input1_shape); - m_params.push_back(input1_param); - - m_continuous = false; - - break; - } - } - // For view, input is node itself - case GGML_OP_VIEW: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; - } - // SCALE - case GGML_OP_SCALE: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; + switch (node->op) { + case GGML_OP_CONT: { + if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node) && + (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { + m_continuous = true; + } else { + m_continuous = false; } - case GGML_OP_MUL_MAT: - { - if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { - m_continuous = false; - } else { - m_continuous = true; - } - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - inputs[src1_name] = node->src[1]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; + break; + } + case GGML_OP_CPY: { + m_continuous = ggml_is_contiguous(node); + break; + } + case GGML_OP_MUL_MAT: { + if (!ggml_is_contiguous(node->src[1]) || + node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { + m_continuous = false; + } else { + m_continuous = true; } - // OPs with 2 inputs - case GGML_OP_ADD: - case GGML_OP_DIV: - case GGML_OP_MUL: - case GGML_OP_SUB: - case GGML_OP_GET_ROWS: - case GGML_OP_SOFT_MAX: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name); - inputs[src1_name] = node->src[1]; - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_input_names.push_back(src1_name); - } - break; - } - // OPs with 3 inputs: - case GGML_OP_ROPE: - { - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - inputs[src1_name] = node->src[1]; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - outputs[node_name] = node; - m_output_names.push_back(node_name); - if (node->src[2]) { - std::string src2_name = std::string(node->src[2]->name); - inputs[src2_name] = node->src[2]; - m_input_names.push_back(src2_name); - m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); - } - break; - } - default: - break; + break; + } + default: + break; } } @@ -334,7 +178,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr m_op_node_name.clear(); m_decoders.clear(); - // If first init if (m_node) { set_input_output(m_node, m_inputs, m_outputs); } else { @@ -353,7 +196,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { ov::PartialShape input_shape; - // Use input_node->ne + // Use input_node->ne ggml_tensor * node = m_inputs.at(name); std::vector shape; @@ -440,7 +283,6 @@ const std::vector>& GgmlOvDecoder::get_pa ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { ov::PartialShape output_shape; - // Use input_node->ne ggml_tensor * node = m_outputs.at(name); std::vector shape; @@ -552,10 +394,10 @@ const std::string& GgmlOvDecoder::get_op_type() const { auto unary_it = unaryOpTypeMap.find(ggml_get_unary_op(m_node)); if (unary_it != unaryOpTypeMap.end()) { return unary_it->second; - } + } } return it->second; - } + } static const std::string unknown_op = "UNKNOWN_OP"; return unknown_op; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 736c7f690b974..f4d9c7705ab22 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,9 +1,11 @@ #include "utils.h" -#include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include +#include +#include #include #include -#include using ov::frontend::ggml::GgmlDecoder; @@ -20,27 +22,14 @@ std::vector> get_ggml_graph_input_tensors(std std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); // auto node_op_name = ggml_decoder->get_node_op_name(name); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - auto check_if_contiguous = ggml_is_contiguous(ggml_decoder->get_input_ggml_tensor(name)); #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif ov::Tensor input_tensor; ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); std::vector input_stride = ggml_decoder->get_input_stride(name); - if (op_node_name == "CONT" && input_shape[0] == 1 // Except for the kqv_merge node - && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1]) - ) { - const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); - const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); - size_t phys_stride = static_cast(input_stride[1]) / element_size; - ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - } else { - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); - } + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); // input_tensors[name] = input_tensor; input_tensors.emplace_back(name, input_tensor); @@ -49,6 +38,18 @@ std::vector> get_ggml_graph_input_tensors(std return input_tensors; } +ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) { + auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Subgraph input %s: %g\n", name.c_str(), *(double*)(input_data)); + #endif + ov::Tensor input_tensor; + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + return input_tensor; +} + std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { std::map output_tensors; auto output_names = ggml_decoder->get_output_names(); @@ -79,7 +80,7 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index, bool flag) { static ov::Core core; // auto devices = core.get_available_devices(); - // Get GGML Frontend + // Get GGML Frontend static auto front_end = get_ggml_frontend(); if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); @@ -102,9 +103,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } - // Convert InputModel -> ov::Model + // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); + if (getenv("OPENVINO_DUMP_GRAPH")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), + "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } + if (!model) { GGML_LOG_ERROR("Model is not converted \n"); } else { @@ -122,10 +131,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto input_names = ggml_decoder->get_input_names(); auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder, flag); - // Set input tensor - for (size_t i = 0; i < input_names.size(); i++) { - infer_request.set_input_tensor(i, input_tensors.at(i).second); + auto ov_params = model->get_parameters(); + for (size_t i = 0; i < ov_params.size(); i++) { + auto param_name = ov_params[i]->get_friendly_name(); + infer_request.set_input_tensor(i, get_ggml_graph_input_tensor(ggml_decoder, param_name)); } + // for (size_t i = 0; i < input_names.size(); i++) { + // infer_request.set_input_tensor(i, input_tensors.at(i).second); + // } infer_request.infer(); From 382938538b7b5a6dbeafe28d516a4296729c8efc Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 15 Apr 2025 14:34:00 +0800 Subject: [PATCH 048/156] change op mappings to list in openvino_supports_op --- ggml/src/ggml-openvino.cpp | 96 +++----------------------------- ggml/src/ggml-openvino/utils.cpp | 21 +++---- ggml/src/ggml-openvino/utils.h | 2 +- 3 files changed, 17 insertions(+), 102 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 762ed786a94e9..5ea2351e06cb1 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1036,9 +1036,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process nodes in order - bool prompt_process_flag = true; if (cgraph->nodes[0]->ne[1] == 1) { - prompt_process_flag = false; for (int i = 0; i < cgraph->n_nodes; i++) { if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { ggml_backend_openvino_add_forward(cgraph->nodes[i]); @@ -1066,13 +1064,13 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe i++; } if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + openvino_frontend_compute(backend, cgraph, start_index, --i); } } } } else { int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + openvino_frontend_compute(backend, cgraph, 0, end_node); } return GGML_STATUS_SUCCESS; @@ -1331,91 +1329,11 @@ static const std::set& openvino_ops = []() -> const std::set> op_mapping = { - {GGML_OP_ACC, {"Add"}}, - {GGML_OP_ADD, {"Add"}}, - {GGML_OP_ADD1, {"Add"}}, - {GGML_OP_ADD_REL_POS, {"Add", "MatMul", "Reshape"}}, - {GGML_OP_ARANGE, {"Range"}}, - {GGML_OP_ARGMAX, {"TopK"}}, - {GGML_OP_ARGSORT, {"TopK"}}, - {GGML_OP_CLAMP, {"Clamp"}}, - {GGML_OP_CONCAT, {"Concat"}}, - {GGML_OP_CONV_TRANSPOSE_1D, {"ConvolutionBackpropData"}}, - {GGML_OP_CONV_TRANSPOSE_2D, {"ConvolutionBackpropData"}}, - {GGML_OP_COS, {"Cos"}}, - {GGML_OP_CROSS_ENTROPY_LOSS, {"Softmax", "Log", "Multiply", "ReduceSum", "Negative"}}, - {GGML_OP_DIAG, {"Eye", "Multiply"}}, - {GGML_OP_DIAG_MASK_INF, {"Eye", "Multiply", "Select", "Broadcast"}}, - {GGML_OP_DIAG_MASK_ZERO, {"Eye", "Multiply", "Select", "Broadcast"}}, - {GGML_OP_DIV, {"Divide"}}, - {GGML_OP_FLASH_ATTN_EXT, {"ScaledDotProductAttention"}}, - {GGML_OP_GET_ROWS, {"Gather"}}, - {GGML_OP_GROUP_NORM, {"GroupNormalization"}}, - {GGML_OP_IM2COL, {"Custom", "Reshape", "Transpose"}}, - {GGML_OP_LEAKY_RELU, {"PReLU"}}, - {GGML_OP_LOG, {"Log"}}, - {GGML_OP_MEAN, {"ReduceMean"}}, - {GGML_OP_MUL, {"Multiply"}}, - {GGML_OP_MUL_MAT, {"MatMul"}}, - {GGML_OP_MUL_MAT_ID, {"MatMul", "Identity"}}, - {GGML_OP_NORM, {"NormalizeL2"}}, - {GGML_OP_OUT_PROD, {"MatMul", "Reshape"}}, - {GGML_OP_PAD, {"Pad"}}, - {GGML_OP_PERMUTE, {"Transpose"}}, - {GGML_OP_POOL_1D, {"AvgPool", "MaxPool"}}, - {GGML_OP_POOL_2D, {"AvgPool", "MaxPool"}}, - {GGML_OP_REPEAT, {"Tile"}}, - {GGML_OP_RESHAPE, {"Reshape"}}, - {GGML_OP_RMS_NORM, {"Multiply", "Divide", "Sqrt"}}, - {GGML_OP_ROPE, {"Sin", "Cos", "Multiply", "Add", "Subtract", "Split", "StridedSlice", "Concat"}}, - {GGML_OP_SCALE, {"Multiply", "Constant"}}, - {GGML_OP_SET, {"Assign"}}, - {GGML_OP_SIN, {"Sin"}}, - {GGML_OP_SOFT_MAX, {"Softmax"}}, - {GGML_OP_SQR, {"Power"}}, - {GGML_OP_SQRT, {"Sqrt"}}, - {GGML_OP_SSM_CONV, {"Custom"}}, - {GGML_OP_SSM_SCAN, {"Custom"}}, - {GGML_OP_SUB, {"Subtract"}}, - {GGML_OP_SUM, {"ReduceSum"}}, - {GGML_OP_SUM_ROWS, {"ReduceSum", "Squeeze", "Unsqueeze"}}, - {GGML_OP_TIMESTEP_EMBEDDING, {"Range", "Power", "Multiply", "Sin", "Cos", "Concat"}}, - {GGML_OP_TRANSPOSE, {"Transpose"}}, - {GGML_OP_UPSCALE, {"Interpolate"}}, - {GGML_OP_VIEW, {"Reshape"}}, - {GGML_OP_CONT, {"Reshape", "StridedSlice"}}, - {GGML_OP_CPY, {"Reshape", "ScatterNDUpdate"}}, - {GGML_OP_WIN_PART, {"StridedSlice", "Concat", "Reshape", "Custom"}}, - {GGML_OP_WIN_UNPART, {"Reshape", "Transpose", "Custom"}}, - }; - - static const std::map> op_mapping_unary = { - {GGML_UNARY_OP_SILU, {"Sigmoid", "Multiply"}}, - }; - - std::vector mapped_ops; - if (op->op == GGML_OP_UNARY) { - auto it = op_mapping_unary.find(ggml_get_unary_op(op)); - if (it == op_mapping_unary.end()) { - return false; - } - mapped_ops = it->second; - } else { - auto it = op_mapping.find(op->op); - if (it == op_mapping.end()) { - return false; - } - mapped_ops = it->second; - } - - for (const std::string& op_name : mapped_ops) { - if (openvino_ops.count(op_name) == 0) { - return false; - } - } - - return true; + if (op->op == GGML_OP_UNARY) { + return supported_unary_ops.find(ggml_get_unary_op(op)) != + supported_unary_ops.end(); + } + return supported_ops.find(op->op) != supported_ops.end(); } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index f4d9c7705ab22..c32ad6584274b 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,6 +1,7 @@ #include "utils.h" #include "ggml-backend-impl.h" #include "ggml-impl.h" +#include "ggml.h" #include #include #include @@ -13,7 +14,7 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, con return std::make_shared(nullptr, cgraph, start_index, end_index); } -std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder, bool flag) { +std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { std::vector> input_tensors; auto input_names = ggml_decoder->get_input_names(); size_t op_iter = 0; @@ -77,10 +78,13 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index, bool flag) { +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, + struct ggml_cgraph *cgraph, + const int32_t start_index, + const int32_t end_index) { static ov::Core core; + // auto devices = core.get_available_devices(); - // Get GGML Frontend static auto front_end = get_ggml_frontend(); if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); @@ -90,6 +94,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_LOG_INFO("GGML FrontEnd is initialized \n"); #endif } + auto ggml_decoder = get_ggml_decoder(cgraph, start_index, end_index); std::shared_ptr graph_decoder = ggml_decoder; // Load GraphIterator -> InputModel @@ -123,26 +128,18 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c } ov::CompiledModel compiled_model = core.compile_model(model); - - // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); - // Get input tensor auto input_names = ggml_decoder->get_input_names(); - auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder, flag); - + auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder); auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); infer_request.set_input_tensor(i, get_ggml_graph_input_tensor(ggml_decoder, param_name)); } - // for (size_t i = 0; i < input_names.size(); i++) { - // infer_request.set_input_tensor(i, input_tensors.at(i).second); - // } infer_request.infer(); - // Set dst data for outputs auto output_names = ggml_decoder->get_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 7806c418cb62b..0f5617ab4bc17 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,4 @@ #include "ggml-decoder.h" #include "ggml-backend-impl.h" -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0, bool flag = true); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); From 09422dd8ae14730b9f4ed79e1beb34eb112c95da Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 15 Apr 2025 19:43:29 +0800 Subject: [PATCH 049/156] 2nd+ token correct by fix CPY in OV, remove single op backend compute code --- ggml/src/ggml-openvino.cpp | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 5ea2351e06cb1..efb8ff12bc3d1 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1,18 +1,14 @@ #include "ggml-backend-impl.h" -#include "ggml-cpu-impl.h" #include "ggml-impl.h" #include "ggml-openvino.h" #include "ggml-openvino/utils.h" #include "ggml.h" -#include #include -#include #include -#include -#include -#include -#include +#include +#include +#include #define GGML_OPENVINO_MAX_STREAMS 8 @@ -55,10 +51,10 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type( GGML_UNUSED(backend); } -static void ggml_backend_openvino_add_forward(ggml_tensor * dst) { - // Step 1: get the input tensor src0 和 src1 - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; +static enum ggml_status +ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { + int end_node = cgraph->n_nodes - 1; + openvino_frontend_compute(backend, cgraph, 0, end_node); ov::Core core; @@ -1267,17 +1263,6 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g return nullptr; } -std::set get_openvino_available_opsets() { - ov::Core core; - std::set unique_ops; - for (const auto& opset : ov::get_available_opsets()) { - for (const auto& op : opset.second().get_type_info_set()) { - unique_ops.insert(op.name); - } - } - return unique_ops; -} - static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); From fd5956f3b2cea80a20e5cfa9b91b544b5ed7373e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 17 Apr 2025 17:42:44 +0800 Subject: [PATCH 050/156] Arbitrary token len (>32) work; Fix bug in mulmat --- ggml/src/ggml-openvino/ggml-decoder.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d7895c3d7f93a..b1fc8ec67eaab 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -56,13 +56,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]) || - node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { - m_continuous = false; - } else { - m_continuous = true; - } - break; + m_continuous = node->src[0]->view_src == nullptr; + break; } default: break; From f346cf4dd97a9c68c1c52e3eec3bdbd664410f4f Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 21 Apr 2025 15:14:43 +0800 Subject: [PATCH 051/156] FEAT: do PERMUTE eagerly --- ggml/src/ggml-openvino/ggml-decoder.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b1fc8ec67eaab..c639d630f3473 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -43,12 +43,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop) { case GGML_OP_CONT: { - if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node) && - (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { - m_continuous = true; - } else { - m_continuous = false; - } + // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE + m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); break; } case GGML_OP_CPY: { @@ -183,9 +179,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr // Init model input and output set_input_output(cur_node, m_inputs, m_outputs); } - #ifdef GGML_OPENVINO_DEBUG - ggml_graph_op_print(m_cgraph); - #endif + if (getenv("GGML_OPENVINO_DEBUG")) { + ggml_graph_op_print(m_cgraph); + } } } From d3ad665a52971e072540a0019fadb9f2cb5856f8 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 22 Apr 2025 19:03:12 +0800 Subject: [PATCH 052/156] FEAT: Add interleaved mode for ROPE --- ggml/src/ggml-openvino/ggml-decoder.cpp | 28 ++++++++++++++----------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c639d630f3473..2dbde9ea5af19 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -103,12 +103,6 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[0]->ne[3] << "] " << std::setw(12) << "0: " << std::left << std::setw(12) << ggml_op_name(node->src[0]->op) << std::right; - // // Custom logic to handle '\000' - // const char* name_ptr = node->src[0]->name; - // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { - // file << *name_ptr; - // name_ptr++; - // } file << std::left << std::setw(30) << node->src[0]->name << std::right << std::setw(16) << "[ " << std::setw(0) << node->src[0]->nb[0] << ", " @@ -125,12 +119,6 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[1]->ne[3] << "] " << std::setw(12) << "1: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; - // // Custom logic to handle '\000' - // const char* name_ptr = node->src[1]->name; - // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { - // file << *name_ptr; - // name_ptr++; - // } file << std::left << std::setw(30) << node->src[1]->name << std::right << std::setw(16) << "[ " << std::setw(0) << node->src[1]->nb[0] << ", " @@ -139,6 +127,22 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[1]->nb[3] << "] " << "\n"; } + if (node->src[2]) { + file << std::setw(10) << " [ " + << std::setw(5) << node->src[2]->ne[0] << ", " + << std::setw(5) << node->src[2]->ne[1] << ", " + << std::setw(5) << node->src[2]->ne[2] << ", " + << std::setw(5) << node->src[2]->ne[3] << "] " + << std::setw(12) + << "2: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; + file << std::left << std::setw(30) << node->src[2]->name << std::right + << std::setw(16) << "[ " + << std::setw(0) << node->src[2]->nb[0] << ", " + << std::setw(5) << node->src[2]->nb[1] << ", " + << std::setw(5) << node->src[2]->nb[2] << ", " + << std::setw(5) << node->src[2]->nb[3] << "] " + << "\n"; + } } file << "n_leafs = " << cgraph->n_leafs << "\n"; From 8cd0c1e2081414c65422749cb478e6f248a1b22b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 28 Apr 2025 12:00:13 +0800 Subject: [PATCH 053/156] REFACTOR: support weigts as constant --- ggml/src/ggml-openvino.cpp | 3 +- ggml/src/ggml-openvino/decoder.h | 22 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 371 ++++++++++++++---------- ggml/src/ggml-openvino/ggml-decoder.h | 34 ++- ggml/src/ggml-openvino/utils.cpp | 154 +++++----- ggml/src/ggml-openvino/utils.h | 2 +- 6 files changed, 321 insertions(+), 265 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index efb8ff12bc3d1..5221a1ff8bd48 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -53,8 +53,7 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type( static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { - int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); + openvino_frontend_compute(backend, cgraph); ov::Core core; diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index e287f31e23c8c..c0641e2662208 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -1,19 +1,14 @@ #pragma once +#include + #include "openvino/core/node.hpp" #include "openvino/frontend/decoder.hpp" -#include "openvino/op/parameter.hpp" namespace ov { namespace frontend { namespace ggml { -// 定义 tensor_info 结构体 -struct tensor_info { - - std::vector shape; - std::vector stride; -}; // TODO: Directly include from openvino class GgmlDecoder : public DecoderBase { public: @@ -36,10 +31,6 @@ class GgmlDecoder : public DecoderBase { virtual std::vector get_input_names() const = 0; - virtual std::string& get_op_node_name(const std::string& name, const int index = -1) = 0; - - // virtual const struct tensor_info get_node_op_info(const std::string& name) const = 0; - virtual PartialShape get_output_shape(const std::string& name) const = 0; virtual std::vector get_output_stride(const std::string& name) const = 0; @@ -64,14 +55,11 @@ class GgmlDecoder : public DecoderBase { virtual void visit_subgraph(std::function)> node_visitor) const = 0; - // virtual const std::vector& outputs() const = 0; - - // virtual size_t output(size_t index) const = 0; - virtual bool check_if_continuous() const = 0; - virtual const std::vector>& get_params() const = 0; - + virtual const std::unordered_map>& get_model_inputs() const = 0; + virtual const std::unordered_map>& get_model_weights() const = 0; + virtual const std::vector& get_model_output_names() const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2dbde9ea5af19..05947ff579763 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1,11 +1,62 @@ #include "ggml-decoder.h" -#include + #include -#include -#include +#include + +#include +#include +#include #include +#include +#include +#include +#include +#include + +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph) + : m_cgraph(cgraph), + m_node(node), + m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + if (m_node) { + set_input_output(m_node); + } else { + // std::map> address_map; + // for (int node_n = start_index; node_n <= end_index; node_n++) { + // auto node = cgraph->nodes[node_n]; + // if (node->data) { + // auto it = address_map.find(node->data); + // if (it == address_map.end()) { + // address_map[node->data] = std::vector(); + // } + // address_map[node->data].push_back(node->name); + // } + // } + // for (const auto& pair : address_map) { + // std::cout << "Address: " << pair.first << " -> "; + // for (const auto& name : pair.second) { + // std::cout << name << " ;"; + // } + // std::cout << std::endl; + // } + + for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + auto* cur_node = m_cgraph->nodes[node_n]; + m_nodes.push_back(cur_node); + // Init model input and output + set_input_output(cur_node); + } + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + dump_cgraph(m_cgraph); + } + } +} -void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { +// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; +// 2. constructing a decoder for a node. +void GgmlOvDecoder::set_input_output(ggml_tensor* node) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -17,51 +68,130 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname); } - std::string src0_name = std::string(node->src[0]->name); - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - if (node->op == GGML_OP_CPY && node->view_src) { - m_output_names.push_back(node->view_src->name); - } else { - m_output_names.push_back(node_name); + m_output_names.push_back(node_name); + m_outputs[node_name] = node; + + for (int i = 0; i < GGML_MAX_SRC; i++) { + auto* src = node->src[i]; + if (src == nullptr) { + continue; + } + std::string src_name = std::string(src->name); + m_input_names.push_back(src_name); + m_inputs[src_name] = src; + m_op_node_name.emplace_back(src_name, ggml_op_name(node->op)); + + // If called for the whole graph, create constant nodes for weights and param nodes for inputs + if (!m_node && !src->view_src) { + ggml_backend_buffer* buffer = src->buffer; + + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT"); + auto& weights_map = weight_as_input ? m_model_inputs : m_model_weights; + if (weights_map.find(src_name) != weights_map.end()) { + continue; + } + + std::shared_ptr weight_node = + weight_as_input + ? std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}) + : create_weight_node(src); + weight_node->set_friendly_name(src_name); + weights_map[src_name] = weight_node; + + } else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { + // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { + assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0); + } + if (m_model_inputs.find(src_name) != m_model_inputs.end()) { + continue; + } + auto param_node = std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}); + param_node->set_friendly_name(src_name); + m_model_inputs[src_name] = param_node; + } + } } - if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name); - inputs[src1_name] = node->src[1]; - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); + if (!m_node) { + // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph + if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || + std::string(node->name).find("result") == 0) { + auto name = node->view_src ? std::string(node->view_src->name) : std::string(node->name); + if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { + assert(name.find("cache_k") == 0 || name.find("cache_v") == 0); + } + auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); + if (it == m_model_output_names.end()) { + m_model_output_names.push_back(name); + } + } } - if (node->src[2]) { - std::string src2_name = std::string(node->src[2]->name); - inputs[src2_name] = node->src[2]; - m_input_names.push_back(src2_name); - m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); + + if (m_node) { + switch (node->op) { + case GGML_OP_CONT: { + // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE + m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); + break; + } + case GGML_OP_CPY: { + m_continuous = ggml_is_contiguous(node); + break; + } + case GGML_OP_MUL_MAT: { + m_continuous = node->src[0]->view_src == nullptr; + break; + } + default: + break; + } } +} - switch (node->op) { - case GGML_OP_CONT: { - // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE - m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { + std::shared_ptr weight_node; + auto node_type = get_ov_type(tensor); + auto node_shape = get_shape(tensor); + auto ne_total = ggml_nelements(tensor); + switch (tensor->type) { + case GGML_TYPE_I32: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data(ptr, ptr + ne_total); + weight_node = std::make_shared(node_type, node_shape, data); break; } - case GGML_OP_CPY: { - m_continuous = ggml_is_contiguous(node); + case GGML_TYPE_I64: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data(ptr, ptr + ne_total); + weight_node = std::make_shared(node_type, node_shape, data); break; } - case GGML_OP_MUL_MAT: { - m_continuous = node->src[0]->view_src == nullptr; - break; + case GGML_TYPE_F32: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data(ptr, ptr + ne_total); + weight_node = std::make_shared(node_type, node_shape, data); + break; } - default: + case GGML_TYPE_F16: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data_f16; + data_f16.reserve(ne_total); + for (int i = 0; i < ne_total; ++i) { + data_f16.push_back(ov::float16::from_bits(ptr[i])); + } + weight_node = std::make_shared(node_type, node_shape, data_f16); break; } + default: + throw std::invalid_argument("Unsupported tensor type"); + } + return weight_node; } -void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { - std::ofstream file("01_nodes.txt"); +void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { + std::ofstream file("cgraph.txt"); if (!file.is_open()) { std::cerr << "Failed to open file" << std::endl; return; @@ -160,88 +290,53 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { file.close(); } - -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) - :m_cgraph(cgraph), - m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { - m_inputs.clear(); - m_outputs.clear(); - m_input_names.clear(); - m_output_names.clear(); - m_params.clear(); - m_op_node_name.clear(); - m_decoders.clear(); - - if (m_node) { - set_input_output(m_node, m_inputs, m_outputs); - } else { - // for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { - for (int node_n = start_index; node_n <= end_index; node_n++) { - auto cur_node = m_cgraph->nodes[node_n]; - m_nodes.push_back(cur_node); - // Init model input and output - set_input_output(cur_node, m_inputs, m_outputs); - } - if (getenv("GGML_OPENVINO_DEBUG")) { - ggml_graph_op_print(m_cgraph); - } - } -} - -ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { - ov::PartialShape input_shape; - // Use input_node->ne - ggml_tensor * node = m_inputs.at(name); +std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { std::vector shape; - - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - if (node->ne[i] == 0) { - return input_shape; - } - shape.push_back(static_cast(node->ne[i])); + for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { + shape.push_back(static_cast(tensor->ne[i])); } - input_shape = ov::PartialShape(shape); - return input_shape; + return shape; } -std::vector GgmlOvDecoder::get_input_stride(const std::string& name) const { +std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { std::vector stride; - ggml_tensor * node = m_inputs.at(name); for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - stride.push_back(static_cast(node->nb[i])); + stride.push_back(static_cast(tensor->nb[i])); } return stride; } -std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { - std::vector stride; - ggml_tensor * node = m_outputs.at(name); - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - stride.push_back(static_cast(node->nb[i])); +ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { + ov::element::Type type = ov::element::dynamic; + switch (tensor->type) { + case GGML_TYPE_F32: + type = ov::element::f32; + break; + case GGML_TYPE_F16: + type = ov::element::f16; + break; + case GGML_TYPE_I64: + type = ov::element::i64; + break; + case GGML_TYPE_I32: + type = ov::element::i32; + break; + default: + break; } - return stride; + return type; +} + +ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { + return ov::PartialShape(get_shape(m_inputs.at(name))); +} + +std::vector GgmlOvDecoder::get_input_stride(const std::string& name) const { + return get_stride(m_inputs.at(name)); } ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const { - ov::element::Type type = ov::element::dynamic; - switch (m_inputs.at(name)->type) { - case GGML_TYPE_F32: - type = ov::element::f32; - break; - case GGML_TYPE_F16: - type = ov::element::f16; - break; - case GGML_TYPE_I64: - type = ov::element::i64; - break; - case GGML_TYPE_I32: - type = ov::element::i32; - break; - default: - break; - } - return type; + return get_ov_type(m_inputs.at(name)); } size_t GgmlOvDecoder::get_input_size() const { @@ -257,69 +352,16 @@ std::vector GgmlOvDecoder::get_input_names() const { return m_input_names; } -std::string& GgmlOvDecoder::get_op_node_name(const std::string& key_name, const int index) { - if (index == -1) { - for (size_t i = 0; i < m_op_node_name.size(); ++i) { - if (m_op_node_name[i].first == key_name) { - return m_op_node_name[i].second; - } - } - } else { - return m_op_node_name[index].second; - } - - static std::string empty_string = ""; - return empty_string; // empty string -} - -const std::vector>& GgmlOvDecoder::get_params() const { - return m_params; +std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { + return get_stride(m_outputs.at(name)); } ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { - ov::PartialShape output_shape; - ggml_tensor * node = m_outputs.at(name); - std::vector shape; - - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - if (node->ne[i] == 0 ) { - // empty if any dimension has no elements - return output_shape; - } - shape.push_back(static_cast(node->ne[i])); - } - output_shape = ov::PartialShape(shape); - return output_shape; + return ov::PartialShape(get_shape(m_outputs.at(name))); } ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const { - // TODO: Change to Output - ov::element::Type type = ov::element::dynamic; - switch (m_outputs.at(name)->type) { - case GGML_TYPE_F32: - type = ov::element::f32; - break; - case GGML_TYPE_F16: - type = ov::element::f16; - break; - case GGML_TYPE_I64: - type = ov::element::i64; - break; - case GGML_TYPE_I32: - type = ov::element::i32; - break; - default: - break; - } - return type; -} - -int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const{ - return m_inputs.at(name)->op_params; -} - -int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const{ - return m_outputs.at(name)->op_params; + return get_ov_type(m_outputs.at(name)); } std::string& GgmlOvDecoder::get_output_name(size_t index) const { @@ -335,10 +377,17 @@ const std::string& GgmlOvDecoder::get_op_name() const { return m_op_name; } +int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const { + return m_inputs.at(name)->op_params; +} + +int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { + return m_outputs.at(name)->op_params; +} + void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { auto decoder = std::make_shared(node, m_cgraph); - // m_decoders.push_back(decoder); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index eac045d158300..2182ad624d027 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,14 +1,17 @@ #pragma once +#include +#include +#include + #include "decoder.h" #include "ggml.h" -#include "openvino/op/parameter.hpp" class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; - GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); + GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; @@ -73,12 +76,23 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_continuous; } - std::string& get_op_node_name(const std::string& key_name, const int index) override; - - virtual const std::vector>& get_params() const override; + virtual const std::unordered_map>& get_model_inputs() const override { + return m_model_inputs; + } + virtual const std::unordered_map>& get_model_weights() const override { + return m_model_weights; + } + virtual const std::vector& get_model_output_names() const override { + return m_model_output_names; + } private: - void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); + void set_input_output(ggml_tensor* node); + static void dump_cgraph(const struct ggml_cgraph* cgraph); + static std::vector get_shape(const ggml_tensor* tensor); + static std::vector get_stride(const ggml_tensor* tensor); + static ov::element::Type get_ov_type(const ggml_tensor* tensor); + static std::shared_ptr create_weight_node(ggml_tensor* tensor); struct ggml_cgraph * m_cgraph; std::map m_inputs; @@ -86,12 +100,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map m_outputs; std::vector m_output_names; ggml_tensor* m_node; - std::vector m_nodes; - std::vector> m_decoders; + std::vector m_nodes; std::string m_op_name; mutable std::string m_name; bool m_continuous; - std::vector> m_params; std::vector> m_op_node_name; + std::unordered_map> m_model_inputs; + std::unordered_map> m_model_weights; + std::vector m_model_output_names; }; - diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index c32ad6584274b..7937d5793a5f2 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,49 +1,22 @@ #include "utils.h" -#include "ggml-backend-impl.h" -#include "ggml-impl.h" -#include "ggml.h" + +#include +#include #include -#include #include +#include #include #include -using ov::frontend::ggml::GgmlDecoder; - -std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) { - return std::make_shared(nullptr, cgraph, start_index, end_index); -} +#include "ggml-impl.h" +#include "ggml.h" -std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { - std::vector> input_tensors; - auto input_names = ggml_decoder->get_input_names(); - size_t op_iter = 0; - for (size_t inp = 0; inp < input_names.size(); ++inp) { - auto name = input_names[inp]; - std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); - // auto node_op_name = ggml_decoder->get_node_op_name(name); - auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); - #endif - ov::Tensor input_tensor; - ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - - std::vector input_stride = ggml_decoder->get_input_stride(name); - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - - // input_tensors[name] = input_tensor; - input_tensors.emplace_back(name, input_tensor); - } - // std::cout << "input_names.size(): " << input_names.size() << std::endl; - return input_tensors; +std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph) { + return std::make_shared(nullptr, cgraph); } ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) { - auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Subgraph input %s: %g\n", name.c_str(), *(double*)(input_data)); - #endif + auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data; ov::Tensor input_tensor; ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); std::vector input_stride = ggml_decoder->get_input_stride(name); @@ -53,19 +26,16 @@ ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decod std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { std::map output_tensors; - auto output_names = ggml_decoder->get_output_names(); + auto output_names = ggml_decoder->get_model_output_names(); for (size_t inp = 0; inp < output_names.size(); ++inp) { auto name = output_names[inp]; - auto output_data = ggml_decoder->get_output_ggml_tensor(name)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Output %d: %g\n", inp, *(double*)(output_data)); - #endif + const auto* tensor = ggml_decoder->get_output_ggml_tensor(name); + auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data; output_tensors[name] = output_data; } return output_tensors; } - static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { ov::frontend::FrontEnd::Ptr front_end = nullptr; auto fem = ov::frontend::FrontEndManager(); @@ -78,10 +48,9 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, - struct ggml_cgraph *cgraph, - const int32_t start_index, - const int32_t end_index) { +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { + auto start_time = ggml_time_us(); + static ov::Core core; // auto devices = core.get_available_devices(); @@ -89,65 +58,102 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("GGML FrontEnd is initialized \n"); - #endif } - auto ggml_decoder = get_ggml_decoder(cgraph, start_index, end_index); + auto ggml_decoder = get_ggml_decoder(cgraph); std::shared_ptr graph_decoder = ggml_decoder; - // Load GraphIterator -> InputModel ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); if (!input_model) { GGML_LOG_ERROR("Input Model is not loaded \n"); return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Input Model loaded \n"); - #endif } - // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); + auto conversion_end_time = ggml_time_us(); - if (getenv("OPENVINO_DUMP_GRAPH")) { - char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), - "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); } if (!model) { GGML_LOG_ERROR("Model is not converted \n"); - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Model converted \n"); - #endif } - ov::CompiledModel compiled_model = core.compile_model(model); + ov::CompiledModel compiled_model = + core.compile_model(model, "CPU", ov::device::properties("CPU", ov::cache_dir("/tmp/ov_cache"))); + auto compile_end_time = ggml_time_us(); + ov::InferRequest infer_request = compiled_model.create_infer_request(); + auto infer_request_start_time = ggml_time_us(); auto input_names = ggml_decoder->get_input_names(); - auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder); auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); - infer_request.set_input_tensor(i, get_ggml_graph_input_tensor(ggml_decoder, param_name)); + auto input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); + + if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + std::cout << "Input name: " << param_name << ", Input shape: " << input_tensor.get_shape() + << ", Address: " << input_tensor.data() << std::endl; + switch (input_tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(input_tensor.data()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(input_tensor.data())) << std::endl; + break; + case ov::element::i32: + std::cout << *(int32_t*)(input_tensor.data()) << std::endl; + break; + case ov::element::i64: + std::cout << *(int64_t*)(input_tensor.data()) << std::endl; + break; + default: + break; + } + } + infer_request.set_input_tensor(i, input_tensor); } + auto input_end_time = ggml_time_us(); infer_request.infer(); + auto infer_end_time = ggml_time_us(); - auto output_names = ggml_decoder->get_output_names(); + auto output_names = ggml_decoder->get_model_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); - #ifdef GGML_OPENVINO_DEBUG - printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); - #endif + + if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + std::cout << "Output name: " << output_names[i] << ", Output shape: " << output_tensor.get_shape() + << ", Address: " << output_tensors[output_names[i]] << std::endl; + switch (output_tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(output_tensors[output_names[i]]) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensors[output_names[i]])) << std::endl; + break; + default: + break; + } + } + } + auto end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_PROFILING")) { + GGML_LOG_INFO("GGML OpenVINO Backend: \n"); + GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph InferRequest created Time: %ld ms \n", + (infer_request_start_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - infer_request_start_time) / 1000); + GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); + GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000); } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 0f5617ab4bc17..b4174c9f216b1 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,4 @@ #include "ggml-decoder.h" #include "ggml-backend-impl.h" -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); From 07bf406ace5ceabb1aa6ee907e69a0842a9e6b6f Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 28 Apr 2025 17:03:21 +0800 Subject: [PATCH 054/156] STYLE: minor refactor --- ggml/src/ggml-openvino/ggml-decoder.cpp | 67 +++++++------------------ 1 file changed, 19 insertions(+), 48 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 05947ff579763..6b20159720034 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -199,6 +199,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { file << "=== GRAPH ===\n"; + // clang-format off file << "n_nodes = " << cgraph->n_nodes << "\n"; file << " " << std::setw(3) << "nodes" << std::setw(15) << "shape" @@ -225,53 +226,23 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { << std::setw(5) << node->nb[3] << "] " << "\n"; - if (node->src[0]) { - file << std::setw(10) << " [ " - << std::setw(5) << node->src[0]->ne[0] << ", " - << std::setw(5) << node->src[0]->ne[1] << ", " - << std::setw(5) << node->src[0]->ne[2] << ", " - << std::setw(5) << node->src[0]->ne[3] << "] " - << std::setw(12) - << "0: " << std::left << std::setw(12) << ggml_op_name(node->src[0]->op) << std::right; - file << std::left << std::setw(30) << node->src[0]->name << std::right - << std::setw(16) << "[ " - << std::setw(0) << node->src[0]->nb[0] << ", " - << std::setw(5) << node->src[0]->nb[1] << ", " - << std::setw(5) << node->src[0]->nb[2] << ", " - << std::setw(5) << node->src[0]->nb[3] << "] " - << "\n"; - } - if (node->src[1]) { - file << std::setw(10) << " [ " - << std::setw(5) << node->src[1]->ne[0] << ", " - << std::setw(5) << node->src[1]->ne[1] << ", " - << std::setw(5) << node->src[1]->ne[2] << ", " - << std::setw(5) << node->src[1]->ne[3] << "] " - << std::setw(12) - << "1: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; - file << std::left << std::setw(30) << node->src[1]->name << std::right - << std::setw(16) << "[ " - << std::setw(0) << node->src[1]->nb[0] << ", " - << std::setw(5) << node->src[1]->nb[1] << ", " - << std::setw(5) << node->src[1]->nb[2] << ", " - << std::setw(5) << node->src[1]->nb[3] << "] " - << "\n"; - } - if (node->src[2]) { - file << std::setw(10) << " [ " - << std::setw(5) << node->src[2]->ne[0] << ", " - << std::setw(5) << node->src[2]->ne[1] << ", " - << std::setw(5) << node->src[2]->ne[2] << ", " - << std::setw(5) << node->src[2]->ne[3] << "] " - << std::setw(12) - << "2: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; - file << std::left << std::setw(30) << node->src[2]->name << std::right - << std::setw(16) << "[ " - << std::setw(0) << node->src[2]->nb[0] << ", " - << std::setw(5) << node->src[2]->nb[1] << ", " - << std::setw(5) << node->src[2]->nb[2] << ", " - << std::setw(5) << node->src[2]->nb[3] << "] " - << "\n"; + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (auto* src = node->src[i]) { + file << std::setw(10) << " [ " + << std::setw(5) << src->ne[0] << ", " + << std::setw(5) << src->ne[1] << ", " + << std::setw(5) << src->ne[2] << ", " + << std::setw(5) << src->ne[3] << "] " + << std::setw(12) + << i << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right; + file << std::left << std::setw(30) << src->name << std::right + << std::setw(16) << "[ " + << std::setw(0) << src->nb[0] << ", " + << std::setw(5) << src->nb[1] << ", " + << std::setw(5) << src->nb[2] << ", " + << std::setw(5) << src->nb[3] << "] " + << "\n"; + } } } @@ -285,7 +256,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { << std::setw(8) << ggml_op_name(node->op) << " " << std::setw(16) << ggml_get_name(node) << "\n"; } - + // clang-format on file << "========================================\n"; file.close(); From ee63f9a89a48d6d2fb7a9f4fc55e493165e429a0 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 28 Apr 2025 17:04:44 +0800 Subject: [PATCH 055/156] PERF: share const nodes for weights for diff infer --- ggml/src/ggml-openvino/ggml-decoder.cpp | 55 ++++++++++++++----------- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 6b20159720034..d42aaf4664f15 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "ggml-backend-impl.h" #include "ggml-backend.h" @@ -20,34 +22,16 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap : m_cgraph(cgraph), m_node(node), m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + static std::unordered_map> model_weights; if (m_node) { - set_input_output(m_node); + set_input_output(m_node, model_weights); } else { - // std::map> address_map; - // for (int node_n = start_index; node_n <= end_index; node_n++) { - // auto node = cgraph->nodes[node_n]; - // if (node->data) { - // auto it = address_map.find(node->data); - // if (it == address_map.end()) { - // address_map[node->data] = std::vector(); - // } - // address_map[node->data].push_back(node->name); - // } - // } - // for (const auto& pair : address_map) { - // std::cout << "Address: " << pair.first << " -> "; - // for (const auto& name : pair.second) { - // std::cout << name << " ;"; - // } - // std::cout << std::endl; - // } - for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); - // Init model input and output - set_input_output(cur_node); + set_input_output(cur_node, model_weights); } + m_model_weights = model_weights; if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { dump_cgraph(m_cgraph); } @@ -56,7 +40,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; // 2. constructing a decoder for a node. -void GgmlOvDecoder::set_input_output(ggml_tensor* node) { +void GgmlOvDecoder::set_input_output(ggml_tensor* node, + std::unordered_map>& model_weights) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -87,7 +72,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT"); - auto& weights_map = weight_as_input ? m_model_inputs : m_model_weights; + auto& weights_map = weight_as_input ? m_model_inputs : model_weights; if (weights_map.find(src_name) != weights_map.end()) { continue; } @@ -261,6 +246,28 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { file.close(); } + +void print_tensor_address_map(const struct ggml_cgraph* cgraph) { + std::map> address_map; + for (int node_n = 0; node_n <= cgraph->n_nodes; node_n++) { + auto* node = cgraph->nodes[node_n]; + if (node->data) { + auto it = address_map.find(node->data); + if (it == address_map.end()) { + address_map[node->data] = std::vector(); + } + address_map[node->data].push_back(node->name); + } + } + for (const auto& pair : address_map) { + std::cout << "Address: " << pair.first << std::endl; + for (const auto& name : pair.second) { + std::cout << name << " ; "; + } + std::cout << std::endl << std::endl; + } +} + std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { std::vector shape; for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 2182ad624d027..a71c5e4e1f50b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -87,7 +87,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { } private: - void set_input_output(ggml_tensor* node); + void set_input_output(ggml_tensor* node, std::unordered_map>& model_weights); static void dump_cgraph(const struct ggml_cgraph* cgraph); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); From 86a1f2353d5e7fcc124061854200c4f38a05aaac Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 29 Apr 2025 14:31:35 +0800 Subject: [PATCH 056/156] BUILD: update build doc, add cmake preset, add CACHE_DIR env var --- CMakePresets.json | 20 ++++++++++++++++++++ ggml/src/ggml-openvino/utils.cpp | 8 +++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index b5afeb3c0f2f9..392c357f37c0b 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,6 +1,26 @@ { "version": 4, "configurePresets": [ + { + "name": "ReleaseOV", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build/${presetName}", + "installDir": "${sourceDir}/build/install/${presetName}", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "GGML_OPENVINO": true, + "OpenVINO_DIR": "$env{OPENVINO_LLAMA_PATH}/build/Release" + } + }, + { + "name": "ReleaseCPU", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build/${presetName}", + "installDir": "${sourceDir}/build/install/${presetName}", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release" + } + }, { "name": "base", "hidden": true, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 7937d5793a5f2..5feb67d681a88 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -37,7 +37,6 @@ std::map get_ggml_graph_output_dst(std::shared_ptr Date: Wed, 30 Apr 2025 13:40:43 +0800 Subject: [PATCH 057/156] FEAT: improve debug capability --- ggml/src/ggml-openvino/decoder.h | 6 +++--- ggml/src/ggml-openvino/ggml-decoder.cpp | 21 ++++++++++++++++----- ggml/src/ggml-openvino/ggml-decoder.h | 14 ++++++++------ ggml/src/ggml-openvino/utils.cpp | 15 +++++++++++++-- ggml/src/ggml-openvino/utils.h | 2 ++ 5 files changed, 42 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index c0641e2662208..b0775d43aa336 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include "openvino/core/node.hpp" #include "openvino/frontend/decoder.hpp" @@ -57,8 +57,8 @@ class GgmlDecoder : public DecoderBase { virtual bool check_if_continuous() const = 0; - virtual const std::unordered_map>& get_model_inputs() const = 0; - virtual const std::unordered_map>& get_model_weights() const = 0; + virtual const std::map>& get_model_inputs() const = 0; + virtual const std::map>& get_model_weights() const = 0; virtual const std::vector& get_model_output_names() const = 0; }; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d42aaf4664f15..44b46f2c637ad 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -8,12 +8,14 @@ #include #include #include +#include #include #include #include #include +#include +#include #include -#include #include "ggml-backend-impl.h" #include "ggml-backend.h" @@ -22,16 +24,24 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap : m_cgraph(cgraph), m_node(node), m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { - static std::unordered_map> model_weights; + static std::map> model_weights; + if (m_node) { set_input_output(m_node, model_weights); } else { + static bool printed = false; + if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + print_tensor_address_map(m_cgraph); + printed = true; + } + for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); set_input_output(cur_node, model_weights); } m_model_weights = model_weights; + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { dump_cgraph(m_cgraph); } @@ -41,7 +51,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; // 2. constructing a decoder for a node. void GgmlOvDecoder::set_input_output(ggml_tensor* node, - std::unordered_map>& model_weights) { + std::map>& model_weights) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -100,9 +110,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, } if (!m_node) { + static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || - std::string(node->name).find("result") == 0) { + std::string(node->name).find("result") == 0 || debug_output_names.count(node->name)) { auto name = node->view_src ? std::string(node->view_src->name) : std::string(node->name); if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { assert(name.find("cache_k") == 0 || name.find("cache_v") == 0); @@ -249,7 +260,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { void print_tensor_address_map(const struct ggml_cgraph* cgraph) { std::map> address_map; - for (int node_n = 0; node_n <= cgraph->n_nodes; node_n++) { + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto* node = cgraph->nodes[node_n]; if (node->data) { auto it = address_map.find(node->data); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index a71c5e4e1f50b..c4f7612d7647d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,7 +1,7 @@ #pragma once +#include #include -#include #include #include "decoder.h" @@ -76,10 +76,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_continuous; } - virtual const std::unordered_map>& get_model_inputs() const override { + virtual const std::map>& get_model_inputs() const override { return m_model_inputs; } - virtual const std::unordered_map>& get_model_weights() const override { + virtual const std::map>& get_model_weights() const override { return m_model_weights; } virtual const std::vector& get_model_output_names() const override { @@ -87,7 +87,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { } private: - void set_input_output(ggml_tensor* node, std::unordered_map>& model_weights); + void set_input_output(ggml_tensor* node, std::map>& model_weights); static void dump_cgraph(const struct ggml_cgraph* cgraph); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); @@ -105,7 +105,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { mutable std::string m_name; bool m_continuous; std::vector> m_op_node_name; - std::unordered_map> m_model_inputs; - std::unordered_map> m_model_weights; + std::map> m_model_inputs; + std::map> m_model_weights; std::vector m_model_output_names; }; + +void print_tensor_address_map(const struct ggml_cgraph* cgraph); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 5feb67d681a88..32fa7cf481b91 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -135,10 +135,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c << ", Address: " << output_tensors[output_names[i]] << std::endl; switch (output_tensor.get_element_type()) { case ov::element::f32: - std::cout << *(float*)(output_tensors[output_names[i]]) << std::endl; + std::cout << *(float*)(output_tensor.data()) << std::endl; + std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl; break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensors[output_names[i]])) << std::endl; + std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensor.data())) << std::endl; + std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl; break; default: break; @@ -161,3 +163,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } + +size_t checksum(const void* data, size_t size) { + const uint8_t* bytes = static_cast(data); + size_t sum = 0; + for (size_t i = 0; i < size; ++i) { + sum += bytes[i]; + } + return sum; +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index b4174c9f216b1..4458e71f54be8 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -2,3 +2,5 @@ #include "ggml-backend-impl.h" enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); + +size_t checksum(const void* data, size_t size); From e16c3c3ae440edc26cbec8ba96f0f5c62e8926ab Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 8 May 2025 16:07:14 +0800 Subject: [PATCH 058/156] PERF: compile once (dynamic graph + cache) --- ggml/src/ggml-openvino/decoder.h | 1 + ggml/src/ggml-openvino/ggml-decoder.cpp | 67 ++++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 13 +++ ggml/src/ggml-openvino/utils.cpp | 149 +++++++++++++++--------- ggml/src/ggml-openvino/utils.h | 6 + 5 files changed, 177 insertions(+), 59 deletions(-) diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index b0775d43aa336..790ed2e88d773 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -58,6 +58,7 @@ class GgmlDecoder : public DecoderBase { virtual bool check_if_continuous() const = 0; virtual const std::map>& get_model_inputs() const = 0; + virtual const std::map>& get_model_extra_inputs() const = 0; virtual const std::map>& get_model_weights() const = 0; virtual const std::vector& get_model_output_names() const = 0; }; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 44b46f2c637ad..372f880b1d4a6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -10,9 +10,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -35,6 +37,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap printed = true; } + set_max_token_len(); for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); @@ -42,6 +45,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap } m_model_weights = model_weights; + add_extra_inputs(); + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { dump_cgraph(m_cgraph); } @@ -102,7 +107,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, if (m_model_inputs.find(src_name) != m_model_inputs.end()) { continue; } - auto param_node = std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}); + ov::PartialShape input_shape; + if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { + input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; + } else if (std::string(src->name).find("KQ_mask") == 0) { + input_shape = + ov::PartialShape{1, ov::Dimension(1, m_max_token_len), ov::Dimension(1, m_max_token_len)}; + } else { + input_shape = ov::Shape{get_shape(src)}; + } + auto param_node = std::make_shared(get_ov_type(src), input_shape); param_node->set_friendly_name(src_name); m_model_inputs[src_name] = param_node; } @@ -146,6 +160,57 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, } } +void GgmlOvDecoder::set_max_token_len() { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + auto* node = m_cgraph->nodes[i]; + if (std::string(node->name) == "v-0") { + auto* cache_v = node->src[0]; + m_max_token_len = cache_v->ne[0] / node->ne[1] / node->ne[2]; + break; + } + } +} + +void GgmlOvDecoder::add_extra_inputs() { + int64_t past_token_len; + int64_t attention_size; + + for (const auto& node : m_nodes) { + if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { + assert(std::string(node->view_src->name).find("cache_k") == 0); + int64_t head_size = node->src[0]->ne[0]; + int64_t num_heads = node->src[0]->ne[1]; + past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads); + + std::string name = "past_token_len"; + auto param_node = std::make_shared(ov::element::i64, ov::Shape{}); + param_node->set_friendly_name(name); + m_model_extra_inputs[name] = param_node; + + auto tensor = std::make_shared(ov::element::i64, ov::Shape{}); + *tensor->data() = past_token_len; + m_model_extra_input_values[name] = tensor; + break; + } + } + for (const auto& node : m_nodes) { + if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) { + int64_t total_token_len = node->src[1]->ne[0] + past_token_len; + attention_size = (total_token_len + 31) / 32 * 32; + + std::string name = "attention_size"; + auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); + param_node->set_friendly_name(name); + m_model_extra_inputs[name] = param_node; + + auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); + *tensor->data() = attention_size; + m_model_extra_input_values[name] = tensor; + break; + } + } +} + std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { std::shared_ptr weight_node; auto node_type = get_ov_type(tensor); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index c4f7612d7647d..22ff9d85f76d4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -79,6 +80,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual const std::map>& get_model_inputs() const override { return m_model_inputs; } + virtual const std::map>& get_model_extra_inputs() const override { + return m_model_extra_inputs; + } + virtual const std::map>& get_model_extra_input_values() const { + return m_model_extra_input_values; + } virtual const std::map>& get_model_weights() const override { return m_model_weights; } @@ -88,12 +95,16 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { private: void set_input_output(ggml_tensor* node, std::map>& model_weights); + void add_extra_inputs(); static void dump_cgraph(const struct ggml_cgraph* cgraph); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); static ov::element::Type get_ov_type(const ggml_tensor* tensor); static std::shared_ptr create_weight_node(ggml_tensor* tensor); + void set_max_token_len(); + int64_t m_max_token_len; + struct ggml_cgraph * m_cgraph; std::map m_inputs; std::vector m_input_names; @@ -106,6 +117,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { bool m_continuous; std::vector> m_op_node_name; std::map> m_model_inputs; + std::map> m_model_extra_inputs; + std::map> m_model_extra_input_values; std::map> m_model_weights; std::vector m_model_output_names; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 32fa7cf481b91..6166161c41127 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -3,10 +3,14 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include #include "ggml-impl.h" #include "ggml.h" @@ -63,61 +67,65 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c return GGML_STATUS_FAILED; } + using CachedItem = std::pair, ov::CompiledModel>; + static std::unordered_map compiled_cache; + + std::shared_ptr model; + ov::CompiledModel compiled_model; + int64_t conversion_end_time; + int64_t compile_end_time; + auto ggml_decoder = get_ggml_decoder(cgraph); - std::shared_ptr graph_decoder = ggml_decoder; - ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); - if (!input_model) { - GGML_LOG_ERROR("Input Model is not loaded \n"); - return GGML_STATUS_FAILED; - } + auto it = compiled_cache.find(cgraph); + if (it != compiled_cache.end()) { + model = it->second.first; + conversion_end_time = ggml_time_us(); + + compiled_model = it->second.second; + compile_end_time = ggml_time_us(); + } else { + std::shared_ptr graph_decoder = ggml_decoder; + ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); + if (!input_model) { + GGML_LOG_ERROR("Input Model is not loaded \n"); + return GGML_STATUS_FAILED; + } - std::shared_ptr model = front_end->convert(input_model); - auto conversion_end_time = ggml_time_us(); + model = front_end->convert(input_model); + conversion_end_time = ggml_time_us(); - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); - } + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } - if (!model) { - GGML_LOG_ERROR("Model is not converted \n"); - } + if (!model) { + GGML_LOG_ERROR("Model is not converted \n"); + } + compiled_model = core.compile_model(model, "CPU"); + compile_end_time = ggml_time_us(); - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); - auto compile_end_time = ggml_time_us(); + compiled_cache[cgraph] = std::make_pair(model, compiled_model); + } ov::InferRequest infer_request = compiled_model.create_infer_request(); - auto infer_request_start_time = ggml_time_us(); - auto input_names = ggml_decoder->get_input_names(); auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); - auto input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); + ov::Tensor input_tensor; + if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { + input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); + } else { + input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); + } + infer_request.set_input_tensor(i, input_tensor); if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { - std::cout << "Input name: " << param_name << ", Input shape: " << input_tensor.get_shape() - << ", Address: " << input_tensor.data() << std::endl; - switch (input_tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(float*)(input_tensor.data()) << std::endl; - break; - case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(input_tensor.data())) << std::endl; - break; - case ov::element::i32: - std::cout << *(int32_t*)(input_tensor.data()) << std::endl; - break; - case ov::element::i64: - std::cout << *(int64_t*)(input_tensor.data()) << std::endl; - break; - default: - break; - } + print_input_tensor_info(param_name, input_tensor); } - infer_request.set_input_tensor(i, input_tensor); } auto input_end_time = ggml_time_us(); @@ -131,20 +139,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { - std::cout << "Output name: " << output_names[i] << ", Output shape: " << output_tensor.get_shape() - << ", Address: " << output_tensors[output_names[i]] << std::endl; - switch (output_tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(float*)(output_tensor.data()) << std::endl; - std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl; - break; - case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensor.data())) << std::endl; - std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl; - break; - default: - break; - } + print_output_tensor_info(output_names[i], output_tensor, output_tensors); } } auto end_time = ggml_time_us(); @@ -153,9 +148,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_LOG_INFO("GGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000); GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); - GGML_LOG_INFO(" - Graph InferRequest created Time: %ld ms \n", - (infer_request_start_time - compile_end_time) / 1000); - GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - infer_request_start_time) / 1000); + GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000); GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000); } @@ -172,3 +165,43 @@ size_t checksum(const void* data, size_t size) { } return sum; } + +void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) { + std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() + << std::endl; + switch (tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(tensor.data()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl; + break; + case ov::element::i32: + std::cout << *(int32_t*)(tensor.data()) << std::endl; + break; + case ov::element::i64: + std::cout << *(int64_t*)(tensor.data()) << std::endl; + break; + default: + break; + } +} + +void print_output_tensor_info(const std::string& name, + const ov::Tensor& tensor, + std::map& output_dst) { + std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() + << ", Address: " << output_dst[name] << std::endl; + switch (tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(tensor.data()) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + default: + break; + } +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 4458e71f54be8..96b07008ec3fe 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -4,3 +4,9 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); size_t checksum(const void* data, size_t size); + +void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor); + +void print_output_tensor_info(const std::string& name, + const ov::Tensor& tensor, + std::map& output_dst); From 0805ff5f525977ca6a12d37f89870b6e2c7d2786 Mon Sep 17 00:00:00 2001 From: Viraj Wadhwa Date: Fri, 9 May 2025 11:37:10 -0700 Subject: [PATCH 059/156] Rebase - Bring up to date and fix build process --- docs/build.md | 61 ++ ggml/CMakeLists.txt | 5 + ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 7 + ggml/src/ggml-openvino.cpp | 1074 +---------------------- ggml/src/ggml-openvino/CMakeLists.txt | 42 + ggml/src/ggml-openvino/decoder.h | 13 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 38 +- ggml/src/ggml-openvino/ggml-decoder.h | 14 +- ggml/src/ggml-openvino/utils.cpp | 9 +- ggml/src/ggml-openvino/utils.h | 4 +- 11 files changed, 152 insertions(+), 1116 deletions(-) create mode 100644 ggml/src/ggml-openvino/CMakeLists.txt diff --git a/docs/build.md b/docs/build.md index dcbcce7549ad2..1cd890e64b070 100644 --- a/docs/build.md +++ b/docs/build.md @@ -575,6 +575,67 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md) +## OPENVINO + +### Build openvino-llama + + ```bash + git lfs install --skip-smudge + git clone https://github.com/intel-sandbox/openvino-llama.git -b dev_ggml_frontend + cd openvino-llama + git submodule update --init --recursive + + export OPENVINO_LLAMA_PATH=$(pwd) + + cmake --preset Release + cmake --build build/Release + ``` + +### Build llama.cpp-ov + + ```bash + git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b dev_backend_openvino + cd llama.cpp-ov + + cmake --preset ReleaseOV + cmake --build build/ReleaseOV + ``` + +Download the test model file [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from hugging face website. + ``` bash + wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf?download=true -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf + ``` + +Execute the following command to test. + ```bash + export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache + # Currently GGML_OPENVINO_WEIGHT_AS_INPUT has better performance + export GGML_OPENVINO_WEIGHT_AS_INPUT=1 + ./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " + ``` + +Environment variables: +- GGML_OPENVINO_WEIGHT_AS_INPUT: + Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them. +- GGML_OPENVINO_CACHE_DIR: + If set, model caching in OpenVINO will be used. +- GGML_OPENVINO_DUMP_CGRAPH: + Dumped the compute graph to "cgraph.txt". Note that the the compute graph is different for every token, so the later cgraph will overwrite the previous one. +- GGML_OPENVINO_PROFILING: + Print the time taken for each phase in the OpenVINO backend. +- GGML_OPENVINO_DUMP_IR: + Dump the converted OpenVINO IR. The filenames are timestamps. +- GGML_OPENVINO_DEBUG_INPUT +- GGML_OPENVINO_DEBUG_OUTPUT + +To use Llama.cpp's builtin CPU backend: +```bash +cmake --preset ReleaseCPU +cmake --build build/ReleaseCPU + +./build/ReleaseCPU/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " +``` + ## Notes about GPU-accelerated backends The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`. diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 73032be68e153..8daee61276ebe 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -244,6 +244,10 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING "ggml: sycl device architecture") +option(GGML_OPENVINO "ggml: use OPENVINO" OFF) +option(GGML_OPENVINO_DEBUG "ggml: enable OPENVINO debugging" OFF) +option(GGML_OV_FRONTEND "ggml: OPENVINO frontend path" ON) + option(GGML_OPENCL "ggml: use OpenCL" OFF) option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF) option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON) @@ -315,6 +319,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-sycl.h include/ggml-vulkan.h include/ggml-webgpu.h + include/ggml-openvino.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 892c23318a18e..746fd9632458b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -390,6 +390,7 @@ ggml_add_backend(Vulkan) ggml_add_backend(WebGPU) ggml_add_backend(zDNN) ggml_add_backend(OpenCL) +ggml_add_backend(OPENVINO) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 136afec748d96..1f5c10e83b1ef 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -69,6 +69,10 @@ #include "ggml-cann.h" #endif +#ifdef GGML_USE_OPENVINO +#include "ggml-openvino.h" +#endif + // disable C++17 deprecation warning for std::codecvt_utf8 #if defined(__clang__) # pragma clang diagnostic push @@ -208,6 +212,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif +#ifdef GGML_USE_OPENVINO + register_backend(ggml_backend_openvino_reg()); +#endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 5221a1ff8bd48..f5d5c7ed6798d 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -55,1023 +55,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { openvino_frontend_compute(backend, cgraph); - ov::Core core; - - // set the shape and stride of dst - dst->ne[0] = src0->ne[0]; - dst->ne[1] = src0->ne[1]; - dst->nb[0] = src0->nb[0]; - dst->nb[1] = src0->nb[1]; - - if (src0 == nullptr || src1 == nullptr) { - std::cerr << "Error: src0 or src1 is null." << std::endl; - return; - } - - // Step 2: Check that the input tensor types and shapes match - if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32) { - std::cerr << "Error: Unsupported tensor type. Only GGML_TYPE_F32 is supported for OpenVINO." << std::endl; - return; - } - if (src0->ne[0] != src1->ne[0] || src0->ne[1] != src1->ne[1]) { - std::cerr << "Error: src0 and src1 shapes do not match." << std::endl; - return; - } - - ov::Tensor input0 = ov::Tensor(ov::element::f32, {static_cast(src0->ne[0]), static_cast(src0->ne[1])}, src0->data); - ov::Tensor input1 = ov::Tensor(ov::element::f32, {static_cast(src1->ne[0]), static_cast(src1->ne[1])}, src1->data); - - auto input0_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); - auto input1_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); - auto add = std::make_shared(input0_param, input1_param); - auto model = std::make_shared(add, ov::ParameterVector{input0_param, input1_param}); - - // compile model and store in context -#ifdef GGML_OPENVINO_GPU - auto compiled_model = core.compile_model(model, "GPU"); -#elif GGML_OPENVINO_NPU - auto compiled_model = core.compile_model(model, "NPU"); -#else - auto compiled_model = core.compile_model(model, "CPU"); -#endif - // initialize infer request - auto infer_request = compiled_model.create_infer_request(); - - // Step 4: set input data, copy src0 and src1 data to OpenVINO input tensors - infer_request.set_tensor(input0_param, input0); - infer_request.set_tensor(input1_param, input1); - - // Step 5: execute inference - infer_request.infer(); - - // Step 6: get output data - ov::Tensor output = infer_request.get_tensor(compiled_model.output()); - - // // Allocate memory for dst->data if not already allocated - // if (dst->data == nullptr) { - // dst->data = malloc(dst->nb[0] * dst->ne[0]); - // if (dst->data == nullptr) { - // std::cerr << "Error: Failed to allocate memory for dst->data." << std::endl; - // return; - // } - // } - - std::memcpy(dst->data, output.data(), output.get_byte_size()); - - if (dst->ne[0] != src0->ne[0] || dst->ne[1] != src0->ne[1]) { - std::cerr << "Error: dst tensor shape does not match input tensor shape." << std::endl; - return; - } - - // float* dst_data1 = (float*)(dst->data); - // printf("Output data:");; - // for (int i = 0; i < (10 < (int)(dst->ne[0]) ? 10 : (int)(dst->ne[0])); ++i) { - // printf("%f ", dst_data1[i]); - // } - // printf("\n"); - // fflush(stdout); -} - -static void ggml_backend_openvino_mul_forward(ggml_tensor * dst) { - struct ggml_tensor *src0 = dst->src[0]; - struct ggml_tensor *src1 = dst->src[1]; - - ov::Core core; - - // define shape - ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // For Example: [7, 3072] - ov::Shape shape1 = {static_cast(src1->ne[1]), static_cast(src1->ne[0])}; // For Example: [1, 3072] -> broadcast to [7, 3072] - - // create OpenVINO tensor (src0 and src1) - ov::Tensor tensor0(ov::element::f32, shape0, src0->data); - ov::Tensor tensor1(ov::element::f32, shape1, src1->data); - - // define input parameters - auto input0 = std::make_shared(ov::element::f32, shape0); - auto input1 = std::make_shared(ov::element::f32, shape1); - - // create a multiply operation using broadcasting - auto multiply = std::make_shared(input0, input1); - - // create model - auto model = std::make_shared(multiply, ov::ParameterVector{input0, input1}); - // compile model and store in context -#ifdef GGML_OPENVINO_GPU - ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); -#elif GGML_OPENVINO_NPU - ov::CompiledModel compiled_model = core.compile_model(model, "NPU"); -#else - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); -#endif - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - infer_request.set_tensor(input0, tensor0); - infer_request.set_tensor(input1, tensor1); - - infer_request.infer(); - - // get output tensor and copy it back to dst->data - ov::Tensor output_tensor = infer_request.get_output_tensor(); - std::memcpy(dst->data, output_tensor.data(), src0->ne[0] * src0->ne[1] * sizeof(float)); -} - -static void ggml_backend_openvino_add(ggml_tensor * dst) { - // Placeholder for OpenVINO add operation - // GGML_ASSERT(ctx.device != 0); - GGML_ASSERT(dst->data != nullptr); - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - switch (src0->type) { - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F16) { - // ggml_backend_openvino_add_forward(ctx, dst, src0, src1); - } else if (src1->type == GGML_TYPE_F32) { - // ggml_compute_forward_add_f16_f32(params, dst); - } else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_F32: - { - if (src1->type == GGML_TYPE_F32) { - { - ggml_backend_openvino_add_forward(dst); - } - } - else { - GGML_ABORT("fatal error"); - } - } break; - default: - GGML_ABORT("%s: unsupported type %d\n", __func__, src1->type); - } - -} - -static void ggml_backend_openvino_mul(ggml_tensor * dst) { - GGML_ASSERT(dst->data != nullptr); - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_backend_openvino_mul_forward(dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -void ggml_compute_forward_get_rows_f16(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - - ov::Core core; - - ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // [3072, 7] - ov::Shape shape1 = {static_cast(src1->ne[0])}; // [7] - - ov::Tensor tensor0(ov::element::f16, shape0, src0->data); - ov::Tensor tensor1(ov::element::i32, shape1, src1->data); - - auto input0 = std::make_shared(ov::element::f16, shape0); - auto input1 = std::make_shared(ov::element::i32, shape1); - - auto gather = std::make_shared(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); - - auto model = std::make_shared(gather, ov::ParameterVector{input0, input1}); - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - infer_request.set_tensor(input0, tensor0); - infer_request.set_tensor(input1, tensor1); - - infer_request.infer(); - - ov::Tensor output_tensor = infer_request.get_output_tensor(); - // Convert output tensor data type from f16 to f32 - ov::Tensor output_tensor_f32 = ov::Tensor(ov::element::f32, output_tensor.get_shape()); - for (size_t i = 0; i < output_tensor.get_size(); ++i) { - output_tensor_f32.data()[i] = static_cast(output_tensor.data()[i]); - } - - // Copy the converted data to dst->data - std::memcpy(dst->data, output_tensor_f32.data(), output_tensor_f32.get_byte_size()); -} - -void ggml_compute_forward_get_rows_f32(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - - ov::Core core; - - ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // [3072, 7] - ov::Shape shape1 = {static_cast(src1->ne[0])}; // [7] - - ov::Tensor tensor0(ov::element::f32, shape0, src0->data); - ov::Tensor tensor1(ov::element::i32, shape1, src1->data); - - auto input0 = std::make_shared(ov::element::f32, shape0); - auto input1 = std::make_shared(ov::element::i32, shape1); - - auto gather = std::make_shared(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); - - auto model = std::make_shared(gather, ov::ParameterVector{input0, input1}); - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - infer_request.set_tensor(input0, tensor0); - infer_request.set_tensor(input1, tensor1); - - infer_request.infer(); - - ov::Tensor output_tensor = infer_request.get_output_tensor(); - - // Copy the converted data to dst->data - std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); -} - -void ggml_compute_forward_get_rows(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_get_rows_f16(dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_get_rows_f32(dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } - -} - -void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - assert(src0 != nullptr); - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t input_size = ne0 * ne1 * ne2; - - const float *src_data = static_cast(src0->data); - float *dst_data = static_cast(dst->data); - assert(dst_data != nullptr); - - ov::Core core; - - ov::Shape input_shape = {static_cast(ne2), static_cast(ne1), static_cast(ne0)}; - ov::Tensor input_tensor(ov::element::f32, input_shape, const_cast(src_data)); - - auto input_param = std::make_shared( - input_tensor.get_element_type(), - input_tensor.get_shape() - ); - assert(input_param != nullptr && "Input parameter creation failed!"); - - auto square = std::make_shared(input_param, input_param); - auto reduce_sum = std::make_shared( - square, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), - true - ); - - auto mean = std::make_shared( - reduce_sum, - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast(ne0)}) - ); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - auto rms = std::make_shared( - std::make_shared( - mean, - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}) - ) - ); - - auto scale = std::make_shared( - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), - rms - ); - - auto normalized_input = std::make_shared(input_param, scale); - - ov::ParameterVector parameters = {input_param}; - auto model = std::make_shared(ov::NodeVector{normalized_input}, parameters); - - // static bool model_saved = false; - // if (!model_saved) { - // std::cout << "\n rms model saved" << std::endl; - // ov::save_model(model, "//rms_norm_model.xml"); - // model_saved = true; - // } - - auto compiled_model = core.compile_model(model, "CPU"); - - auto infer_request = compiled_model.create_infer_request(); - - infer_request.set_input_tensor(0, input_tensor); - - infer_request.infer(); - - auto output_tensor = infer_request.get_output_tensor(); - assert(output_tensor.get_size() == input_size); - - std::memcpy(dst_data, output_tensor.data(), input_size * sizeof(float)); -} - -void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_backend_openvino_rms_norm_f32(dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { - // NOP - GGML_UNUSED(dst); -} - -// Extracting valid shapes -std::vector get_effective_shape(const ggml_tensor * t) { - std::vector shape; - for (int i = 2; i >= 0; i--) { - if (t->ne[i] != 1 || t->ne[2] != 1) - shape.push_back(t->ne[i]); - } - return shape; -} - -/* -* Construct an index vector for Gather to extract non-contiguous data. -* Parameters: -* - valid_cols: number of valid columns per row (e.g., for src0, valid columns = 96) -* - num_rows: number of rows in each batch (e.g., src0: 32 rows per batch) -* - batch: number of batches (e.g., 32) -* - row_stride: physical row length (in elements), e.g., src0: nb[1]/(element_size) = 6144/2 = 3072 -* - batch_stride: physical batch stride (in elements), e.g., src0: nb[2]/(element_size) = 192/2 = 96 -*/ -std::vector build_indices(int valid_cols, int num_rows, int batch, int row_stride, int batch_stride) { - std::vector indices; - indices.reserve(valid_cols * num_rows * batch); - for (int b = 0; b < batch; b++) { - for (int r = 0; r < num_rows; r++) { - for (int c = 0; c < valid_cols; c++) { - // 计算物理索引 = b * batch_stride + r * row_stride + c - indices.push_back(b * batch_stride + r * row_stride + c); - } - } - } - return indices; -} - -void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { - assert(dst && dst->src[0] && dst->src[1]); - const ggml_tensor * src0 = dst->src[0]; // src0 type F16 - const ggml_tensor * src1 = dst->src[1]; // src1 type F32 - - if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) { - int valid_cols_src0 = src0->ne[0]; // 96 - int num_rows_src0 = src0->ne[1]; // 32 - int batch_src0 = src0->ne[2]; // 32 - - int valid_cols_src1 = src1->ne[0]; // 96 - int num_rows_src1 = src1->ne[1]; // 7 - int batch_src1 = src1->ne[2]; // 32 - - // 对 src0:row_stride = nb[1] / nb[0] - int row_stride_src0 = src0->nb[1] / src0->nb[0]; // 6144 / 2 = 3072 - int batch_stride_src0 = src0->nb[2] / src0->nb[0]; // 192 / 2 = 96 - - // 对 src1:row_stride = nb[1] / nb[0] - int row_stride_src1 = src1->nb[1] / src1->nb[0]; // 12288 / 4 = 3072 - int batch_stride_src1 = src1->nb[2] / src1->nb[0]; // 384 / 4 = 96 - - std::vector indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0, row_stride_src0, batch_stride_src0); - std::vector indices_src1 = build_indices(valid_cols_src1, num_rows_src1, batch_src1, row_stride_src1, batch_stride_src1); - - size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32 - size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32 - - ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), - static_cast(src0->ne[1]), - static_cast(src0->ne[0])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), - static_cast(src1->ne[1]), - static_cast(src1->ne[0])}; - - auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); - - ov::Shape flat_shape_src0 = { total_src0 }; - ov::Shape flat_shape_src1 = { total_src1 }; - - auto flatten_src0 = std::make_shared( - param_src0, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src0) }), - false); - auto flatten_src1 = std::make_shared( - param_src1, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src1) }), - false); - - auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0); - auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1); - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - - auto gathered_src0 = std::make_shared(flatten_src0, indices_const_src0, axis_const); - auto gathered_src1 = std::make_shared(flatten_src1, indices_const_src1, axis_const); - - std::vector shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 }; - auto reshape_src0 = std::make_shared( - gathered_src0, - ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont), - false); - - std::vector shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 }; - auto reshape_src1 = std::make_shared( - gathered_src1, - ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont), - false); - - auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); - auto src0_transposed = std::make_shared(src0_f32, transpose_order); - - auto A = src0_transposed; - auto B = reshape_src1; - - auto batched_matmul = std::make_shared(B, A, false, false); - - std::vector final_output_shape = {static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0])}; - - auto reshape_output = std::make_shared( - batched_matmul, - ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), - false); - - auto model = std::make_shared(ov::NodeVector{ reshape_output }, - ov::ParameterVector{ param_src0, param_src1 }); - - ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data }; - ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data }; - ov::Shape output_shape = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src0); - infer_request.set_input_tensor(1, tensor_src1); - infer_request.set_output_tensor(0, tensor_dst); - infer_request.infer(); - return ; - } - - int rank = 0; - if (dst->ne[2] == 1 && dst->ne[3] == 1) { - rank = 2; - } else if (dst->ne[3] == 1) { - rank = 3; - } else { - throw std::runtime_error("Only rank 2 or rank 3 are supported in this implementation."); - } - - std::vector eff_shape_src0 = get_effective_shape(src0); - std::vector eff_shape_src1 = get_effective_shape(src1); - std::vector eff_shape_dst = get_effective_shape(dst); - - ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), - static_cast(src0->ne[1]), - static_cast(src0->ne[0])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), - static_cast(src1->ne[1]), - static_cast(src1->ne[0])}; - auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); - - auto reshape_src0 = std::make_shared( - param_src0, - ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0), - false); - auto reshape_src1 = std::make_shared( - param_src1, - ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1), - false); - - auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - - ov::Output A_for_mul; - if (rank == 2) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 2 }, std::vector{1, 0}); - A_for_mul = std::make_shared(src0_f32, trans_order); - } else if (rank == 3) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 3 }, std::vector{0, 2, 1}); - A_for_mul = std::make_shared(src0_f32, trans_order); - } else { - A_for_mul = src0_f32; - } - - auto matmul = std::make_shared(reshape_src1, A_for_mul, false, false); - - auto matmul_output_shape = matmul->get_output_shape(0); - std::vector final_output_shape; - if (matmul_output_shape.size() == 1) { - final_output_shape = { 1, 1, static_cast(matmul_output_shape[0]) }; - } else if (matmul_output_shape.size() == 2) { - final_output_shape = { 1, static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]) }; - } else { - final_output_shape = { static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]), static_cast(matmul_output_shape[2]) }; - } - - auto reshape_output = std::make_shared( - matmul, - ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), - false - ); - - auto model = std::make_shared(ov::NodeVector{ reshape_output }, - ov::ParameterVector{ param_src0, param_src1 }); - - ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, (void *)src0->data }; - ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, (void *)src1->data }; - - ov::Shape output_shape = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src0); - infer_request.set_input_tensor(1, tensor_src1); - infer_request.set_output_tensor(0, tensor_dst); - infer_request.infer(); -} - -void ggml_backend_openvino_reshape(ggml_tensor *dst) { - - GGML_UNUSED(dst); -} - -void ggml_backend_openvino_view(ggml_tensor *dst) { - - GGML_UNUSED(dst); -} - -void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - - // Validate tensor properties - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - GGML_ASSERT(src0->type == dst->type); - - // Determine tensor properties - const size_t element_size = ggml_type_size(src0->type); - - // Case 1: Both tensors are contiguous - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && (src0->ne[0] * element_size == src0->nb[1])) { - ov::Shape input_shape = { - static_cast(src0->ne[2]), - static_cast(src0->ne[1]), - static_cast(src0->ne[0]) - }; - size_t num_elements = 1; - for (auto d : input_shape) { - num_elements *= d; - } - ov::Shape flat_shape = { num_elements }; - - ov::Shape dst_shape = { - static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) - }; - - auto input_param = std::make_shared(ov::element::f32, input_shape); - - std::vector flat_shape_vec(flat_shape.begin(), flat_shape.end()); - auto flat_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { flat_shape_vec.size() }, flat_shape_vec); - auto flat_reshape = std::make_shared(input_param, flat_reshape_const, false); - - std::vector dst_shape_vec(dst_shape.begin(), dst_shape.end()); - auto dst_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { dst_shape_vec.size() }, dst_shape_vec); - auto final_reshape = std::make_shared(flat_reshape, dst_reshape_const, false); - - auto model = std::make_shared(ov::OutputVector{ final_reshape }, ov::ParameterVector{ input_param }); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, dst_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - return; - } - - // Case 2: Compatible types, dimensions, and strides - const size_t ne00 = src0->ne[0]; - const size_t ne01 = src0->ne[1]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb0 = dst->nb[0]; - - if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { - const size_t valid_elems = static_cast(src0->ne[0]); // 3072 - const size_t num_rows = static_cast(src0->ne[1]); // 7 - const size_t dim2 = static_cast(src0->ne[2]); // 1 - - size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216 - // size_t phys_stride = static_cast(src0->ne[0]); // 3072 - - ov::Shape input_shape = { dim2, num_rows, phys_stride }; // 如 {1, 7, 9216 } - ov::Shape logical_shape = { dim2, num_rows, valid_elems }; // {1, 7, 3072} - - // std::cout << "CONT input shape: " << input_shape << std::endl; - auto input_param = std::make_shared(ov::element::f32, input_shape); - - // int64_t split_addr = dst->src[0]->view_offs / dst->src[0]->nb[0]; - // std::vector begin = { 0, 0, split_addr }; - // std::vector end = { static_cast(dim2), - // static_cast(num_rows), - // split_addr + static_cast(valid_elems) }; - - std::vector begin = { 0, 0, 0 }; - std::vector end = { static_cast(dim2), - static_cast(num_rows), - static_cast(valid_elems) }; - std::vector strides = { 1, 1, 1 }; - - auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); - - std::vector begin_mask = {0, 0, 0}; - std::vector end_mask = {0, 0, 0}; - auto slice = std::make_shared( - input_param, - begin_const, - end_const, - strides_const, - begin_mask, - end_mask - ); - - auto model = std::make_shared(ov::OutputVector{ slice }, - ov::ParameterVector{ input_param }); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - //[NOTE]: input_shape should be {1, 7, 9216} not the original shap of src0. - ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, logical_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - return; - } - - // Case 3: Non-contiguous source, contiguous destination - // dst->ne =[3072,7,1,1], dst->nb =[4,12288,86016,86016], dst->type=GGML_TYPE_F32 - // dst->src[0]->ne=[96,32,7,1], dst->src[0]->nb=[4,2688,384,86016], dst->src[0]->type=GGML_TYPE_F32 - if (ggml_is_contiguous(dst)) { - size_t valid_i = static_cast(src0->ne[0]); // 96 - size_t valid_j = static_cast(src0->ne[1]); // 32 - size_t valid_k = static_cast(src0->ne[2]); // 7 - - ov::Shape src_shape = { valid_k, valid_j, valid_i }; // {7, 32, 96}; - auto src_param = std::make_shared(ov::element::f32, src_shape); - - ov::Shape input_shape = { valid_j, valid_k, valid_i }; // {32, 7, 96} - auto tmp_param = ov::op::v0::Constant::create(ov::element::i64, { input_shape.size() }, input_shape); - auto input_param = std::make_shared(src_param, tmp_param, false); - - // 添加 Transpose 节点,将 {32,7,96} 变换为 {7,32,96},恢复逻辑顺序 - // 这里交换第 0 与第 1 维,即 permutation = {1, 0, 2} - std::vector order = {1, 0, 2}; - auto order_const = ov::op::v0::Constant::create(ov::element::i64, {order.size()}, order); - auto transpose = std::make_shared(input_param, order_const); - - ov::Shape target_shape = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0]) }; // {1, 7, 3072} - std::vector target_shape_vec = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, { target_shape_vec.size() }, target_shape_vec); - auto reshaped = std::make_shared(transpose, reshape_const, false); - - auto model = std::make_shared(ov::OutputVector{ reshaped }, - ov::ParameterVector{ src_param }); - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, src_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, target_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - return; - } -} - -static void ggml_backend_openvino_transpose(ggml_tensor *dst) { - // ov::Core core; - // ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; - // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - // auto input_param = std::make_shared(ov::element::f32, input_shape); - - // //auto res = std::make_shared(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - - - - // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, - // ov::Shape{output_shape.size()}, - // std::vector(output_shape.begin(), output_shape.end())); - // auto res = std::make_shared(input_param, new_shape_node, false); - - - - - // std::shared_ptr model = std::make_shared(ov::OutputVector{res}, - // ov::ParameterVector{input_param}); - // auto compiled_model = core.compile_model(model, "CPU"); - // ov::InferRequest infer_request = compiled_model.create_infer_request(); - - // ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); - // ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); - // infer_request.set_input_tensor(0, input_tensor); - // infer_request.set_output_tensor(0, output_tensor); - - // infer_request.infer(); - - // NOP - GGML_UNUSED(dst); -} - -void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - assert(src0 != nullptr); - assert(ggml_nelements(dst) == ggml_nelements(src0)); - - // Extract shapes - ov::Shape src_shape(src0->ne, src0->ne + 4); - ov::Shape dst_shape(dst->ne, dst->ne + 4); - - // Initialize OpenVINO core - ov::Core core; - - // Create OpenVINO parameter for the source tensor - auto src_input = std::make_shared(ov::element::f32, src_shape); - - std::shared_ptr model; - if (ggml_is_contiguous(dst)) { - // Contiguous Case: Flatten src and reshape to dst shape - ov::Shape flattened_shape = {static_cast(ggml_nelements(src0))}; - auto flatten = std::make_shared( - src_input, ov::op::v0::Constant::create(ov::element::i64, {1}, flattened_shape), false); - - auto reshape_to_dst = std::make_shared( - flatten, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape), false); - - auto dst_output = std::make_shared(reshape_to_dst, ov::element::f16); - - model = std::make_shared( - ov::ResultVector{std::make_shared(dst_output)}, - ov::ParameterVector{src_input}, - "ContiguousCopy"); - // Compile and execute the model - auto compiled_model = core.compile_model(model, "CPU"); - - ov::Tensor src_tensor(ov::element::f32, src_shape, src0->data); - ov::Tensor dst_tensor(ov::element::f16, dst_shape, dst->data); - - auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, src_tensor); - infer_request.set_output_tensor(0, dst_tensor); - infer_request.infer(); - } else { - int src0_elem_size = ggml_type_size(src0->type); - int src1_elem_size = ggml_type_size(src1->type); - - int src0_logical_cols = src0->ne[0]; - int src0_logical_rows = src0->ne[1]; - int src1_logical_cols = src1->ne[0]; - int src1_logical_rows = src1->ne[1]; - - int src0_phys_cols = src0->nb[0] / src0_elem_size; - int src0_phys_rows = src0_logical_rows; - - int src1_phys_cols = src1->nb[1] / src1_elem_size; - int src1_phys_rows = src1_logical_rows; - - ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; - ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; - - size_t logical_elems = static_cast(src0_logical_cols * src0_logical_rows); - size_t src_flat_size = 1 * src0_phys_cols * src0_phys_rows; - size_t dst_flat_size = 1 * src1_phys_rows * src1_phys_cols; - - ov::Core core; - - std::vector gather_idx; - gather_idx.reserve(logical_elems); - for (int row = 0; row < src0_logical_rows; row++) { - for (int col = 0; col < src0_logical_cols; col++) { - gather_idx.push_back(static_cast(row + col * src0_phys_rows)); - } - } - ov::Shape gather_idx_shape = { logical_elems }; - - std::vector scatter_idx; - scatter_idx.reserve(logical_elems); - for (int row = 0; row < src1_logical_rows; row++) { - for (int col = 0; col < src1_logical_cols; col++) { - scatter_idx.push_back(static_cast(row * src1_phys_cols + col)); - } - } - ov::Shape scatter_idx_shape = { logical_elems, 1 }; - - auto param_src0 = std::make_shared(ov::element::f32, src0_phys_shape); - auto param_src1 = std::make_shared(ov::element::f16, src1_phys_shape); - - auto src_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, - { static_cast(src_flat_size) }); - auto reshape_src = std::make_shared(param_src0, src_flat_shape_const, false); - auto dst_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, - { static_cast(dst_flat_size) }); - auto reshape_dst = std::make_shared(param_src1, dst_flat_shape_const, false); - - auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto gathered = std::make_shared(reshape_src, gather_indices_const, axis_const); - auto converted = std::make_shared(gathered, ov::element::f16); - - auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx); - auto scatter = std::make_shared(reshape_dst, scatter_indices_const, converted); - - std::vector dst_phys_shape_vec = {1, static_cast(src1_phys_rows), - static_cast(src1_phys_cols) }; - auto dst_phys_shape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, dst_phys_shape_vec); - auto final_output = std::make_shared(scatter, dst_phys_shape_const, false); - - ov::ParameterVector params = { param_src0, param_src1 }; - auto model = std::make_shared(ov::OutputVector{ final_output }, params); - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - ov::Tensor tensor_src(ov::element::f32, src0_phys_shape, src0->data); - ov::Tensor tensor_dst(ov::element::f16, src1_phys_shape, src1->data); - infer_request.set_input_tensor(0, tensor_src); - infer_request.set_input_tensor(1, tensor_dst); - - ov::Tensor out_tensor(ov::element::f16, src1_phys_shape, dst->data); - infer_request.set_output_tensor(0, out_tensor); - - infer_request.infer(); - } -} - -static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - // Find the indices of GGML_OP_CONT, GGML_OP_CPY nodes, GGML_OP_MUL_MAT and so on. - std::vector cont_indices; - std::vector reshape_indices; - std::vector view_indices; - std::vector view_indices_prompt; - std::vector view_split; - - std::vector cpy_indices; - std::vector cpy_split_16; - std::vector cpy_split_19; - std::vector transpose_indices; - std::vector permute_indices; - - std::vector mul_mat_indices; - std::vector add_indices; - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (cgraph->nodes[i]->op == GGML_OP_CONT) { - cont_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_RESHAPE) { - reshape_indices.push_back(i); - // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { - } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { - // if (cgraph->nodes[i]->src[0]->ne[0] == 98304 && (cgraph->nodes[i]->ne[0] == 3072 || cgraph->nodes[i]->ne[0] == 1)) - // continue; - view_indices.push_back(i); - if (cgraph->nodes[i]->ne[0] == 32) { - view_indices_prompt.push_back(i); - } - if (i == 18) { - view_split.push_back(i); - } - } else if (cgraph->nodes[i]->op == GGML_OP_CPY) { - cpy_indices.push_back(i); - if (i == 16) { - cpy_split_16.push_back(i); - } - if (i == 19) { - cpy_split_19.push_back(i); - } - } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) { - transpose_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_PERMUTE) { - permute_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) { - mul_mat_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_ADD) { - add_indices.push_back(i); - } - } - - - // Process nodes in order - - if (cgraph->nodes[0]->ne[1] == 1) { - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { - ggml_backend_openvino_add_forward(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); - } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() - && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() - && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); - } - } - } - } else { - int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); - } - return GGML_STATUS_SUCCESS; - GGML_UNUSED(backend); - GGML_UNUSED(ctx); } static const ggml_backend_i ggml_backend_openvino_interface = { @@ -1265,53 +250,15 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); -#ifdef OPENVINO_OP_DEBUG -static const std::set& openvino_ops = []() -> const std::set& { - static const std::set ops = get_openvino_available_opsets(); - return ops; - }(); - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - return true; - case GGML_OP_ADD: - return true; - case GGML_OP_MUL: - case GGML_OP_MUL_MAT: - return false; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(op)) - { - case GGML_UNARY_OP_SILU: - return true; - case GGML_UNARY_OP_ABS: - case GGML_UNARY_OP_SGN: - case GGML_UNARY_OP_NEG: - case GGML_UNARY_OP_STEP: - case GGML_UNARY_OP_TANH: - case GGML_UNARY_OP_ELU: - case GGML_UNARY_OP_RELU: - case GGML_UNARY_OP_SIGMOID: - case GGML_UNARY_OP_GELU: - case GGML_UNARY_OP_GELU_QUICK: - case GGML_UNARY_OP_HARDSWISH: - case GGML_UNARY_OP_HARDSIGMOID: - case GGML_UNARY_OP_EXP: - case GGML_UNARY_OP_COUNT: - return false; - } - return false; - default: - return false; - } -#else - static const std::set& openvino_ops = []() -> const std::set& { - static const std::set ops = get_openvino_available_opsets(); - return ops; - }(); + static const std::set supported_ops{ + GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, + GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, + GGML_OP_SCALE, GGML_OP_SOFT_MAX, + }; + static const std::set supported_unary_ops{ + GGML_UNARY_OP_SILU, + }; if (op->op == GGML_OP_UNARY) { return supported_unary_ops.find(ggml_get_unary_op(op)) != @@ -1457,5 +404,4 @@ GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { } return ® -} - +} \ No newline at end of file diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt new file mode 100644 index 0000000000000..75b11448436d2 --- /dev/null +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -0,0 +1,42 @@ +find_package(OpenVINO REQUIRED) +list(APPEND GGML_EXTRA_LIBS_PRIVATE openvino::runtime) + +# Set header and libs +file(GLOB GGML_HEADERS_OPENVINO "ggml-openvino/*.h") +list(APPEND GGML_HEADERS_OPENVINO "../include/ggml-openvino.h") +file(GLOB GGML_SOURCES_OPENVINO "ggml-openvino/*.cpp") +list(APPEND GGML_SOURCES_OPENVINO "ggml-openvino.cpp") + +list(APPEND GGML_CDEF_PUBLIC GGML_USE_OPENVINO) + +if (OPENVINO_DEVICE) + if (OPENVINO_DEVICE STREQUAL "GPU") + add_compile_definitions(GGML_OPENVINO_GPU) + elseif (OPENVINO_DEVICE STREQUAL "NPU") + add_compile_definitions(GGML_OPENVINO_NPU) + endif() +endif() + +if(NOT DEFINED GGML_OV_FRONTEND) + set(GGML_OV_FRONTEND OpenVINO_DIR) +endif() +add_definitions(-DGGML_OV_FRONTEND="${GGML_OV_FRONTEND}") + +if (OpenVINO_DIR) + if (GGML_OPENVINO) + if (NOT UNIX) + set(GGML_OPENVINO OFF) + message(WARNING "OpenVINO: OpenVINO toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_OPENVINO") + endif() + endif() + + if (GGML_OPENVINO) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + else() + set(GGML_OPENVINO OFF) + message(WARNING "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_OPENVINO") + endif() + endif() + +endif() diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index 790ed2e88d773..3404e7c211e62 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -1,9 +1,8 @@ #pragma once #include - -#include "openvino/core/node.hpp" -#include "openvino/frontend/decoder.hpp" +#include +#include namespace ov { namespace frontend { @@ -43,11 +42,7 @@ class GgmlDecoder : public DecoderBase { virtual std::string& get_output_name(size_t index) const = 0; - virtual size_t get_output_size() const = 0; - - virtual bool is_graph_output(size_t index) const = 0; - - virtual std::string& get_output_name(size_t index) const = 0; + virtual std::vector get_output_names() const = 0; virtual const std::string& get_op_type() const = 0; @@ -65,4 +60,4 @@ class GgmlDecoder : public DecoderBase { } // namespace ggml } // namespace frontend -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 372f880b1d4a6..28409186f8274 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -354,7 +354,7 @@ std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { std::vector stride; - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { stride.push_back(static_cast(tensor->nb[i])); } return stride; @@ -448,27 +448,16 @@ void GgmlOvDecoder::visit_subgraph(std::function opTypeMap = { - {GGML_OP_ACC, "GGML_OP_ACC"}, - {GGML_OP_ADD, "GGML_OP_ADD"}, - {GGML_OP_ADD1, "GGML_OP_ADD1"}, - {GGML_OP_CONT, "GGML_OP_CONT"}, - {GGML_OP_CPY, "GGML_OP_CPY"}, - {GGML_OP_DIV, "GGML_OP_DIV"}, - {GGML_OP_DUP, "GGML_OP_DUP"}, - {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, - {GGML_OP_MUL, "GGML_OP_MUL"}, - {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, - {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, - {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, - {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, - {GGML_OP_ROPE, "GGML_OP_ROPE"}, - {GGML_OP_SCALE, "GGML_OP_SCALE"}, - {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, - {GGML_OP_SUB, "GGML_OP_SUB"}, - {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, - {GGML_OP_UNARY, "GGML_OP_UNARY"}, - {GGML_OP_VIEW, "GGML_OP_VIEW"} - }; + {GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"}, + {GGML_OP_ADD1, "GGML_OP_ADD1"}, {GGML_OP_CONT, "GGML_OP_CONT"}, + {GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"}, + {GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, + {GGML_OP_MUL, "GGML_OP_MUL"}, {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, + {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, + {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, {GGML_OP_ROPE, "GGML_OP_ROPE"}, + {GGML_OP_SCALE, "GGML_OP_SCALE"}, {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, + {GGML_OP_SUB, "GGML_OP_SUB"}, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, + {GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"}}; static const std::map unaryOpTypeMap = { {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"}, {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"}, @@ -484,8 +473,7 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"}, {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"}, {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"}, - {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"} - }; + {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}}; auto it = opTypeMap.find(m_node->op); if (it != opTypeMap.end()) { if (it->first == GGML_OP_UNARY) { @@ -498,4 +486,4 @@ const std::string& GgmlOvDecoder::get_op_type() const { } static const std::string unknown_op = "UNKNOWN_OP"; return unknown_op; -} +} \ No newline at end of file diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 22ff9d85f76d4..a0f6cbea30b10 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -53,11 +53,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual std::string& get_output_name(size_t index) const override; - virtual size_t get_output_size() const override; - - virtual bool is_graph_output(size_t index) const override; - - virtual std::string& get_output_name(size_t index) const override; + virtual std::vector get_output_names() const override; virtual const std::string& get_op_type() const override; @@ -105,10 +101,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void set_max_token_len(); int64_t m_max_token_len; - struct ggml_cgraph * m_cgraph; - std::map m_inputs; + struct ggml_cgraph* m_cgraph; + std::map m_inputs; std::vector m_input_names; - std::map m_outputs; + std::map m_outputs; std::vector m_output_names; ggml_tensor* m_node; std::vector m_nodes; @@ -123,4 +119,4 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::vector m_model_output_names; }; -void print_tensor_address_map(const struct ggml_cgraph* cgraph); +void print_tensor_address_map(const struct ggml_cgraph* cgraph); \ No newline at end of file diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 6166161c41127..f36700d5ec148 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -42,12 +42,7 @@ std::map get_ggml_graph_output_dst(std::shared_ptr& output_dst); + std::map& output_dst); \ No newline at end of file From daabbb3d6cf812f928abbea61737b30cac875665 Mon Sep 17 00:00:00 2001 From: Zijun Yu Date: Tue, 13 May 2025 14:31:23 +0800 Subject: [PATCH 060/156] fix build error --- ggml/include/ggml-openvino.h | 38 ++++++--------- ggml/src/ggml-openvino/CMakeLists.txt | 47 +++++-------------- ggml/src/ggml-openvino/ggml-decoder.cpp | 6 +-- .../src/{ => ggml-openvino}/ggml-openvino.cpp | 31 +++++------- 4 files changed, 42 insertions(+), 80 deletions(-) rename ggml/src/{ => ggml-openvino}/ggml-openvino.cpp (94%) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index 9172414c291b5..151c48d40d067 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -14,37 +14,29 @@ extern "C" { #define GGML_OPENVINO_MAX_DEVICES 16 // backend API -GGML_API ggml_backend_t ggml_backend_openvino_init(int device); +GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device); -GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend); // device buffer -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_buffer_type(int device); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_split_buffer_type(const float *tensor_split); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_type(const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU // and GPU -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_host_buffer_type(void); - -GGML_API int ggml_backend_openvino_get_device_count(void); -// GGML_API void ggml_backend_openvino_get_device_description(int device, -// char *description, -// size_t -// description_size); -// GGML_API void ggml_backend_openvino_get_device_memory(int device, size_t -// *free, -// size_t *total); - -// GGML_API bool ggml_backend_openvino_register_host_buffer(void *buffer, size_t -// size); GGML_API void ggml_backend_openvino_unregister_host_buffer(void -// *buffer); - -GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void); + +GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); +// GGML_BACKEND_API void ggml_backend_openvino_get_device_description(int device, char * description, +// size_t description_size); +// GGML_BACKEND_API void ggml_backend_openvino_get_device_memory(int device, size_t * free, size_t * total); + +// GGML_BACKEND_API bool ggml_backend_openvino_register_host_buffer(void * buffer, size_t size); +// GGML_BACKEND_API void ggml_backend_openvino_unregister_host_buffer(void * buffer); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void); struct ggml_openvino_device_info { int device_count; diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt index 75b11448436d2..08712c1527a89 100644 --- a/ggml/src/ggml-openvino/CMakeLists.txt +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -1,42 +1,19 @@ find_package(OpenVINO REQUIRED) -list(APPEND GGML_EXTRA_LIBS_PRIVATE openvino::runtime) -# Set header and libs -file(GLOB GGML_HEADERS_OPENVINO "ggml-openvino/*.h") -list(APPEND GGML_HEADERS_OPENVINO "../include/ggml-openvino.h") -file(GLOB GGML_SOURCES_OPENVINO "ggml-openvino/*.cpp") -list(APPEND GGML_SOURCES_OPENVINO "ggml-openvino.cpp") +file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp") +file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp") -list(APPEND GGML_CDEF_PUBLIC GGML_USE_OPENVINO) +ggml_add_backend_library(ggml-openvino + ${GGML_SOURCES_OPENVINO} + ${GGML_HEADERS_OPENVINO} +) -if (OPENVINO_DEVICE) - if (OPENVINO_DEVICE STREQUAL "GPU") - add_compile_definitions(GGML_OPENVINO_GPU) - elseif (OPENVINO_DEVICE STREQUAL "NPU") - add_compile_definitions(GGML_OPENVINO_NPU) - endif() -endif() - -if(NOT DEFINED GGML_OV_FRONTEND) - set(GGML_OV_FRONTEND OpenVINO_DIR) -endif() -add_definitions(-DGGML_OV_FRONTEND="${GGML_OV_FRONTEND}") +target_link_libraries(ggml-openvino PRIVATE openvino::runtime) -if (OpenVINO_DIR) - if (GGML_OPENVINO) - if (NOT UNIX) - set(GGML_OPENVINO OFF) - message(WARNING "OpenVINO: OpenVINO toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_OPENVINO") - endif() +if (GGML_OPENVINO) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + else() + message(FATAL_ERROR "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}") endif() - - if (GGML_OPENVINO) - if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") - elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") - else() - set(GGML_OPENVINO OFF) - message(WARNING "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_OPENVINO") - endif() - endif() - endif() diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 28409186f8274..43869ec228d78 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -278,8 +279,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { << std::setw(5) << node->ne[2] << ", " << std::setw(5) << node->ne[3] << "] " << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " " - << std::left << std::setw(44) << node->name << std::right - << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ") + << std::left << std::setw(45) << node->name << std::right << std::setw(2) << "[ " << std::setw(0) << node->nb[0] << ", " << std::setw(5) << node->nb[1] << ", " @@ -486,4 +486,4 @@ const std::string& GgmlOvDecoder::get_op_type() const { } static const std::string unknown_op = "UNKNOWN_OP"; return unknown_op; -} \ No newline at end of file +} diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp similarity index 94% rename from ggml/src/ggml-openvino.cpp rename to ggml/src/ggml-openvino/ggml-openvino.cpp index f5d5c7ed6798d..01fccea47a5b5 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -62,7 +62,6 @@ ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * static const ggml_backend_i ggml_backend_openvino_interface = { /* .get_name = */ ggml_backend_openvino_get_name, /* .free = */ ggml_backend_openvino_free, - /* .get_default_buffer_type = */ ggml_backend_openvino_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL, @@ -72,9 +71,6 @@ static const ggml_backend_i ggml_backend_openvino_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_openvino_graph_compute, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, - /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -89,7 +85,7 @@ static ggml_guid_t ggml_backend_openvino_guid(void) { } // backend API -GGML_API ggml_backend_t ggml_backend_openvino_init(int device) { +GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { if (device < 0 || device >= ggml_backend_openvino_get_device_count()) { GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device); return nullptr; @@ -111,30 +107,28 @@ GGML_API ggml_backend_t ggml_backend_openvino_init(int device) { return openvino_backend; } -GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend) { +GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) { GGML_ASSERT(backend->context != nullptr); return true; } // device buffer -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_buffer_type(int device) { +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) { GGML_ASSERT(device >= 0); return nullptr; } // split tensor buffer that splits matrices by rows across multiple devices -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_split_buffer_type(const float *tensor_split) { +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_type(const float * tensor_split) { GGML_ASSERT(tensor_split != nullptr); return nullptr; } // pinned host buffer for use with the CPU backend for faster copies between CPU // and GPU -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_host_buffer_type(void) { return nullptr;} - +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void) { + return nullptr; +} struct ggml_backend_openvino_buffer_type_context { int device; @@ -367,7 +361,7 @@ const ggml_openvino_device_info & ggml_openvino_info() { return info; } -GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { static ggml_backend_reg reg; static bool initialized = false; @@ -394,14 +388,13 @@ GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { ctx->devices.push_back(dev); } - reg = ggml_backend_reg { - /* .interface = */ ggml_backend_openvino_reg_interface, - /* .context = */ ctx - }; + reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_openvino_reg_interface, + /* .context = */ ctx }; } initialized = true; } return ® -} \ No newline at end of file +} From 7b7d22ffc0c445414dee13e36fc7bea97918fbc6 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 13 May 2025 17:45:47 +0800 Subject: [PATCH 061/156] FIX: backend buffer type issue --- ggml/src/ggml-backend-reg.cpp | 1 + ggml/src/ggml-openvino/ggml-openvino.cpp | 15 ++++----------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 1f5c10e83b1ef..dfdb04e3313ed 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -606,6 +606,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("vulkan", silent, dir_path); ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); + ggml_backend_load_best("openvino", silent, dir_path); ggml_backend_load_best("cpu", silent, dir_path); // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend const char * backend_path = std::getenv("GGML_BACKEND_PATH"); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 01fccea47a5b5..19e4ed5b77872 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -46,17 +46,11 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) { GGML_UNUSED(backend); } -static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_cpu_buffer_type(); - GGML_UNUSED(backend); -} - static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; - GGML_UNUSED(backend); } static const ggml_backend_i ggml_backend_openvino_interface = { @@ -108,14 +102,14 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { } GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) { - GGML_ASSERT(backend->context != nullptr); - return true; + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_openvino_guid()); } // device buffer GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) { GGML_ASSERT(device >= 0); - return nullptr; + return ggml_backend_cpu_buffer_type(); + GGML_UNUSED(device); } // split tensor buffer that splits matrices by rows across multiple devices @@ -184,8 +178,7 @@ static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_backend_dev_t dev) { GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_CPU; - // return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + return GGML_BACKEND_DEVICE_TYPE_ACCEL; } static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { From df975a096968c43881fe7492d886835feaebc37b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 9 May 2025 13:07:27 +0800 Subject: [PATCH 062/156] STYLE: clang-format --- ggml/src/ggml-openvino/README.md | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 ggml/src/ggml-openvino/README.md diff --git a/ggml/src/ggml-openvino/README.md b/ggml/src/ggml-openvino/README.md deleted file mode 100644 index 46c2adb438653..0000000000000 --- a/ggml/src/ggml-openvino/README.md +++ /dev/null @@ -1,30 +0,0 @@ -# Instructions to Modify and Build ggml with OpenVINO - -## Step 1: Modify the Source Code - -In order to change the frontend `.so` path to the path to `.so` file, you need to add path to the `.so` file in cmake compiler option: -1. Open a terminal and navigate to the root directory of this repo. -2. Run the following commands to configure: - ```sh - mkdir build - cmake -B build -DGGML_OV_FRONTEND="${openvino_repo_dir}/bin/intel64/Release/libopenvino_ggml_frontend.so" - ``` -Where GGML_OV_FRONTEND should point to the path to `libopenvino_ggml_frontend.so` file. - -## Step 2: Build the Project - -After modifying the source code, you need to build the project using CMake. Follow these steps: - -1. (Optional) Enable debug option for ggml-openvino, this will output dump of subgraph sent to OpenVINO, information after convert ggml_cgraph to GraphIterator, and calculation input value/output value of each OP: - ```sh - cmake -B build -DGGML_OPENVINO_DEBUG=ON - ``` - -2. Run the following commands to configure and build the project: - ```sh - cmake -B build -DGGML_OPENVINO=ON - cmake --build build -j - ``` - -This will configure the project with OpenVINO support and build it using multiple cores for faster compilation. - From f6bb0c6ccfd0ff8d0e7c9f6311b034fefb1568b2 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 9 May 2025 13:04:20 +0800 Subject: [PATCH 063/156] FEAT: Add all conversion code from ov side --- docs/build.md | 6 +- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- .../{decoder.h => openvino/decoder.hpp} | 1 - ggml/src/ggml-openvino/openvino/frontend.cpp | 27 +++ ggml/src/ggml-openvino/openvino/frontend.hpp | 23 +++ .../ggml-openvino/openvino/input_model.cpp | 17 ++ .../ggml-openvino/openvino/input_model.hpp | 29 +++ .../ggml-openvino/openvino/node_context.hpp | 100 ++++++++++ ggml/src/ggml-openvino/openvino/op/add.cpp | 23 +++ ggml/src/ggml-openvino/openvino/op/cont.cpp | 56 ++++++ ggml/src/ggml-openvino/openvino/op/cpy.cpp | 106 +++++++++++ .../ggml-openvino/openvino/op/get_rows.cpp | 40 ++++ ggml/src/ggml-openvino/openvino/op/mul.cpp | 28 +++ ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 127 +++++++++++++ .../src/ggml-openvino/openvino/op/permute.cpp | 22 +++ .../src/ggml-openvino/openvino/op/reshape.cpp | 35 ++++ .../ggml-openvino/openvino/op/rms_norm.cpp | 47 +++++ ggml/src/ggml-openvino/openvino/op/rope.cpp | 171 ++++++++++++++++++ ggml/src/ggml-openvino/openvino/op/scale.cpp | 31 ++++ .../ggml-openvino/openvino/op/soft_max.cpp | 88 +++++++++ .../ggml-openvino/openvino/op/transpose.cpp | 23 +++ ggml/src/ggml-openvino/openvino/op/unary.cpp | 24 +++ .../ggml-openvino/openvino/op/unary_silu.cpp | 29 +++ ggml/src/ggml-openvino/openvino/op/view.cpp | 26 +++ ggml/src/ggml-openvino/openvino/op_table.cpp | 64 +++++++ ggml/src/ggml-openvino/openvino/op_table.hpp | 13 ++ .../openvino/translate_session.cpp | 145 +++++++++++++++ .../openvino/translate_session.hpp | 27 +++ ggml/src/ggml-openvino/openvino/utils.cpp | 52 ++++++ ggml/src/ggml-openvino/openvino/utils.hpp | 68 +++++++ ggml/src/ggml-openvino/utils.cpp | 30 +-- 31 files changed, 1465 insertions(+), 15 deletions(-) rename ggml/src/ggml-openvino/{decoder.h => openvino/decoder.hpp} (98%) create mode 100644 ggml/src/ggml-openvino/openvino/frontend.cpp create mode 100644 ggml/src/ggml-openvino/openvino/frontend.hpp create mode 100644 ggml/src/ggml-openvino/openvino/input_model.cpp create mode 100644 ggml/src/ggml-openvino/openvino/input_model.hpp create mode 100644 ggml/src/ggml-openvino/openvino/node_context.hpp create mode 100644 ggml/src/ggml-openvino/openvino/op/add.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/cont.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/cpy.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/get_rows.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/mul.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/mulmat.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/permute.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/reshape.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/rms_norm.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/rope.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/scale.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/soft_max.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/transpose.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/unary.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/unary_silu.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/view.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op_table.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op_table.hpp create mode 100644 ggml/src/ggml-openvino/openvino/translate_session.cpp create mode 100644 ggml/src/ggml-openvino/openvino/translate_session.hpp create mode 100644 ggml/src/ggml-openvino/openvino/utils.cpp create mode 100644 ggml/src/ggml-openvino/openvino/utils.hpp diff --git a/docs/build.md b/docs/build.md index 1cd890e64b070..20d34170774c9 100644 --- a/docs/build.md +++ b/docs/build.md @@ -586,7 +586,11 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build git submodule update --init --recursive export OPENVINO_LLAMA_PATH=$(pwd) + ``` + + Before building, change "ENABLE_OV_GGML_FRONTEND" from true to false in the CMakePresets.json file since we already have the code from the ov side in this branch of llama.cpp (`full_backend`). You could also build the master branch of ov instead. + ``` cmake --preset Release cmake --build build/Release ``` @@ -594,7 +598,7 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build ### Build llama.cpp-ov ```bash - git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b dev_backend_openvino + git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b full_backend cd llama.cpp-ov cmake --preset ReleaseOV diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index a0f6cbea30b10..959e00b65d8df 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -5,8 +5,8 @@ #include #include -#include "decoder.h" #include "ggml.h" +#include "openvino/decoder.hpp" class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.hpp similarity index 98% rename from ggml/src/ggml-openvino/decoder.h rename to ggml/src/ggml-openvino/openvino/decoder.hpp index 3404e7c211e62..3987760a294e1 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -8,7 +8,6 @@ namespace ov { namespace frontend { namespace ggml { -// TODO: Directly include from openvino class GgmlDecoder : public DecoderBase { public: virtual ov::Any get_attribute(const std::string& name) const = 0; diff --git a/ggml/src/ggml-openvino/openvino/frontend.cpp b/ggml/src/ggml-openvino/openvino/frontend.cpp new file mode 100644 index 0000000000000..ff7f0e8392b0f --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/frontend.cpp @@ -0,0 +1,27 @@ +#include "frontend.hpp" + +#include "input_model.hpp" +#include "op_table.hpp" +#include "translate_session.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +FrontEnd::FrontEnd() {} + +std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model) { + auto ggml_model = std::dynamic_pointer_cast(model); + FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model"); + std::shared_ptr converted_model; + const auto& supported_ops = get_supported_ops(); + { + TranslateSession translate_session(model, supported_ops); + converted_model = translate_session.get_converted_model(); + } + return converted_model; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/frontend.hpp b/ggml/src/ggml-openvino/openvino/frontend.hpp new file mode 100644 index 0000000000000..5cc7ff1773216 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/frontend.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace frontend { +namespace ggml { + +class FrontEnd { +public: + using Ptr = std::shared_ptr; + FrontEnd(); + + static std::shared_ptr convert(const InputModel::Ptr& model); +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/input_model.cpp b/ggml/src/ggml-openvino/openvino/input_model.cpp new file mode 100644 index 0000000000000..5fb16ea2db87d --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/input_model.cpp @@ -0,0 +1,17 @@ +#include "input_model.hpp" + +#include "decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +InputModel::InputModel(const std::shared_ptr& gdecoder) : m_decoder(gdecoder) {} + +const std::shared_ptr& InputModel::get_model_decoder() const { + return m_decoder; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/input_model.hpp b/ggml/src/ggml-openvino/openvino/input_model.hpp new file mode 100644 index 0000000000000..9bc9a28e9aeca --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/input_model.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include + +#include "decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +class FrontEnd; +class GgmlDecoder; +using ov::frontend::ggml::GgmlDecoder; + +class InputModel : public ov::frontend::InputModel { + friend class ::ov::frontend::ggml::FrontEnd; + +public: + explicit InputModel(const std::shared_ptr& gdecoder); + + const std::shared_ptr& get_model_decoder() const; + +private: + std::shared_ptr m_decoder; +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp new file mode 100644 index 0000000000000..bac135270d876 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include + +#include "decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +class TranslateSession; + +typedef std::map> TensorMap; + +class NodeContext : public frontend::NodeContext { +public: + NodeContext(const std::shared_ptr& decoder, + std::shared_ptr& tensor_map, + TranslateSession* translate_session = nullptr) + : ov::frontend::NodeContext(decoder->get_op_type()), + m_decoder(decoder), + m_tensor_map(tensor_map), + m_translate_session(translate_session) { + m_input_names = decoder->get_input_names(); + m_output_names = decoder->get_output_names(); + } + + TranslateSession* get_translate_session() const { + return m_translate_session; + } + + size_t get_input_size() const override { + return m_decoder->get_input_size(); + } + + Any get_input_type(size_t index) const { + return m_decoder->get_input_type(m_input_names[index]); + } + + PartialShape get_input_shape(size_t index) const { + return m_decoder->get_input_shape(m_input_names[index]); + } + + std::vector get_input_stride(size_t index) const { + return m_decoder->get_input_stride(m_input_names[index]); + } + + PartialShape get_output_shape(size_t index) const { + return m_decoder->get_output_shape(m_output_names[index]); + } + + std::vector get_output_stride(size_t index) const { + return m_decoder->get_output_stride(m_output_names[index]); + } + + int32_t* get_input_op_params(size_t index) const { + return m_decoder->get_input_op_params(m_input_names[index]); + } + + int32_t* get_output_op_params(size_t index) const { + return m_decoder->get_output_op_params(m_output_names[index]); + } + + ov::element::Type get_output_type(size_t index) const { + return m_decoder->get_output_type(m_output_names[index]); + } + + Output get_input(int idx) const override { + return m_tensor_map->at(m_decoder->get_input_name(idx)); + } + + Output get_input(const std::string& name) const override { + return m_tensor_map->at(name); + } + + const std::string& get_name() const override { + return m_decoder->get_op_name(); + } + + ov::Any get_attribute_as_any(const std::string& name) const override { + return m_decoder->get_attribute(name); + } + + bool check_if_continuous() const { + return m_decoder->check_if_continuous(); + } + +private: + std::shared_ptr m_decoder; + std::shared_ptr& m_tensor_map; + TranslateSession* m_translate_session; + std::vector m_input_names; + std::vector m_output_names; +}; + +using CreatorFunction = std::function; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/add.cpp b/ggml/src/ggml-openvino/openvino/op/add.cpp new file mode 100644 index 0000000000000..c218cf34de808 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/add.cpp @@ -0,0 +1,23 @@ +#include "openvino/op/add.hpp" + +#include "../node_context.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_add(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto lhs = context.get_input(0); + auto rhs = context.get_input(1); + auto add = std::make_shared(lhs, rhs); + return {add}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp new file mode 100644 index 0000000000000..2ebc890fda088 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -0,0 +1,56 @@ + +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/slice.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_cont(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + auto src_shape = context.get_input_shape(0).to_shape(); + auto dst_shape = context.get_output_shape(0).to_shape(); + + bool continuous = context.check_if_continuous(); + if (continuous) { + // The input comes from a PERMUTE + dst_shape[1] = -1; + auto result = std::make_shared( + context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), + false); + + return {result}; + } else { + // The input comes from a VIEW + // Currently all cases are slicing at lowest dim + int32_t* op_params = context.get_input_op_params(0); + auto output_stride = context.get_output_stride(0); + + int64_t split_addr = op_params[0] / output_stride[2]; + std::vector begin = {0, 0, split_addr}; + std::vector end = {(int64_t)src_shape[0], INT_MAX, split_addr + (int64_t)src_shape[2]}; + std::vector strides = {1, 1, 1}; + + auto begin_const = ov::op::v0::Constant::create(ov::element::i64, {begin.size()}, begin); + auto end_const = ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end); + auto strides_const = ov::op::v0::Constant::create(ov::element::i64, {strides.size()}, strides); + auto slice = std::make_shared(context.get_input(0), begin_const, end_const, strides_const); + + return {slice}; + } +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp new file mode 100644 index 0000000000000..b4f4d5940869f --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -0,0 +1,106 @@ +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert_like.hpp" +#include "openvino/op/range.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/scatter_nd_update.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_cpy(const NodeContext& context) { + num_inputs_check(context, 2, 2); + auto src0 = context.get_input(0); + auto src1 = context.get_input(1); + auto past_token_len = context.get_input("past_token_len"); + + auto src0_shape = context.get_input_shape(0).to_shape(); + auto output_shape = context.get_output_shape(0).to_shape(); + bool continuous = context.check_if_continuous(); + + std::vector input0_strides = context.get_input_stride(0); + std::vector output_strides = context.get_output_stride(0); + + auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); + + src0 = std::make_shared(src0, src1); + if (continuous) { + // Write K to cache_k + int64_t head_size = src0_shape[2]; + int64_t num_heads = src0_shape[1]; + + auto reshaped_src1_shape = + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, num_heads, head_size}); + auto reshaped_src1 = std::make_shared(src1, reshaped_src1_shape, false); + + auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0}); + token_len = std::make_shared(token_len, + ov::op::v0::Constant::create(ov::element::i64, {0}, {}), + false); + auto total_token_len = std::make_shared(past_token_len, token_len); + std::shared_ptr indices = + std::make_shared(past_token_len, total_token_len, one, ov::element::i64); + indices = std::make_shared( + indices, + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); + + auto res = std::make_shared(reshaped_src1, indices, src0); + return {res}; + } else { + // Write V to cache_v + int64_t total_head_size = src0_shape[1]; + + auto reshaped_src0 = std::make_shared( + src0, + ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), + false); + auto transposed_src0 = + std::make_shared(reshaped_src0, + ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); + + auto reshaped_src1 = std::make_shared( + src1, + ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), + false); + auto transposed_src1 = + std::make_shared(reshaped_src1, + ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); + + auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); + token_len = std::make_shared(token_len, + ov::op::v0::Constant::create(ov::element::i64, {0}, {}), + false); + auto total_token_len = std::make_shared(past_token_len, token_len); + std::shared_ptr indices = + std::make_shared(past_token_len, total_token_len, one, ov::element::i64); + indices = std::make_shared( + indices, + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); + + auto res = std::make_shared(transposed_src1, indices, transposed_src0); + auto transposed_res = + std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); + auto reshaped_res = std::make_shared( + transposed_res, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + false); + return {reshaped_res}; + } +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp new file mode 100644 index 0000000000000..edb25d91244c7 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -0,0 +1,40 @@ +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/reshape.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_get_rows(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto data_node = context.get_input(0); + auto indices_node = context.get_input(1); + + auto indices_shape = get_dimensions(indices_node.get_node_shared_ptr(), {2}); + Output indice_reshaped = std::make_shared(indices_node, indices_shape, false); + + auto axis_node = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + + Output res = std::make_shared(data_node, indice_reshaped, axis_node); + if (res.get_element_type() != context.get_output_type(0)) { + res = std::make_shared(res, context.get_output_type(0)); + } + + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/mul.cpp b/ggml/src/ggml-openvino/openvino/op/mul.cpp new file mode 100644 index 0000000000000..1b1c69f7dfdef --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/mul.cpp @@ -0,0 +1,28 @@ +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_mul(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto res = std::make_shared(context.get_input(0), context.get_input(1)); + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp new file mode 100644 index 0000000000000..e00435ef81075 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -0,0 +1,127 @@ +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert_like.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/transpose.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_mulmat(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + bool continuous = context.check_if_continuous(); + if (continuous) { + auto src1 = context.get_input(1); + auto src0_converted = std::make_shared(context.get_input(0), src1); + auto result = std::make_shared(src1, src0_converted, false, true); + return {result}; + } else { + /* + Two cases here: + - 21: [ 96, 32, 32, 1] VIEW k-0 [ 2, 6144, 192, 6144] + [ 196608, 1, 1, 1] 0: NONE cache_k_l0 [ 2, 393216, 393216, 393216] + - 22: [ 96, 7, 32, 1] PERMUTE q-0 [ 4, 12288, 384, 86016] + [ 96, 32, 7, 1] 0: SCALE Qcur-0 [ 4, 384, 12288, 86016] + - 23: [ 32, 7, 32, 1] MUL_MAT kq-0 [ 4, 128, 896, 28672] + [ 96, 32, 32, 1] 0: VIEW k-0 [ 2, 6144, 192, 6144] + [ 96, 7, 32, 1] 1: PERMUTE q-0 [ 4, 12288, 384, 86016] + + - 20: [ 32, 96, 32, 1] VIEW v-0 [ 2, 128, 12288, 393216] + [ 196608, 1, 1, 1] 0: NONE cache_v_l0 [ 2, 393216, 393216, 393216] + - 25: [ 96, 7, 32, 1] MUL_MAT kqv-0 [ 4, 384, 2688, 86016] + [ 32, 96, 32, 1] 0: VIEW v-0 [ 2, 128, 12288, 393216] + [ 32, 7, 32, 1] 1: SOFT_MAX kq_soft_max_ext-0 [ 4, 128, 896, 28672] + + For case 1, for src0, Reshape + Slice + Transpose + For case 2, for src0, Reshape + Slice + */ + ov::Output A; + ov::Output B; + + auto attention_size = context.get_input("attention_size"); + + auto src0 = context.get_input(0); + auto src0_shape = context.get_input_shape(0).to_shape(); + auto src0_stride = context.get_input_stride(0); + auto permuted = is_permuted(src0_stride); + auto token_dim = permuted ? 0 : 2; + + auto src0_perm = argsort_descend(src0_stride); + auto src0_original_shape_ = permute(src0_shape, src0_perm); + std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end()); + src0_original_shape[token_dim] = -1; + + auto src0_slice_shape = src0_original_shape; + src0_slice_shape.erase(src0_slice_shape.begin() + token_dim); + + auto src0_reshape_shape = + ov::op::v0::Constant::create(ov::element::i64, {src0_original_shape.size()}, src0_original_shape); + auto src0_reshape = std::make_shared(src0, src0_reshape_shape, false); + + std::shared_ptr slice_end; + if (permuted) { + slice_end = std::make_shared( + ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape)}, + 0); + } else { + slice_end = std::make_shared( + ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape), attention_size}, + 0); + } + auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 0)); + auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1)); + auto src0_slice = std::make_shared(src0_reshape, slice_start, slice_end, slice_step); + + if (permuted) { + B = std::make_shared( + src0_slice, + ov::op::v0::Constant::create(ov::element::i64, {src0_perm.size()}, src0_perm)); + } else { + B = src0_slice; + } + + A = context.get_input(1); + B = std::make_shared(B, A); + + int64_t num_heads = context.get_input_shape(1).to_shape()[0]; + int64_t num_heads_kv = src0_shape[0]; + int64_t kv_num_heads_factor = num_heads / num_heads_kv; + if (kv_num_heads_factor > 1) { + auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads}); + auto num_heads_kv_node = + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads_kv}); + auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2}); + + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + std::shared_ptr new_B_shape = + std::make_shared(ov::OutputVector{num_heads_kv_node, one, B_shape_last_two}, 0); + B = std::make_shared(B, new_B_shape, false); + + B = std::make_shared(ov::OutputVector(kv_num_heads_factor, B), 1); + new_B_shape = std::make_shared(ov::OutputVector{num_heads_node, B_shape_last_two}, 0); + B = std::make_shared(B, new_B_shape, false); + } + + auto result = std::make_shared(A, B, false, true); + return {result}; + } +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp new file mode 100644 index 0000000000000..42472f18cccd9 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -0,0 +1,22 @@ +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/transpose.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { +OutputVector translate_permute(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + // TODO: make this more general + auto res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + + return {res}; +}; +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp new file mode 100644 index 0000000000000..ca18b72c42f12 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -0,0 +1,35 @@ +#include "openvino/op/reshape.hpp" + +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/constant.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_reshape(const NodeContext& context) { + num_inputs_check(context, 1, 1); + if (context.get_input_shape(0) == context.get_output_shape(0)) { + return {context.get_input(0)}; + } + + auto output_shape = context.get_output_shape(0).to_shape(); + auto new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, + {3}, + std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); + Output res = std::make_shared(context.get_input(0), new_shape_node, false); + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp new file mode 100644 index 0000000000000..7b9783e8c9398 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -0,0 +1,47 @@ +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/divide.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reduce_sum.hpp" +#include "openvino/op/sqrt.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_rms_norm(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + ov::Shape input_shape = context.get_input_shape(0).to_shape(); + auto input_node = context.get_input(0); + auto square = std::make_shared(input_node, input_node); + + auto reduce_sum = + std::make_shared(square, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), + true); + + auto mean = std::make_shared( + reduce_sum, + ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast(input_shape[2])})); + + float eps; + memcpy(&eps, context.get_output_op_params(0), sizeof(float)); + auto rms = std::make_shared( + std::make_shared(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}))); + + auto scale = + std::make_shared(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), rms); + + auto res = std::make_shared(input_node, scale); + + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp new file mode 100644 index 0000000000000..d5083ae14bcbb --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -0,0 +1,171 @@ + +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/cos.hpp" +#include "openvino/op/divide.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/sin.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/split.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" + +#define GGML_ROPE_TYPE_NEOX 2 + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims(int n_dims, + int n_ctx_orig, + float freq_base, + float beta_fast, + float beta_slow, + float dims[2]) { + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = MAX(0, start); + dims[1] = MIN(n_dims - 1, end); +} + +OutputVector translate_rope(const NodeContext& context) { + num_inputs_check(context, 2, 3); + + auto data_node = context.get_input(0); + auto pos_node = context.get_input(1); + pos_node = std::make_shared(pos_node, ov::element::f32); + + auto permutation_node = + std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + Output pos_node_reshaped = std::make_shared(pos_node, permutation_node); + + auto output_shape = context.get_output_shape(0); + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + int32_t* op_params = context.get_output_op_params(0); + const int n_dims = op_params[1]; + const int mode = op_params[2]; + const int n_ctx_orig = op_params[4]; + memcpy(&freq_base, op_params + 5, sizeof(float)); + memcpy(&freq_scale, op_params + 6, sizeof(float)); + memcpy(&ext_factor, op_params + 7, sizeof(float)); + memcpy(&attn_factor, op_params + 8, sizeof(float)); + memcpy(&beta_fast, op_params + 9, sizeof(float)); + memcpy(&beta_slow, op_params + 10, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + // TODO: corr_dims is not used in the current implementation + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + + // TODO: GGML_OP_ROPE_BACK -> false + bool forward = true; + const float sin_sign = forward ? 1.0f : -1.0f; + + const int64_t ne0 = output_shape[2].get_length(); + std::vector factor(ne0 / 2); + factor[0] = freq_scale; + for (int64_t i = 1; i < ne0 / 2; i++) { + factor[i] = theta_scale * factor[i - 1]; + } + + Output factor_node = + std::make_shared(ov::element::f32, ov::Shape{factor.size()}, factor); + if (context.get_input_size() == 3) { + auto freq_factors_node = context.get_input(2); + factor_node = std::make_shared(factor_node, freq_factors_node); + } + + auto half_last_dim = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {output_shape[2].get_length() / 2}); + Output input_shape_node = std::make_shared( + OutputVector{get_dimensions(data_node.get_node_shared_ptr(), {0, 1}), half_last_dim}, + 0); + Output factor_broadcasted_node = std::make_shared(factor_node, input_shape_node); + + Output cos_factor_broadcasted_node = std::make_shared( + std::make_shared(factor_broadcasted_node, pos_node_reshaped)); + Output sin_factor_broadcasted_node = std::make_shared( + std::make_shared(factor_broadcasted_node, pos_node_reshaped)); + + float mscale = attn_factor; + Output mscale_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale}); + Output mscale_sin_sign_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale * sin_sign}); + Output cos_theta_node = std::make_shared(cos_factor_broadcasted_node, mscale_node); + Output sin_theta_node = std::make_shared(sin_factor_broadcasted_node, mscale_node); + + if (!is_neox) { + auto input_shape = context.get_input_shape(0); + + auto begin_even = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); + auto begin_odd = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 1}); + auto end = std::make_shared(data_node); + auto stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 2}); + auto even_slice = std::make_shared(data_node, begin_even, end, stride); + auto odd_slice = std::make_shared(data_node, begin_odd, end, stride); + + auto first_half = + std::make_shared(std::make_shared(even_slice, cos_theta_node), + std::make_shared(odd_slice, sin_theta_node)); + auto second_half = + std::make_shared(std::make_shared(even_slice, sin_theta_node), + std::make_shared(odd_slice, cos_theta_node)); + + auto stack = std::make_shared(OutputVector{first_half, second_half}, 2); + auto shape_const = ov::op::v0::Constant::create( + ov::element::i64, + Shape{3}, + std::vector{-1, input_shape[1].get_length(), input_shape[2].get_length()}); + auto reshaped = std::make_shared(stack, shape_const, false); + + return {reshaped}; + } else { + auto slice_node = + std::make_shared(data_node, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), + 2); + Output slice_data_node_0 = slice_node->outputs()[0]; + Output slice_data_node_1 = slice_node->outputs()[1]; + + auto first_half_node = std::make_shared( + std::make_shared(slice_data_node_0, cos_theta_node), + std::make_shared(slice_data_node_1, sin_theta_node)); + + auto second_half_node = std::make_shared( + std::make_shared(slice_data_node_0, sin_theta_node), + std::make_shared(slice_data_node_1, cos_theta_node)); + + auto res_node = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); + return {res_node}; + } +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp new file mode 100644 index 0000000000000..392bfc1ed401f --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -0,0 +1,31 @@ +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_scale(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + float scale; + memcpy(&scale, context.get_output_op_params(0), sizeof(float)); + auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + + auto res = std::make_shared(context.get_input(0), scale_node); + + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp new file mode 100644 index 0000000000000..27c7cefef09a5 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -0,0 +1,88 @@ + +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/softmax.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_soft_max(const NodeContext& context) { + num_inputs_check(context, 1, 2); + + auto input_node = context.get_input(0); + + float scale = 1.0f; + float max_bias = 0.0f; + auto op_params = context.get_output_op_params(0); + memcpy(&scale, (float*)op_params + 0, sizeof(float)); + memcpy(&max_bias, (float*)op_params + 1, sizeof(float)); + + const uint32_t n_head = context.get_input_shape(0)[0].get_length(); + const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head)); + + // const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + // const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + const float slope = (max_bias > 0.0f) ? 1.0f : 1.0f; + // const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) + // : 1.0f; + + if (scale != 1.0f) { + auto scale_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + input_node = std::make_shared(input_node, scale_node); + } + + if (context.get_input_size() == 2) { + // Calculate mask then softmax + auto mask_node = context.get_input(1); + ov::element::Type mask_type = (context.get_input_type(1)).as(); + if (mask_type == ov::element::f16) { + // Convert f16 to f32 + mask_node = std::make_shared(mask_node, ov::element::f32); + } + + // Stride slice mask node + Output mask_begin_node = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}); + auto input_last_two_dim = get_dimensions(input_node.get_node_shared_ptr(), {1, 2}); + auto mask_slice_shape = std::make_shared(ov::NodeVector{one, input_last_two_dim}, 0); + Output mask_stride_node = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 1}); + auto mask_node_sliced = + std::make_shared(mask_node, mask_begin_node, mask_slice_shape, mask_stride_node); + + // slope * mask + auto slope_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{slope}); + auto slope_mask_node = std::make_shared(mask_node_sliced, slope_node); + + // input + slope * mask + auto input_slope_mask_node = std::make_shared(input_node, slope_mask_node); + + // Calculate softmax + auto res = std::make_shared(input_slope_mask_node, 2); + return {res}; + } else { + // Directly softmax + auto res = std::make_shared(input_node, 0); + return {res}; + } +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp new file mode 100644 index 0000000000000..f7408f40d42fc --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -0,0 +1,23 @@ +#include "openvino/op/transpose.hpp" + +#include "../node_context.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_transpose(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + auto perm = argsort_descend(context.get_output_stride(0)); + auto res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/unary.cpp b/ggml/src/ggml-openvino/openvino/op/unary.cpp new file mode 100644 index 0000000000000..391e0a7599586 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/unary.cpp @@ -0,0 +1,24 @@ + +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_unary(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + return {context.get_input(0)}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp new file mode 100644 index 0000000000000..2a90a79475537 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -0,0 +1,29 @@ +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/sigmoid.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_unary_silu(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + auto input = context.get_input(0); + auto sigmoid = std::make_shared(input); + auto res = std::make_shared(input, sigmoid); + + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp new file mode 100644 index 0000000000000..aaf117b662137 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -0,0 +1,26 @@ +#include +#include + +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/strided_slice.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_view(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + return {context.get_input(0)}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp new file mode 100644 index 0000000000000..af51bb157e099 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -0,0 +1,64 @@ +#include "op_table.hpp" + +#include +#include +#include +#include +#include +#include + +#include "utils.hpp" + +using namespace ov::op; +namespace ov { +namespace frontend { +namespace ggml { + +namespace op { + +#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& node) + +GGML_OP_CONVERTER(translate_add); +GGML_OP_CONVERTER(translate_cont); +GGML_OP_CONVERTER(translate_cpy); +GGML_OP_CONVERTER(translate_get_rows); +GGML_OP_CONVERTER(translate_mul); +GGML_OP_CONVERTER(translate_mulmat); +GGML_OP_CONVERTER(translate_permute); +GGML_OP_CONVERTER(translate_reshape); +GGML_OP_CONVERTER(translate_rms_norm); +GGML_OP_CONVERTER(translate_rope); +GGML_OP_CONVERTER(translate_scale); +GGML_OP_CONVERTER(translate_unary_silu); +GGML_OP_CONVERTER(translate_soft_max); +GGML_OP_CONVERTER(translate_transpose); +GGML_OP_CONVERTER(translate_unary); +GGML_OP_CONVERTER(translate_view); + +} // namespace op + +const std::unordered_map get_supported_ops() { + return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs}, + {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs}, + {"GGML_OP_CONT", op::translate_cont}, + {"GGML_OP_CPY", op::translate_cpy}, + {"GGML_OP_DIV", op::translate_1to1_match_2_inputs}, + {"GGML_OP_GET_ROWS", op::translate_get_rows}, + // {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, + {"GGML_OP_MUL", op::translate_mul}, + {"GGML_OP_MUL_MAT", op::translate_mulmat}, + {"GGML_OP_PERMUTE", op::translate_permute}, + {"GGML_OP_RESHAPE", op::translate_reshape}, + {"GGML_OP_RMS_NORM", op::translate_rms_norm}, + {"GGML_OP_ROPE", op::translate_rope}, + {"GGML_OP_SCALE", op::translate_scale}, + {"GGML_OP_SOFT_MAX", op::translate_soft_max}, + {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, + {"GGML_OP_TRANSPOSE", op::translate_transpose}, + {"GGML_UNARY_OP_SILU", op::translate_unary_silu}, + {"GGML_OP_VIEW", op::translate_view}}; +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp new file mode 100644 index 0000000000000..c83aaa199fb49 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "node_context.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +const std::unordered_map get_supported_ops(); + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp new file mode 100644 index 0000000000000..f5b14d3a0ff0e --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -0,0 +1,145 @@ +#include "translate_session.hpp" + +#include +#include + +#include "input_model.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +using namespace ov::op; + +TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, + const std::unordered_map& translator_map) + : m_input_model(input_model), + m_translator_map(translator_map), + m_ov_model(nullptr) {} + +std::shared_ptr TranslateSession::get_converted_model() { + if (m_ov_model) { + return m_ov_model; + } + m_ov_model = translate_graph(m_input_model); + // print_model_topology(); + return m_ov_model; +} + +void TranslateSession::print_model_topology() { + try { + std::ofstream outfile("model_topology.txt", std::ios::out | std::ios::app); + if (!outfile.is_open()) { + throw std::runtime_error("Failed to open file for writing model topology."); + } + + outfile << "============ Model ============" << std::endl; + for (const auto& op : m_ov_model->get_ordered_ops()) { + outfile << "Operation: " << op->get_friendly_name() << std::endl; + outfile << " Inputs:" << std::endl; + for (const auto& input : op->inputs()) { + outfile << " " << input.get_node()->get_friendly_name() << " -> " << input.get_element_type() << " " + << input.get_shape() << std::endl; + } + outfile << " Outputs:" << std::endl; + for (const auto& output : op->outputs()) { + outfile << " " << output.get_node()->get_friendly_name() << " -> " << output.get_element_type() + << " " << output.get_shape() << std::endl; + } + outfile << std::endl; + } + outfile << "===============================" << std::endl; + outfile.close(); + } catch (const std::exception& ex) { + std::cout << ex.what() << std::endl; + } +} + +std::shared_ptr TranslateSession::translate_graph(const frontend::InputModel::Ptr& input_model) { + ov::ParameterVector params; + ov::ResultVector results; + auto tensor_map = std::make_shared(); + std::shared_ptr resulting_model; + + const auto& ggml_model = std::dynamic_pointer_cast(input_model); + std::shared_ptr ggml_model_decoder = ggml_model->get_model_decoder(); + + FRONT_END_GENERAL_CHECK(ggml_model, "nullptr for InputModel is given for translation into OV Model"); + const auto& model_inputs = ggml_model->get_inputs(); + const auto& model_outputs = ggml_model->get_outputs(); + + for (const auto& it : ggml_model_decoder->get_model_inputs()) { + params.push_back(std::dynamic_pointer_cast(it.second)); + (*tensor_map)[it.first] = it.second; + } + + for (const auto& it : ggml_model_decoder->get_model_extra_inputs()) { + params.push_back(std::dynamic_pointer_cast(it.second)); + (*tensor_map)[it.first] = it.second; + } + + for (const auto& it : ggml_model_decoder->get_model_weights()) { + (*tensor_map)[it.first] = it.second; + } + + auto node_visitor = [&](std::shared_ptr node) { + auto operation_type = node->get_op_type(); + ov::OutputVector converted_outputs; + auto it = m_translator_map.find(operation_type); + if (it != m_translator_map.end()) { + try { + NodeContext node_context(node, tensor_map, this); + converted_outputs = it->second(node_context); + } catch (const std::exception& ex) { + std::cout << ex.what() << std::endl; + } + } else { + // TODO + } + + const auto& node_output_names = node->get_output_names(); + FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), + "Number of ", + operation_type, + " outputs greater than number of converted outputs, which are ", + node_output_names.size(), + " and ", + converted_outputs.size(), + " respectively."); + + for (size_t i = 0; i < node_output_names.size(); ++i) { + auto output_name = node_output_names[i]; + if (i < converted_outputs.size() && converted_outputs[i].get_node_shared_ptr() != nullptr) { + (*tensor_map)[output_name] = converted_outputs[i]; + } + } + }; + + ggml_model_decoder->visit_subgraph(node_visitor); + + for (const auto& name : ggml_model_decoder->get_model_output_names()) { + FRONT_END_GENERAL_CHECK(tensor_map->find(name) != tensor_map->end(), + "Output name not found in tensor map: ", + name); + auto result = std::make_shared(tensor_map->at(name)); + // result->set_friendly_name(it); + results.push_back(result); + } + + ov::ParameterVector used_params; + for (const auto& param : params) { + if (!param->output(0).get_target_inputs().empty()) { + used_params.push_back(param); + } + } + if (auto diff = params.size() - used_params.size()) { + std::cout << diff << " parameters are not used in the model." << std::endl; + } + resulting_model = std::make_shared(results, used_params); + + return resulting_model; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp new file mode 100644 index 0000000000000..5c7a9d464d786 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "input_model.hpp" +#include "node_context.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +class TranslateSession { +public: + TranslateSession(const frontend::InputModel::Ptr& input_model, + const std::unordered_map& translator_map); + + std::shared_ptr get_converted_model(); + std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); + +private: + void print_model_topology(); + const frontend::InputModel::Ptr m_input_model; + const std::unordered_map& m_translator_map; + std::shared_ptr m_ov_model; +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp new file mode 100644 index 0000000000000..ff16e9d4ae20d --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -0,0 +1,52 @@ +#include "utils.hpp" + +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { + +std::string getCurrentTime() { + std::time_t now = std::time(nullptr); + char buf[100]; + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + return buf; +} + +void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) { + auto input_size = context.get_input_size(); + FRONT_END_OP_CONVERSION_CHECK(input_size >= min_inputs, "Got less inputs than expected"); + FRONT_END_OP_CONVERSION_CHECK(input_size <= max_inputs, "Got more inputs than expected"); +} + +int non_cont_dim(std::vector ne, std::vector nb) { + int dim = nb.size() - 1; + size_t bytes = nb[dim]; + for (int i = dim; i > 0; i--) { + bytes *= ne[i]; + if (bytes != nb[i - 1]) { + return i; + } + } + return 0; +} + +std::shared_ptr get_dimensions(const std::shared_ptr& shape, + const std::vector& dims) { + using namespace ov::op; + const auto zero = v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + const auto dims_const = v0::Constant::create(ov::element::i32, ov::Shape{dims.size()}, dims); + return std::make_shared(shape, dims_const, zero); +} + +std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims) { + return get_dimensions(std::make_shared(node), dims); +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp new file mode 100644 index 0000000000000..6e106fa932bfc --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/utils.hpp @@ -0,0 +1,68 @@ +#pragma once + +#include + +#include "node_context.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +void dump_ov_model(const std::shared_ptr model); + +void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs); + +int non_cont_dim(std::vector ne, std::vector nb); + +template +std::vector argsort_descend(const std::vector& v) { + std::vector idx(v.size()); + std::iota(idx.begin(), idx.end(), 0); + std::sort(idx.begin(), idx.end(), [&v](int i1, int i2) { + return v[i1] > v[i2]; + }); + return idx; +} + +template +std::vector sorted_descend(std::vector v) { + std::sort(v.begin(), v.end(), [](T a, T b) { + return a > b; + }); + return v; +} + +template +bool is_permuted(const std::vector& strides) { + for (size_t i = 0; i < strides.size() - 1; ++i) { + if (strides[i] < strides[i + 1]) { + return true; + } + } + return false; +} + +template +std::vector permute(const std::vector& x, const std::vector& perm) { + std::vector result; + result.reserve(perm.size()); + for (int i : perm) { + result.push_back(x[i]); + } + return result; +} + +std::shared_ptr get_dimensions(const std::shared_ptr& shape, const std::vector& dims); +std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims); + +namespace op { +template +OutputVector translate_1to1_match_2_inputs(const NodeContext& context) { + num_inputs_check(context, 2, 2); + return {std::make_shared(context.get_input(0), context.get_input(1))}; +} +} // namespace op + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index f36700d5ec148..34bcfc54a7267 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -14,6 +14,8 @@ #include "ggml-impl.h" #include "ggml.h" +#include "openvino/frontend.hpp" +#include "openvino/input_model.hpp" std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph) { return std::make_shared(nullptr, cgraph); @@ -56,11 +58,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c } // auto devices = core.get_available_devices(); - static auto front_end = get_ggml_frontend(); - if (!front_end) { - GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); - return GGML_STATUS_FAILED; - } + // static auto front_end = get_ggml_frontend(); + // if (!front_end) { + // GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); + // return GGML_STATUS_FAILED; + // } using CachedItem = std::pair, ov::CompiledModel>; static std::unordered_map compiled_cache; @@ -79,14 +81,18 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compiled_model = it->second.second; compile_end_time = ggml_time_us(); } else { - std::shared_ptr graph_decoder = ggml_decoder; - ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); - if (!input_model) { - GGML_LOG_ERROR("Input Model is not loaded \n"); - return GGML_STATUS_FAILED; - } + // std::shared_ptr graph_decoder = ggml_decoder; + // ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); + // if (!input_model) { + // GGML_LOG_ERROR("Input Model is not loaded \n"); + // return GGML_STATUS_FAILED; + // } + + // model = front_end->convert(input_model); + + ov::frontend::InputModel::Ptr input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); - model = front_end->convert(input_model); conversion_end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_DUMP_IR")) { From eb62dc85ef26f19264abdf38e68a588d62b387ad Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 13 May 2025 08:42:54 +0800 Subject: [PATCH 064/156] PERF: favor low precision matmul --- .../ggml-openvino/openvino/node_context.hpp | 2 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 35 ++++++++++--------- .../ggml-openvino/openvino/op/soft_max.cpp | 4 +-- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index bac135270d876..e934e2ac368d2 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -33,7 +33,7 @@ class NodeContext : public frontend::NodeContext { return m_decoder->get_input_size(); } - Any get_input_type(size_t index) const { + ov::element::Type get_input_type(size_t index) const { return m_decoder->get_input_type(m_input_names[index]); } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index e00435ef81075..3e9c5c5083023 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -1,19 +1,18 @@ -#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/concat.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert_like.hpp" -#include "openvino/op/matmul.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/slice.hpp" -#include "openvino/op/transpose.hpp" namespace ov { namespace frontend { @@ -25,9 +24,10 @@ OutputVector translate_mulmat(const NodeContext& context) { bool continuous = context.check_if_continuous(); if (continuous) { - auto src1 = context.get_input(1); - auto src0_converted = std::make_shared(context.get_input(0), src1); - auto result = std::make_shared(src1, src0_converted, false, true); + auto src0 = context.get_input(0); + auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0)); + auto result_lp = std::make_shared(src1, src0, false, true); + auto result = std::make_shared(result_lp, context.get_output_type(0)); return {result}; } else { /* @@ -94,8 +94,7 @@ OutputVector translate_mulmat(const NodeContext& context) { B = src0_slice; } - A = context.get_input(1); - B = std::make_shared(B, A); + A = std::make_shared(context.get_input(1), context.get_input_type(0)); int64_t num_heads = context.get_input_shape(1).to_shape()[0]; int64_t num_heads_kv = src0_shape[0]; @@ -116,10 +115,12 @@ OutputVector translate_mulmat(const NodeContext& context) { B = std::make_shared(B, new_B_shape, false); } - auto result = std::make_shared(A, B, false, true); + auto result_lp = std::make_shared(A, B, false, true); + auto result = std::make_shared(result_lp, context.get_output_type(0)); + return {result}; } -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 27c7cefef09a5..cdb59f47d9f1e 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -49,7 +49,7 @@ OutputVector translate_soft_max(const NodeContext& context) { if (context.get_input_size() == 2) { // Calculate mask then softmax auto mask_node = context.get_input(1); - ov::element::Type mask_type = (context.get_input_type(1)).as(); + ov::element::Type mask_type = context.get_input_type(1); if (mask_type == ov::element::f16) { // Convert f16 to f32 mask_node = std::make_shared(mask_node, ov::element::f32); @@ -80,7 +80,7 @@ OutputVector translate_soft_max(const NodeContext& context) { auto res = std::make_shared(input_node, 0); return {res}; } -}; +} } // namespace op } // namespace ggml From 733f6557574d0eca298a07223f9014bb2827fb31 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 13 May 2025 10:34:51 +0800 Subject: [PATCH 065/156] STYLE and minor REFACTOR --- ggml/src/ggml-openvino/openvino/op/add.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 6 +-- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 22 ++++---- .../ggml-openvino/openvino/op/get_rows.cpp | 15 +++--- ggml/src/ggml-openvino/openvino/op/mul.cpp | 11 +--- .../src/ggml-openvino/openvino/op/permute.cpp | 14 ++--- .../src/ggml-openvino/openvino/op/reshape.cpp | 11 ++-- .../ggml-openvino/openvino/op/rms_norm.cpp | 15 +++--- ggml/src/ggml-openvino/openvino/op/rope.cpp | 36 ++++++------- ggml/src/ggml-openvino/openvino/op/scale.cpp | 9 ++-- .../ggml-openvino/openvino/op/soft_max.cpp | 18 +++---- .../ggml-openvino/openvino/op/transpose.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/unary.cpp | 24 --------- .../ggml-openvino/openvino/op/unary_silu.cpp | 11 ++-- ggml/src/ggml-openvino/openvino/op/view.cpp | 11 +--- ggml/src/ggml-openvino/openvino/op_table.cpp | 2 +- ggml/src/ggml-openvino/openvino/op_table.hpp | 2 +- .../openvino/translate_session.cpp | 51 +++---------------- ggml/src/ggml-openvino/utils.cpp | 6 ++- 19 files changed, 97 insertions(+), 175 deletions(-) delete mode 100644 ggml/src/ggml-openvino/openvino/op/unary.cpp diff --git a/ggml/src/ggml-openvino/openvino/op/add.cpp b/ggml/src/ggml-openvino/openvino/op/add.cpp index c218cf34de808..18bc463fb9c50 100644 --- a/ggml/src/ggml-openvino/openvino/op/add.cpp +++ b/ggml/src/ggml-openvino/openvino/op/add.cpp @@ -1,4 +1,4 @@ -#include "openvino/op/add.hpp" +#include #include "../node_context.hpp" #include "../utils.hpp" @@ -15,7 +15,7 @@ OutputVector translate_add(const NodeContext& context) { auto rhs = context.get_input(1); auto add = std::make_shared(lhs, rhs); return {add}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 2ebc890fda088..e8e9bf0a4e732 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -2,12 +2,12 @@ #include #include #include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/slice.hpp" namespace ov { namespace frontend { @@ -48,7 +48,7 @@ OutputVector translate_cont(const NodeContext& context) { return {slice}; } -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index b4f4d5940869f..2808d3ee91372 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -1,19 +1,19 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert_like.hpp" -#include "openvino/op/range.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/scatter_nd_update.hpp" -#include "openvino/op/transpose.hpp" -#include "openvino/op/unsqueeze.hpp" namespace ov { namespace frontend { @@ -98,7 +98,7 @@ OutputVector translate_cpy(const NodeContext& context) { false); return {reshaped_res}; } -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index edb25d91244c7..64fc57bd88ad0 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,14 +1,13 @@ -#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/gather.hpp" -#include "openvino/op/reshape.hpp" namespace ov { namespace frontend { @@ -32,7 +31,7 @@ OutputVector translate_get_rows(const NodeContext& context) { } return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/mul.cpp b/ggml/src/ggml-openvino/openvino/op/mul.cpp index 1b1c69f7dfdef..14473f4e2777b 100644 --- a/ggml/src/ggml-openvino/openvino/op/mul.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mul.cpp @@ -1,14 +1,7 @@ -#include -#include +#include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/broadcast.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/reshape.hpp" namespace ov { namespace frontend { @@ -20,7 +13,7 @@ OutputVector translate_mul(const NodeContext& context) { auto res = std::make_shared(context.get_input(0), context.get_input(1)); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 42472f18cccd9..478c9430f00e1 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -1,21 +1,23 @@ +#include +#include + #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/transpose.hpp" namespace ov { namespace frontend { namespace ggml { namespace op { + OutputVector translate_permute(const NodeContext& context) { num_inputs_check(context, 1, 1); - // TODO: make this more general + auto perm = argsort_descend(context.get_output_stride(0)); auto res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); - + ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); return {res}; -}; +} + } // namespace op } // namespace ggml } // namespace frontend diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index ca18b72c42f12..06b2bd339e9fc 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -1,13 +1,12 @@ -#include "openvino/op/reshape.hpp" - #include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/constant.hpp" namespace ov { namespace frontend { @@ -27,7 +26,7 @@ OutputVector translate_reshape(const NodeContext& context) { std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); Output res = std::make_shared(context.get_input(0), new_shape_node, false); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 7b9783e8c9398..a91fffb72d5d1 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -1,11 +1,12 @@ +#include +#include +#include +#include +#include +#include + #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/divide.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/reduce_sum.hpp" -#include "openvino/op/sqrt.hpp" namespace ov { namespace frontend { @@ -39,7 +40,7 @@ OutputVector translate_rms_norm(const NodeContext& context) { auto res = std::make_shared(input_node, scale); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index d5083ae14bcbb..aad156082e556 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -1,27 +1,27 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/broadcast.hpp" -#include "openvino/op/concat.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/cos.hpp" -#include "openvino/op/divide.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/shape_of.hpp" -#include "openvino/op/sin.hpp" -#include "openvino/op/slice.hpp" -#include "openvino/op/split.hpp" -#include "openvino/op/subtract.hpp" -#include "openvino/op/transpose.hpp" #define GGML_ROPE_TYPE_NEOX 2 @@ -163,7 +163,7 @@ OutputVector translate_rope(const NodeContext& context) { auto res_node = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); return {res_node}; } -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index 392bfc1ed401f..b393dd8aa2c2f 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -1,12 +1,9 @@ -#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/multiply.hpp" namespace ov { namespace frontend { @@ -23,7 +20,7 @@ OutputVector translate_scale(const NodeContext& context) { auto res = std::make_shared(context.get_input(0), scale_node); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index cdb59f47d9f1e..549c35a9b6a0d 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -1,19 +1,19 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/concat.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/slice.hpp" -#include "openvino/op/softmax.hpp" namespace ov { namespace frontend { diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index f7408f40d42fc..7d33ca9d6170b 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -1,4 +1,4 @@ -#include "openvino/op/transpose.hpp" +#include #include "../node_context.hpp" #include "../utils.hpp" @@ -15,7 +15,7 @@ OutputVector translate_transpose(const NodeContext& context) { auto res = std::make_shared(context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/unary.cpp b/ggml/src/ggml-openvino/openvino/op/unary.cpp deleted file mode 100644 index 391e0a7599586..0000000000000 --- a/ggml/src/ggml-openvino/openvino/op/unary.cpp +++ /dev/null @@ -1,24 +0,0 @@ - -#include -#include - -#include "../node_context.hpp" -#include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" - -namespace ov { -namespace frontend { -namespace ggml { -namespace op { - -OutputVector translate_unary(const NodeContext& context) { - num_inputs_check(context, 1, 1); - - return {context.get_input(0)}; -}; - -} // namespace op -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp index 2a90a79475537..1c396e6aaf9c2 100644 --- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -1,12 +1,9 @@ -#include -#include +#include +#include +#include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/sigmoid.hpp" namespace ov { namespace frontend { @@ -21,7 +18,7 @@ OutputVector translate_unary_silu(const NodeContext& context) { auto res = std::make_shared(input, sigmoid); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index aaf117b662137..fcfb9f732c581 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -1,13 +1,4 @@ -#include -#include - #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/strided_slice.hpp" namespace ov { namespace frontend { @@ -18,7 +9,7 @@ OutputVector translate_view(const NodeContext& context) { num_inputs_check(context, 1, 1); return {context.get_input(0)}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index af51bb157e099..d588b2bff0e9b 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -37,7 +37,7 @@ GGML_OP_CONVERTER(translate_view); } // namespace op -const std::unordered_map get_supported_ops() { +std::unordered_map get_supported_ops() { return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs}, {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs}, {"GGML_OP_CONT", op::translate_cont}, diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index c83aaa199fb49..1a71a06c181ff 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -6,7 +6,7 @@ namespace ov { namespace frontend { namespace ggml { -const std::unordered_map get_supported_ops(); +std::unordered_map get_supported_ops(); } // namespace ggml } // namespace frontend diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index f5b14d3a0ff0e..012e9178c6217 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,8 +1,5 @@ #include "translate_session.hpp" -#include -#include - #include "input_model.hpp" namespace ov { @@ -22,39 +19,9 @@ std::shared_ptr TranslateSession::get_converted_model() { return m_ov_model; } m_ov_model = translate_graph(m_input_model); - // print_model_topology(); return m_ov_model; } -void TranslateSession::print_model_topology() { - try { - std::ofstream outfile("model_topology.txt", std::ios::out | std::ios::app); - if (!outfile.is_open()) { - throw std::runtime_error("Failed to open file for writing model topology."); - } - - outfile << "============ Model ============" << std::endl; - for (const auto& op : m_ov_model->get_ordered_ops()) { - outfile << "Operation: " << op->get_friendly_name() << std::endl; - outfile << " Inputs:" << std::endl; - for (const auto& input : op->inputs()) { - outfile << " " << input.get_node()->get_friendly_name() << " -> " << input.get_element_type() << " " - << input.get_shape() << std::endl; - } - outfile << " Outputs:" << std::endl; - for (const auto& output : op->outputs()) { - outfile << " " << output.get_node()->get_friendly_name() << " -> " << output.get_element_type() - << " " << output.get_shape() << std::endl; - } - outfile << std::endl; - } - outfile << "===============================" << std::endl; - outfile.close(); - } catch (const std::exception& ex) { - std::cout << ex.what() << std::endl; - } -} - std::shared_ptr TranslateSession::translate_graph(const frontend::InputModel::Ptr& input_model) { ov::ParameterVector params; ov::ResultVector results; @@ -86,16 +53,12 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo auto operation_type = node->get_op_type(); ov::OutputVector converted_outputs; auto it = m_translator_map.find(operation_type); - if (it != m_translator_map.end()) { - try { - NodeContext node_context(node, tensor_map, this); - converted_outputs = it->second(node_context); - } catch (const std::exception& ex) { - std::cout << ex.what() << std::endl; - } - } else { - // TODO - } + FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), + "Translation for operation type ", + operation_type, + " is not implemented."); + NodeContext node_context(node, tensor_map, this); + converted_outputs = it->second(node_context); const auto& node_output_names = node->get_output_names(); FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), @@ -122,7 +85,7 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo "Output name not found in tensor map: ", name); auto result = std::make_shared(tensor_map->at(name)); - // result->set_friendly_name(it); + result->set_friendly_name(name); results.push_back(result); } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 34bcfc54a7267..09bf0d0ac5843 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -69,10 +69,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c std::shared_ptr model; ov::CompiledModel compiled_model; + int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; auto ggml_decoder = get_ggml_decoder(cgraph); + decoder_end_time = ggml_time_us(); + auto it = compiled_cache.find(cgraph); if (it != compiled_cache.end()) { model = it->second.first; @@ -147,7 +150,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("GGML OpenVINO Backend: \n"); - GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000); GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); From 99c67acf1572b102554ca48fc6d8428cc85bd230 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 14 May 2025 14:06:15 +0800 Subject: [PATCH 066/156] FIX: Re-add tensor names in cgraph, Add another case for RESHAPE --- ggml/src/ggml-openvino/ggml-decoder.cpp | 39 +++++++++++++++---- ggml/src/ggml-openvino/ggml-decoder.h | 8 ++-- ggml/src/ggml-openvino/openvino/decoder.hpp | 4 +- .../ggml-openvino/openvino/node_context.hpp | 4 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 6 ++- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 7 +++- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 6 ++- .../src/ggml-openvino/openvino/op/reshape.cpp | 21 ++++++++-- .../openvino/translate_session.cpp | 4 -- src/llama-graph.cpp | 12 ++++-- 10 files changed, 77 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 43869ec228d78..0d612c18196a0 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -38,6 +38,10 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap printed = true; } + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + dump_cgraph(m_cgraph); + } + set_max_token_len(); for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; @@ -47,10 +51,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap m_model_weights = model_weights; add_extra_inputs(); - - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - dump_cgraph(m_cgraph); - } } } @@ -142,17 +142,40 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, if (m_node) { switch (node->op) { + case GGML_OP_RESHAPE: { + if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) { + m_op_case = 1; + } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) { + m_op_case = 2; + } + break; + } case GGML_OP_CONT: { - // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE - m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); + if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) { + // The input comes from a PERMUTE + m_op_case = 1; + } else { + // The input comes from a VIEW which is subtensor + m_op_case = 2; + } break; } case GGML_OP_CPY: { - m_continuous = ggml_is_contiguous(node); + if (ggml_is_contiguous(node)) { + // Write K to cache_k + m_op_case = 1; + } else { + // Write V to cache_v + m_op_case = 2; + } break; } case GGML_OP_MUL_MAT: { - m_continuous = node->src[0]->view_src == nullptr; + if (node->src[0]->view_src == nullptr) { + m_op_case = 1; + } else { + m_op_case = 2; + } break; } default: diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 959e00b65d8df..b8cc4c4cdf7b4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -69,8 +69,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_outputs.at(name); } - virtual bool check_if_continuous() const override { - return m_continuous; + virtual int get_op_case() const override { + return m_op_case; } virtual const std::map>& get_model_inputs() const override { @@ -110,7 +110,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::vector m_nodes; std::string m_op_name; mutable std::string m_name; - bool m_continuous; + int m_op_case; std::vector> m_op_node_name; std::map> m_model_inputs; std::map> m_model_extra_inputs; @@ -119,4 +119,4 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::vector m_model_output_names; }; -void print_tensor_address_map(const struct ggml_cgraph* cgraph); \ No newline at end of file +void print_tensor_address_map(const struct ggml_cgraph* cgraph); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 3987760a294e1..b3cf75817fcd9 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -49,7 +49,7 @@ class GgmlDecoder : public DecoderBase { virtual void visit_subgraph(std::function)> node_visitor) const = 0; - virtual bool check_if_continuous() const = 0; + virtual int get_op_case() const = 0; virtual const std::map>& get_model_inputs() const = 0; virtual const std::map>& get_model_extra_inputs() const = 0; @@ -59,4 +59,4 @@ class GgmlDecoder : public DecoderBase { } // namespace ggml } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index e934e2ac368d2..44f55222e35db 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -81,8 +81,8 @@ class NodeContext : public frontend::NodeContext { return m_decoder->get_attribute(name); } - bool check_if_continuous() const { - return m_decoder->check_if_continuous(); + int get_op_case() const { + return m_decoder->get_op_case(); } private: diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index e8e9bf0a4e732..a052bf06ca210 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -17,11 +17,13 @@ namespace op { OutputVector translate_cont(const NodeContext& context) { num_inputs_check(context, 1, 1); + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + auto src_shape = context.get_input_shape(0).to_shape(); auto dst_shape = context.get_output_shape(0).to_shape(); - bool continuous = context.check_if_continuous(); - if (continuous) { + if (op_case == 1) { // The input comes from a PERMUTE dst_shape[1] = -1; auto result = std::make_shared( diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 2808d3ee91372..4ab1502f81b2b 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -22,13 +22,16 @@ namespace op { OutputVector translate_cpy(const NodeContext& context) { num_inputs_check(context, 2, 2); + + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CPY case"); + auto src0 = context.get_input(0); auto src1 = context.get_input(1); auto past_token_len = context.get_input("past_token_len"); auto src0_shape = context.get_input_shape(0).to_shape(); auto output_shape = context.get_output_shape(0).to_shape(); - bool continuous = context.check_if_continuous(); std::vector input0_strides = context.get_input_stride(0); std::vector output_strides = context.get_output_stride(0); @@ -36,7 +39,7 @@ OutputVector translate_cpy(const NodeContext& context) { auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); src0 = std::make_shared(src0, src1); - if (continuous) { + if (op_case == 1) { // Write K to cache_k int64_t head_size = src0_shape[2]; int64_t num_heads = src0_shape[1]; diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 3e9c5c5083023..5673551f709c1 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -22,8 +22,10 @@ namespace op { OutputVector translate_mulmat(const NodeContext& context) { num_inputs_check(context, 2, 2); - bool continuous = context.check_if_continuous(); - if (continuous) { + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case"); + + if (op_case == 1) { auto src0 = context.get_input(0); auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0)); auto result_lp = std::make_shared(src1, src0, false, true); diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 06b2bd339e9fc..f6586d674cd54 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -1,6 +1,8 @@ #include +#include #include #include +#include #include #include #include @@ -19,11 +21,22 @@ OutputVector translate_reshape(const NodeContext& context) { return {context.get_input(0)}; } + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported RESHAPE case"); + auto output_shape = context.get_output_shape(0).to_shape(); - auto new_shape_node = - ov::op::v0::Constant::create(ov::element::i64, - {3}, - std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); + std::shared_ptr new_shape_node; + if (op_case == 1) { + new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, + {3}, + std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); + } else { + new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, + {3}, + std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); + } Output res = std::make_shared(context.get_input(0), new_shape_node, false); return {res}; } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 012e9178c6217..910a0d833663d 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -31,10 +31,6 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo const auto& ggml_model = std::dynamic_pointer_cast(input_model); std::shared_ptr ggml_model_decoder = ggml_model->get_model_decoder(); - FRONT_END_GENERAL_CHECK(ggml_model, "nullptr for InputModel is given for translation into OV Model"); - const auto& model_inputs = ggml_model->get_inputs(); - const auto& model_outputs = ggml_model->get_outputs(); - for (const auto& it : ggml_model_decoder->get_model_inputs()) { params.push_back(std::dynamic_pointer_cast(it.second)); (*tensor_map)[it.first] = it.second; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index a24853c63ada4..65ac8744e9012 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1093,7 +1093,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { if (ubatch.token) { inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp->tokens, "inp_tokens", -1); + cb(inp->tokens, "inp_tokens", -1); ggml_set_input(inp->tokens); res->t_tokens = inp->tokens; @@ -1141,6 +1141,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { auto & cur = inp->pos; cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd()); + cb(cur, "inp_pos", -1); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1176,6 +1177,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const { auto & cur = inp->out_ids; cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + cb(cur, "inp_out_ids", -1); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1420,6 +1422,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); + cb(inp->kq_mask, "KQ_mask", -1); ggml_set_input(inp->kq_mask); inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; @@ -1466,7 +1469,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - //cb(cur, "kqv_wo", il); + cb(cur, "kqv_wo", il); } if (wo_b) { @@ -1496,6 +1499,7 @@ static std::unique_ptr build_attn_inp_kv_impl( inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream); + cb(inp->self_kq_mask, "KQ_mask", -1); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1622,7 +1626,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - //cb(cur, "kqv_wo", il); + cb(cur, "kqv_wo", il); } if (wo_b) { @@ -1677,7 +1681,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - //cb(cur, "kqv_wo", il); + cb(cur, "kqv_wo", il); } if (wo_b) { From 4cf4dfce7f7b75d949e38c45aadc1b19c018b9cc Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 14 May 2025 17:48:20 +0800 Subject: [PATCH 067/156] FIX: input shape of KQ_mask --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0d612c18196a0..fd5690072872e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -112,8 +112,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; } else if (std::string(src->name).find("KQ_mask") == 0) { - input_shape = - ov::PartialShape{1, ov::Dimension(1, m_max_token_len), ov::Dimension(1, m_max_token_len)}; + auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); + input_shape = ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)}; } else { input_shape = ov::Shape{get_shape(src)}; } @@ -187,9 +187,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, void GgmlOvDecoder::set_max_token_len() { for (int i = 0; i < m_cgraph->n_nodes; i++) { auto* node = m_cgraph->nodes[i]; - if (std::string(node->name) == "v-0") { - auto* cache_v = node->src[0]; - m_max_token_len = cache_v->ne[0] / node->ne[1] / node->ne[2]; + if (std::string(node->name) == "k-0") { + auto* cache_k = node->src[0]; + m_max_token_len = cache_k->ne[0] / node->ne[0] / node->ne[1]; break; } } From cf2bf6ec1634ae0e00d056af33ad305170f8889a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 14 May 2025 17:48:56 +0800 Subject: [PATCH 068/156] PERF: add weight constant in parallel --- ggml/src/ggml-openvino/ggml-decoder.cpp | 45 +++++++++++++++++++++++++ ggml/src/ggml-openvino/ggml-decoder.h | 2 ++ 2 files changed, 47 insertions(+) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index fd5690072872e..a8e1ad5556e74 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -3,9 +3,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -42,6 +44,12 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap dump_cgraph(m_cgraph); } + static bool weight_created = false; + if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) { + add_weight_const_parallel(model_weights); + weight_created = true; + } + set_max_token_len(); for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; @@ -235,6 +243,43 @@ void GgmlOvDecoder::add_extra_inputs() { } } +void GgmlOvDecoder::add_weight_const_parallel(std::map>& model_weights) { + static std::mutex weights_mutex; + auto* nodes = m_cgraph->nodes; + auto n_nodes = m_cgraph->n_nodes; + std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + auto* src = node->src[i]; + if (src == nullptr) { + continue; + } + + std::string src_name(src->name); + if (!src->view_src) { + ggml_backend_buffer* buffer = src->buffer; + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + bool should_create = false; + { + std::lock_guard lock(weights_mutex); + if (model_weights.find(src_name) == model_weights.end()) { + model_weights[src_name] = nullptr; + should_create = true; + } + } + if (should_create) { + auto weight_node = create_weight_node(src); + weight_node->set_friendly_name(src_name); + { + std::lock_guard lock(weights_mutex); + model_weights[src_name] = weight_node; + } + } + } + } + } + }); +} + std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { std::shared_ptr weight_node; auto node_type = get_ov_type(tensor); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index b8cc4c4cdf7b4..4d4a928121160 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -101,6 +101,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void set_max_token_len(); int64_t m_max_token_len; + void add_weight_const_parallel(std::map>& model_weights); + struct ggml_cgraph* m_cgraph; std::map m_inputs; std::vector m_input_names; From a04d0df855407885e3d9fc860705c0d35f5a55c1 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 16 May 2025 10:12:22 +0800 Subject: [PATCH 069/156] FIX: set_max_token_len --- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 +++-- ggml/src/ggml-openvino/utils.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a8e1ad5556e74..e6474d6def2d7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -44,13 +44,14 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap dump_cgraph(m_cgraph); } + set_max_token_len(); + static bool weight_created = false; if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) { add_weight_const_parallel(model_weights); weight_created = true; } - set_max_token_len(); for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); @@ -197,7 +198,7 @@ void GgmlOvDecoder::set_max_token_len() { auto* node = m_cgraph->nodes[i]; if (std::string(node->name) == "k-0") { auto* cache_k = node->src[0]; - m_max_token_len = cache_k->ne[0] / node->ne[0] / node->ne[1]; + m_max_token_len = cache_k->ne[0] / node->ne[0] / node->ne[2]; break; } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 09bf0d0ac5843..040ca1961e833 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -209,4 +209,4 @@ void print_output_tensor_info(const std::string& name, default: break; } -} \ No newline at end of file +} From 454988c43d827ab67e60c9141114d7331539f16a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 16 May 2025 10:14:05 +0800 Subject: [PATCH 070/156] PERF: use Slice+Concat in writing cache_v --- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 56 +++++++++++----------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 4ab1502f81b2b..0c4a3d1558468 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -1,13 +1,17 @@ +#include #include #include #include #include +#include #include +#include #include #include #include #include #include +#include #include #include #include @@ -64,42 +68,40 @@ OutputVector translate_cpy(const NodeContext& context) { } else { // Write V to cache_v int64_t total_head_size = src0_shape[1]; + auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); - auto reshaped_src0 = std::make_shared( - src0, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), - false); - auto transposed_src0 = - std::make_shared(reshaped_src0, - ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + + auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); + past_token_len = std::make_shared(past_token_len, zero); + auto total_token_len = std::make_shared(past_token_len, token_len); auto reshaped_src1 = std::make_shared( src1, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), false); - auto transposed_src1 = - std::make_shared(reshaped_src1, - ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); - auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); - token_len = std::make_shared(token_len, - ov::op::v0::Constant::create(ov::element::i64, {0}, {}), - false); - auto total_token_len = std::make_shared(past_token_len, token_len); - std::shared_ptr indices = - std::make_shared(past_token_len, total_token_len, one, ov::element::i64); - indices = std::make_shared( - indices, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); + auto src1_left = std::make_shared( + reshaped_src1, + ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}), + std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0), + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); + + auto src1_right = std::make_shared( + reshaped_src1, + std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0), + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}), + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); - auto res = std::make_shared(transposed_src1, indices, transposed_src0); - auto transposed_res = - std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); - auto reshaped_res = std::make_shared( - transposed_res, + auto reshaped_src0 = std::make_shared( + src0, ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), false); - return {reshaped_res}; + + auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); + + return {res}; } } From aaa13d880a055697e8d0f632444e82a5a2111f4a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 20 May 2025 10:38:15 +0800 Subject: [PATCH 071/156] Update build doc --- docs/build.md | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/docs/build.md b/docs/build.md index 20d34170774c9..07305e21ad664 100644 --- a/docs/build.md +++ b/docs/build.md @@ -577,33 +577,30 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build ## OPENVINO -### Build openvino-llama +### Build openvino - ```bash - git lfs install --skip-smudge - git clone https://github.com/intel-sandbox/openvino-llama.git -b dev_ggml_frontend - cd openvino-llama - git submodule update --init --recursive - - export OPENVINO_LLAMA_PATH=$(pwd) - ``` +```bash +git clone https://github.com/openvinotoolkit/openvino.git +cd openvino +git submodule update --init --recursive +export OPENVINO_DIR=$(pwd) - Before building, change "ENABLE_OV_GGML_FRONTEND" from true to false in the CMakePresets.json file since we already have the code from the ov side in this branch of llama.cpp (`full_backend`). You could also build the master branch of ov instead. +sudo ./install_build_dependencies.sh - ``` - cmake --preset Release - cmake --build build/Release - ``` +mkdir -p build/Release && cd build/Release +cmake -DCMAKE_BUILD_TYPE=Release -DENABLE_DEBUG_CAPS=ON ../.. +``` ### Build llama.cpp-ov - ```bash - git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b full_backend - cd llama.cpp-ov +```bash +git clone https://github.com/intel-sandbox/llama.cpp-ov.git +cd llama.cpp-ov +git switch dev_backend_openvino - cmake --preset ReleaseOV - cmake --build build/ReleaseOV - ``` +cmake --preset ReleaseOV +cmake --build build/ReleaseOV +``` Download the test model file [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from hugging face website. ``` bash @@ -611,12 +608,10 @@ Download the test model file [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingf ``` Execute the following command to test. - ```bash - export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache - # Currently GGML_OPENVINO_WEIGHT_AS_INPUT has better performance - export GGML_OPENVINO_WEIGHT_AS_INPUT=1 - ./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " - ``` +```bash +export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache +./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " +``` Environment variables: - GGML_OPENVINO_WEIGHT_AS_INPUT: From 97b9ba8f3f77cd3cc9c26bd58c535445fc117e55 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 22 May 2025 10:32:18 +0800 Subject: [PATCH 072/156] Add cgraph tensor output name to OV op name --- ggml/src/ggml-openvino/openvino/op/add.cpp | 7 +++---- ggml/src/ggml-openvino/openvino/op/cont.cpp | 13 ++++++------ ggml/src/ggml-openvino/openvino/op/cpy.cpp | 10 ++++----- .../ggml-openvino/openvino/op/get_rows.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/mul.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 11 +++++----- .../src/ggml-openvino/openvino/op/permute.cpp | 2 +- .../src/ggml-openvino/openvino/op/reshape.cpp | 4 ++-- .../ggml-openvino/openvino/op/rms_norm.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/rope.cpp | 11 +++++----- ggml/src/ggml-openvino/openvino/op/scale.cpp | 2 +- .../ggml-openvino/openvino/op/soft_max.cpp | 21 ++++++++++--------- .../ggml-openvino/openvino/op/transpose.cpp | 2 +- .../ggml-openvino/openvino/op/unary_silu.cpp | 2 +- .../openvino/translate_session.cpp | 14 +++++++++++-- ggml/src/ggml-openvino/openvino/utils.cpp | 11 ++++++++++ ggml/src/ggml-openvino/openvino/utils.hpp | 2 ++ 17 files changed, 71 insertions(+), 47 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/add.cpp b/ggml/src/ggml-openvino/openvino/op/add.cpp index 18bc463fb9c50..5a75ff2148c83 100644 --- a/ggml/src/ggml-openvino/openvino/op/add.cpp +++ b/ggml/src/ggml-openvino/openvino/op/add.cpp @@ -11,10 +11,9 @@ namespace op { OutputVector translate_add(const NodeContext& context) { num_inputs_check(context, 2, 2); - auto lhs = context.get_input(0); - auto rhs = context.get_input(1); - auto add = std::make_shared(lhs, rhs); - return {add}; + auto res = std::make_shared(context.get_input(0), context.get_input(1)); + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index a052bf06ca210..7cdfba051ef50 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -22,16 +22,15 @@ OutputVector translate_cont(const NodeContext& context) { auto src_shape = context.get_input_shape(0).to_shape(); auto dst_shape = context.get_output_shape(0).to_shape(); + ov::Output res; if (op_case == 1) { // The input comes from a PERMUTE dst_shape[1] = -1; - auto result = std::make_shared( + res = std::make_shared( context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); - - return {result}; } else { // The input comes from a VIEW // Currently all cases are slicing at lowest dim @@ -43,13 +42,13 @@ OutputVector translate_cont(const NodeContext& context) { std::vector end = {(int64_t)src_shape[0], INT_MAX, split_addr + (int64_t)src_shape[2]}; std::vector strides = {1, 1, 1}; - auto begin_const = ov::op::v0::Constant::create(ov::element::i64, {begin.size()}, begin); + auto begin_const = ov::op::v0::Constant::create(element::i64, {begin.size()}, begin); auto end_const = ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end); auto strides_const = ov::op::v0::Constant::create(ov::element::i64, {strides.size()}, strides); - auto slice = std::make_shared(context.get_input(0), begin_const, end_const, strides_const); - - return {slice}; + res = std::make_shared(context.get_input(0), begin_const, end_const, strides_const); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 0c4a3d1558468..7cdeddce38af3 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -33,6 +33,7 @@ OutputVector translate_cpy(const NodeContext& context) { auto src0 = context.get_input(0); auto src1 = context.get_input(1); auto past_token_len = context.get_input("past_token_len"); + ov::Output res; auto src0_shape = context.get_input_shape(0).to_shape(); auto output_shape = context.get_output_shape(0).to_shape(); @@ -63,8 +64,7 @@ OutputVector translate_cpy(const NodeContext& context) { indices, ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); - auto res = std::make_shared(reshaped_src1, indices, src0); - return {res}; + res = std::make_shared(reshaped_src1, indices, src0); } else { // Write V to cache_v int64_t total_head_size = src0_shape[1]; @@ -99,10 +99,10 @@ OutputVector translate_cpy(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), false); - auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); - - return {res}; + res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 64fc57bd88ad0..ca36548d9fa5b 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -30,7 +30,7 @@ OutputVector translate_get_rows(const NodeContext& context) { res = std::make_shared(res, context.get_output_type(0)); } - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/mul.cpp b/ggml/src/ggml-openvino/openvino/op/mul.cpp index 14473f4e2777b..40caf4331e6b5 100644 --- a/ggml/src/ggml-openvino/openvino/op/mul.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mul.cpp @@ -12,7 +12,7 @@ OutputVector translate_mul(const NodeContext& context) { num_inputs_check(context, 2, 2); auto res = std::make_shared(context.get_input(0), context.get_input(1)); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 5673551f709c1..06e7d9ece0a9c 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -25,12 +25,13 @@ OutputVector translate_mulmat(const NodeContext& context) { int op_case = context.get_op_case(); FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case"); + ov::Output res; + if (op_case == 1) { auto src0 = context.get_input(0); auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0)); auto result_lp = std::make_shared(src1, src0, false, true); - auto result = std::make_shared(result_lp, context.get_output_type(0)); - return {result}; + res = std::make_shared(result_lp, context.get_output_type(0)); } else { /* Two cases here: @@ -118,10 +119,10 @@ OutputVector translate_mulmat(const NodeContext& context) { } auto result_lp = std::make_shared(A, B, false, true); - auto result = std::make_shared(result_lp, context.get_output_type(0)); - - return {result}; + res = std::make_shared(result_lp, context.get_output_type(0)); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 478c9430f00e1..649cf8f3e1cc6 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -15,7 +15,7 @@ OutputVector translate_permute(const NodeContext& context) { auto perm = argsort_descend(context.get_output_stride(0)); auto res = std::make_shared(context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index f6586d674cd54..49551eb81551c 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -37,8 +37,8 @@ OutputVector translate_reshape(const NodeContext& context) { {3}, std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); } - Output res = std::make_shared(context.get_input(0), new_shape_node, false); - return {res}; + auto res = std::make_shared(context.get_input(0), new_shape_node, false); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index a91fffb72d5d1..7b8b582dacf89 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -39,7 +39,7 @@ OutputVector translate_rms_norm(const NodeContext& context) { auto res = std::make_shared(input_node, scale); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index aad156082e556..94810e549d5ef 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -52,6 +52,8 @@ void ggml_rope_yarn_corr_dims(int n_dims, OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); + ov::Output res; + auto data_node = context.get_input(0); auto pos_node = context.get_input(1); pos_node = std::make_shared(pos_node, ov::element::f32); @@ -141,9 +143,7 @@ OutputVector translate_rope(const NodeContext& context) { ov::element::i64, Shape{3}, std::vector{-1, input_shape[1].get_length(), input_shape[2].get_length()}); - auto reshaped = std::make_shared(stack, shape_const, false); - - return {reshaped}; + res = std::make_shared(stack, shape_const, false); } else { auto slice_node = std::make_shared(data_node, @@ -160,9 +160,10 @@ OutputVector translate_rope(const NodeContext& context) { std::make_shared(slice_data_node_0, sin_theta_node), std::make_shared(slice_data_node_1, cos_theta_node)); - auto res_node = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); - return {res_node}; + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index b393dd8aa2c2f..8f0999432ce6f 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -19,7 +19,7 @@ OutputVector translate_scale(const NodeContext& context) { auto res = std::make_shared(context.get_input(0), scale_node); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 549c35a9b6a0d..bb6b0023953f2 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -24,6 +24,7 @@ OutputVector translate_soft_max(const NodeContext& context) { num_inputs_check(context, 1, 2); auto input_node = context.get_input(0); + ov::Output res; float scale = 1.0f; float max_bias = 0.0f; @@ -56,13 +57,13 @@ OutputVector translate_soft_max(const NodeContext& context) { } // Stride slice mask node - Output mask_begin_node = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); + Output slice_start = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}); - auto input_last_two_dim = get_dimensions(input_node.get_node_shared_ptr(), {1, 2}); - auto mask_slice_shape = std::make_shared(ov::NodeVector{one, input_last_two_dim}, 0); - Output mask_stride_node = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 1}); - auto mask_node_sliced = - std::make_shared(mask_node, mask_begin_node, mask_slice_shape, mask_stride_node); + auto token_len = get_dimensions(input_node.get_node_shared_ptr(), {1}); + auto total_token_len = get_dimensions(mask_node.get_node_shared_ptr(), {2}); + auto slice_end = std::make_shared(ov::NodeVector{one, token_len, total_token_len}, 0); + Output slice_stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 1}); + auto mask_node_sliced = std::make_shared(mask_node, slice_start, slice_end, slice_stride); // slope * mask auto slope_node = @@ -73,13 +74,13 @@ OutputVector translate_soft_max(const NodeContext& context) { auto input_slope_mask_node = std::make_shared(input_node, slope_mask_node); // Calculate softmax - auto res = std::make_shared(input_slope_mask_node, 2); - return {res}; + res = std::make_shared(input_slope_mask_node, 2); } else { // Directly softmax - auto res = std::make_shared(input_node, 0); - return {res}; + res = std::make_shared(input_node, 0); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index 7d33ca9d6170b..99178a1944bc0 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -14,7 +14,7 @@ OutputVector translate_transpose(const NodeContext& context) { auto perm = argsort_descend(context.get_output_stride(0)); auto res = std::make_shared(context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp index 1c396e6aaf9c2..6c73653ca4dba 100644 --- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -17,7 +17,7 @@ OutputVector translate_unary_silu(const NodeContext& context) { auto sigmoid = std::make_shared(input); auto res = std::make_shared(input, sigmoid); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 910a0d833663d..8eda23c1c532f 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,5 +1,8 @@ #include "translate_session.hpp" +#include +#include + #include "input_model.hpp" namespace ov { @@ -91,11 +94,18 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo used_params.push_back(param); } } - if (auto diff = params.size() - used_params.size()) { - std::cout << diff << " parameters are not used in the model." << std::endl; + if (getenv("GGML_OPENVINO_PROFILING")) { + if (auto diff = params.size() - used_params.size()) { + std::cout << diff << " parameters are not used in the model." << std::endl; + } } resulting_model = std::make_shared(results, used_params); + ov::pass::Manager manager; + manager.set_per_pass_validation(true); + manager.register_pass(); + manager.run_passes(resulting_model); + return resulting_model; } diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index ff16e9d4ae20d..69e26f05ca095 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -47,6 +47,17 @@ std::shared_ptr get_dimensions(const std::shared_ptr& node, return get_dimensions(std::make_shared(node), dims); } +OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix) { + for (const auto& output : outputs) { + auto node = output.get_node_shared_ptr(); + std::string name = node->get_friendly_name(); + name += "_"; + name += suffix; + node->set_friendly_name(name); + } + return outputs; +} + } // namespace ggml } // namespace frontend } // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp index 6e106fa932bfc..e0fe25078992d 100644 --- a/ggml/src/ggml-openvino/openvino/utils.hpp +++ b/ggml/src/ggml-openvino/openvino/utils.hpp @@ -55,6 +55,8 @@ std::vector permute(const std::vector& x, const std::vector& perm) { std::shared_ptr get_dimensions(const std::shared_ptr& shape, const std::vector& dims); std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims); +OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix); + namespace op { template OutputVector translate_1to1_match_2_inputs(const NodeContext& context) { From 2d743026e90b758ad1ef7d272ef44e7bd123fa0e Mon Sep 17 00:00:00 2001 From: Ravi Panchumarthy Date: Wed, 28 May 2025 18:32:18 -0700 Subject: [PATCH 073/156] Update openvino build instructions --- docs/build.md | 135 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 97 insertions(+), 38 deletions(-) diff --git a/docs/build.md b/docs/build.md index 07305e21ad664..4ee3666eab88c 100644 --- a/docs/build.md +++ b/docs/build.md @@ -13,6 +13,21 @@ cd llama.cpp The following sections describe how to build with different backends and options. +* [CPU Build](#cpu-build) +* [BLAS Build](#blas-build) +* [Metal Build](#metal-build) +* [SYCL](#sycl) +* [CUDA](#cuda) +* [MUSA](#musa) +* [HIP](#hip) +* [Vulkan](#vulkan) +* [CANN](#cann) +* [Arm® KleidiAI™](#arm-kleidiai) +* [OpenCL](#opencl) +* [Android](#android-1) +* [OPENVINO](#openvino) +* [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends) + ## CPU Build Build llama.cpp using `CMake`: @@ -577,62 +592,106 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build ## OPENVINO -### Build openvino +[OpenVINO](https://docs.openvino.ai/2025/index.html) is a open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp. -```bash -git clone https://github.com/openvinotoolkit/openvino.git -cd openvino -git submodule update --init --recursive -export OPENVINO_DIR=$(pwd) +Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support. + +### 1. Install OpenVINO Runtime -sudo ./install_build_dependencies.sh +- Follow the guide to install OpenVINO Runtime from an archive file: **[Install OpenVINO™ Runtime on Linux from an Archive File.](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html)** -mkdir -p build/Release && cd build/Release -cmake -DCMAKE_BUILD_TYPE=Release -DENABLE_DEBUG_CAPS=ON ../.. +- After installation, make sure to [source the environment setup script](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html#step-2-configure-the-environment): +```bash +source /opt/intel/openvino_2025.1.0/setupvars.sh ``` +- Verify OpenVINO is initialized properly +```bash +echo $OpenVINO_DIR +``` + +### 2. Build llama.cpp with OpenVINO Backend -### Build llama.cpp-ov +Clone the OpenVINO-enabled llama.cpp fork and build it: ```bash -git clone https://github.com/intel-sandbox/llama.cpp-ov.git -cd llama.cpp-ov +git clone https://github.com/ravi9/llama.cpp.git +cd llama.cpp git switch dev_backend_openvino +# Build with OpenVINO support cmake --preset ReleaseOV -cmake --build build/ReleaseOV +cmake --build build/ReleaseOV --parallel + ``` -Download the test model file [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from hugging face website. - ``` bash - wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf?download=true -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf - ``` +### 3. Download Sample Model + +Download the Phi-3 mini model for testing: + +```bash +# Create models directory +mkdir -p ~/models/Phi-3-mini-4k-instruct-gguf + +# Download model file +wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \ + -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf + +``` + +### 4. Run inference with OpenVINO backend: + +When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster. -Execute the following command to test. ```bash export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache -./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " -``` - -Environment variables: -- GGML_OPENVINO_WEIGHT_AS_INPUT: - Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them. -- GGML_OPENVINO_CACHE_DIR: - If set, model caching in OpenVINO will be used. -- GGML_OPENVINO_DUMP_CGRAPH: - Dumped the compute graph to "cgraph.txt". Note that the the compute graph is different for every token, so the later cgraph will overwrite the previous one. -- GGML_OPENVINO_PROFILING: - Print the time taken for each phase in the OpenVINO backend. -- GGML_OPENVINO_DUMP_IR: - Dump the converted OpenVINO IR. The filenames are timestamps. -- GGML_OPENVINO_DEBUG_INPUT -- GGML_OPENVINO_DEBUG_OUTPUT - -To use Llama.cpp's builtin CPU backend: + +./build/ReleaseOV/bin/llama-simple \ + -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \ + -n 50 \ + "Hello, my name is " + +``` + +### Using Llama.cpp's Built-in CPU Backend (for Comparison) + +To compare performance with the deafult CPU backend: + ```bash +# Build CPU-only version cmake --preset ReleaseCPU -cmake --build build/ReleaseCPU +cmake --build build/ReleaseCPU --parallel + +# Run with Default CPU backend +./build/ReleaseCPU/bin/llama-simple \ + -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \ + -n 50 \ + "Hello, my name is " + +``` + +### Configuration Options + +Control OpenVINO behavior using these environment variables: + +- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. +- **`GGML_OPENVINO_WEIGHT_AS_INPUT`**: Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them. +- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling +- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt` +- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps +- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging +- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging + +### Example with Profiling + +```bash +export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache +export GGML_OPENVINO_PROFILING=1 + +./build/ReleaseOV/bin/llama-simple \ + -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \ + -n 50 \ + "Hello, my name is " -./build/ReleaseCPU/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " ``` ## Notes about GPU-accelerated backends From 4f2036dc57fc9805f95cd5b672d93e172be1fd42 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 27 May 2025 16:51:14 +0800 Subject: [PATCH 074/156] Add initial NPU support --- ggml/src/ggml-openvino/ggml-decoder.cpp | 54 ++++----- ggml/src/ggml-openvino/ggml-decoder.h | 13 ++- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 + .../ggml-openvino/openvino/node_context.hpp | 3 + ggml/src/ggml-openvino/openvino/op/cpy.cpp | 106 ++++++++++++++---- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 8 +- .../ggml-openvino/openvino/op/rms_norm.cpp | 23 ++-- ggml/src/ggml-openvino/openvino/op/rope.cpp | 5 +- ggml/src/ggml-openvino/utils.cpp | 86 +++++++++----- 9 files changed, 201 insertions(+), 99 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index e6474d6def2d7..7bb092a65cae8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -25,14 +26,16 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph) +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) : m_cgraph(cgraph), m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), + m_is_static(is_static), + m_is_first_token(is_first_token) { static std::map> model_weights; if (m_node) { - set_input_output(m_node, model_weights); + set_input_output(m_node); } else { static bool printed = false; if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { @@ -47,7 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap set_max_token_len(); static bool weight_created = false; - if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) { + if (!weight_created) { add_weight_const_parallel(model_weights); weight_created = true; } @@ -55,7 +58,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); - set_input_output(cur_node, model_weights); + set_input_output(cur_node); } m_model_weights = model_weights; @@ -65,8 +68,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; // 2. constructing a decoder for a node. -void GgmlOvDecoder::set_input_output(ggml_tensor* node, - std::map>& model_weights) { +void GgmlOvDecoder::set_input_output(ggml_tensor* node) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -95,21 +97,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, if (!m_node && !src->view_src) { ggml_backend_buffer* buffer = src->buffer; - if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT"); - auto& weights_map = weight_as_input ? m_model_inputs : model_weights; - if (weights_map.find(src_name) != weights_map.end()) { - continue; - } - - std::shared_ptr weight_node = - weight_as_input - ? std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}) - : create_weight_node(src); - weight_node->set_friendly_name(src_name); - weights_map[src_name] = weight_node; - - } else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0); @@ -119,10 +107,24 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, } ov::PartialShape input_shape; if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { - input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; + if (m_is_static) { + input_shape = ov::PartialShape(get_shape(src)); + // if (m_is_first_token) { + // input_shape = ov::PartialShape{1, 1, m_max_token_len}; + // } else { + // input_shape = ov::PartialShape{1, 1, 1}; + // } + } else { + input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; + } } else if (std::string(src->name).find("KQ_mask") == 0) { - auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); - input_shape = ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)}; + if (m_is_static) { + input_shape = ov::PartialShape(get_shape(src)); + } else { + auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); + input_shape = + ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)}; + } } else { input_shape = ov::Shape{get_shape(src)}; } @@ -510,7 +512,7 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { - auto decoder = std::make_shared(node, m_cgraph); + auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_is_first_token); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 4d4a928121160..b372cc80404bc 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -12,7 +12,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; - GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph); + GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; @@ -89,8 +89,15 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_model_output_names; } + virtual bool is_static() const override { + return m_is_static; + } + virtual bool is_first_token() const { + return m_is_first_token; + } + private: - void set_input_output(ggml_tensor* node, std::map>& model_weights); + void set_input_output(ggml_tensor* node); void add_extra_inputs(); static void dump_cgraph(const struct ggml_cgraph* cgraph); static std::vector get_shape(const ggml_tensor* tensor); @@ -119,6 +126,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> m_model_extra_input_values; std::map> m_model_weights; std::vector m_model_output_names; + bool m_is_static; + bool m_is_first_token; }; void print_tensor_address_map(const struct ggml_cgraph* cgraph); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index b3cf75817fcd9..a0b9509336349 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -55,6 +55,8 @@ class GgmlDecoder : public DecoderBase { virtual const std::map>& get_model_extra_inputs() const = 0; virtual const std::map>& get_model_weights() const = 0; virtual const std::vector& get_model_output_names() const = 0; + + virtual bool is_static() const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index 44f55222e35db..f5940585a618e 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -84,6 +84,9 @@ class NodeContext : public frontend::NodeContext { int get_op_case() const { return m_decoder->get_op_case(); } + bool is_static() const { + return m_decoder->is_static(); + } private: std::shared_ptr m_decoder; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 7cdeddce38af3..fe755a5f647b4 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -12,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +59,13 @@ OutputVector translate_cpy(const NodeContext& context) { token_len = std::make_shared(token_len, ov::op::v0::Constant::create(ov::element::i64, {0}, {}), false); + + if (context.is_static()) { + int32_t* op_params = context.get_input_op_params(1); + int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size; + past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val}); + } + auto total_token_len = std::make_shared(past_token_len, token_len); std::shared_ptr indices = std::make_shared(past_token_len, total_token_len, one, ov::element::i64); @@ -67,39 +76,88 @@ OutputVector translate_cpy(const NodeContext& context) { res = std::make_shared(reshaped_src1, indices, src0); } else { // Write V to cache_v - int64_t total_head_size = src0_shape[1]; - auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + + auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); + auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1}); + + int64_t total_head_size = src0_shape[1]; + auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); + auto total_head_size_scalar = std::make_shared(total_head_size_node, zero); auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); - past_token_len = std::make_shared(past_token_len, zero); - auto total_token_len = std::make_shared(past_token_len, token_len); + auto token_len_scalar = std::make_shared(token_len, zero); + if (context.is_static()) { + int32_t* op_params = context.get_input_op_params(1); + int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2]; + past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val}); + } + auto total_token_len_scalar = std::make_shared(past_token_len, token_len_scalar); + + // auto reshaped_src1 = std::make_shared( + // src1, + // ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + // false); + + // auto src1_left = std::make_shared( + // reshaped_src1, + // ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}), + // std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0), + // ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); + + // auto src1_right = std::make_shared( + // reshaped_src1, + // std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0), + // ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}), + // ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); + + // auto reshaped_src0 = std::make_shared( + // src0, + // ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + // false); + + // auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); + + // 1D tensor of shape [total_head_size], values starting from 0 + auto range_row = + std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64); + auto range_row_reshaped = + std::make_shared(range_row, + ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2})); + auto row_indices = std::make_shared( + range_row_reshaped, + std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); + + // 1D tensor of shape [token_len], values starting from past_token_len + auto range_col = + std::make_shared(past_token_len, total_token_len_scalar, one_scalar, element::i64); + auto range_col_reshaped = + std::make_shared(range_col, + ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); + auto col_indices = std::make_shared( + range_col_reshaped, + std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); + + // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2] + auto indices = std::make_shared(OutputVector{row_indices, col_indices}, 2); + auto indices_final = std::make_shared( + indices, + ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), + false); + auto flattend_src0 = + std::make_shared(src0, + ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}), + false); auto reshaped_src1 = std::make_shared( src1, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), - false); - - auto src1_left = std::make_shared( - reshaped_src1, - ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}), - std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); - - auto src1_right = std::make_shared( - reshaped_src1, - std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0), - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); - - auto reshaped_src0 = std::make_shared( - src0, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), false); - res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); + auto updated = std::make_shared(reshaped_src1, indices_final, flattend_src0); + res = std::make_shared(updated, zero); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 06e7d9ece0a9c..20ad5683b8d45 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -55,17 +55,21 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output A; ov::Output B; - auto attention_size = context.get_input("attention_size"); - auto src0 = context.get_input(0); auto src0_shape = context.get_input_shape(0).to_shape(); auto src0_stride = context.get_input_stride(0); auto permuted = is_permuted(src0_stride); auto token_dim = permuted ? 0 : 2; + auto attention_size = context.get_input("attention_size"); + auto src0_perm = argsort_descend(src0_stride); auto src0_original_shape_ = permute(src0_shape, src0_perm); std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end()); + + if (context.is_static()) { + attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]}); + } src0_original_shape[token_dim] = -1; auto src0_slice_shape = src0_original_shape; diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 7b8b582dacf89..4b230ad630bfa 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -1,8 +1,9 @@ +#include #include #include #include #include -#include +#include #include #include "../node_context.hpp" @@ -16,28 +17,24 @@ namespace op { OutputVector translate_rms_norm(const NodeContext& context) { num_inputs_check(context, 1, 1); - ov::Shape input_shape = context.get_input_shape(0).to_shape(); auto input_node = context.get_input(0); auto square = std::make_shared(input_node, input_node); - auto reduce_sum = - std::make_shared(square, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), - true); - - auto mean = std::make_shared( - reduce_sum, - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast(input_shape[2])})); + auto mean = + std::make_shared(square, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), + true); float eps; memcpy(&eps, context.get_output_op_params(0), sizeof(float)); + auto rms = std::make_shared( std::make_shared(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}))); - auto scale = - std::make_shared(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), rms); + auto reciprocal = + std::make_shared(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {1.0f}), rms); - auto res = std::make_shared(input_node, scale); + auto res = std::make_shared(input_node, reciprocal); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 94810e549d5ef..b47b8a6a54bde 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -1,4 +1,3 @@ - #include #include #include @@ -23,6 +22,10 @@ #include "../node_context.hpp" #include "../utils.hpp" +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + #define GGML_ROPE_TYPE_NEOX 2 #define MIN(a, b) ((a) < (b) ? (a) : (b)) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 040ca1961e833..65a609f1d7e60 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -4,11 +4,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -17,8 +19,8 @@ #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" -std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph) { - return std::make_shared(nullptr, cgraph); +std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) { + return std::make_shared(nullptr, cgraph, is_static, is_first_token); } ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) { @@ -49,50 +51,63 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { } enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { + static ov::Core core; + static bool is_first_token = true; + + static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; + if (device.empty()) { + // Prefer GPU over CPU + for (const auto& dev : core.get_available_devices()) { + device = dev; + if (device == "GPU") + break; + } + } + + bool is_static = device == "NPU" ? true : false; + ov::AnyMap config; + if (is_static) { + config = { + {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"}, + {"NPU_USE_NPUW", "YES"}, + {"NPUW_DEVICES", "NPU"}, + {"NPUW_FOLD", "YES"}, + // {"NPU_COMPILER_TYPE", "MLIR"}, + }; + } + auto start_time = ggml_time_us(); - static ov::Core core; auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); - if (cache_dir) { + if (cache_dir && !is_static) { core.set_property(ov::cache_dir(cache_dir)); } - // auto devices = core.get_available_devices(); - // static auto front_end = get_ggml_frontend(); - // if (!front_end) { - // GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); - // return GGML_STATUS_FAILED; - // } - - using CachedItem = std::pair, ov::CompiledModel>; + // For CPU and GPU, there is only one compiled model, so only use the first element of the pair + // For NPU, there are prefill model and kvcache model (This is the ideal approach, but not implemented yet, + // currently recompile for every token) + using CachedItem = std::pair, std::pair>; static std::unordered_map compiled_cache; std::shared_ptr model; - ov::CompiledModel compiled_model; + ov::CompiledModel compiled_model_prefill; + ov::CompiledModel compiled_model_kvcache; int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; - auto ggml_decoder = get_ggml_decoder(cgraph); + auto ggml_decoder = get_ggml_decoder(cgraph, is_static, is_first_token); decoder_end_time = ggml_time_us(); auto it = compiled_cache.find(cgraph); - if (it != compiled_cache.end()) { + if (it != compiled_cache.end() && !is_static) { model = it->second.first; conversion_end_time = ggml_time_us(); - compiled_model = it->second.second; + compiled_model_prefill = it->second.second.first; + compiled_model_kvcache = it->second.second.second; compile_end_time = ggml_time_us(); } else { - // std::shared_ptr graph_decoder = ggml_decoder; - // ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); - // if (!input_model) { - // GGML_LOG_ERROR("Input Model is not loaded \n"); - // return GGML_STATUS_FAILED; - // } - - // model = front_end->convert(input_model); - ov::frontend::InputModel::Ptr input_model = std::make_shared(ggml_decoder); model = ov::frontend::ggml::FrontEnd::convert(input_model); @@ -105,16 +120,23 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model, timestamped_filename); } - if (!model) { - GGML_LOG_ERROR("Model is not converted \n"); - } - compiled_model = core.compile_model(model, "CPU"); + compiled_model_prefill = core.compile_model(model, device, config); compile_end_time = ggml_time_us(); - compiled_cache[cgraph] = std::make_pair(model, compiled_model); + compiled_cache[cgraph] = std::make_pair(model, std::make_pair(compiled_model_prefill, compiled_model_kvcache)); } - ov::InferRequest infer_request = compiled_model.create_infer_request(); + ov::InferRequest infer_request; + if (!is_static) { + infer_request = compiled_model_prefill.create_infer_request(); + } else { + infer_request = compiled_model_prefill.create_infer_request(); + // if (is_first_token) { + // infer_request = compiled_model_prefill.create_infer_request(); + // } else { + // infer_request = compiled_model_kvcache.create_infer_request(); + // } + } auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { @@ -148,6 +170,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c } auto end_time = ggml_time_us(); + is_first_token = false; + if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("GGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); From 320a348223400c6f05829290dae3ac0e626b3f72 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 29 May 2025 17:53:00 +0800 Subject: [PATCH 075/156] draft NPU support version 2: prefill + kvcache --- ggml/src/ggml-openvino/ggml-decoder.cpp | 27 +-- ggml/src/ggml-openvino/ggml-decoder.h | 7 +- ggml/src/ggml-openvino/openvino/decoder.hpp | 3 + .../ggml-openvino/openvino/node_context.hpp | 7 + ggml/src/ggml-openvino/openvino/op/cpy.cpp | 90 ++++------ ggml/src/ggml-openvino/utils.cpp | 163 +++++++++++++----- ggml/src/ggml-openvino/utils.h | 27 ++- 7 files changed, 211 insertions(+), 113 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 7bb092a65cae8..29be4dbae883e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -108,22 +108,25 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { ov::PartialShape input_shape; if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { if (m_is_static) { - input_shape = ov::PartialShape(get_shape(src)); - // if (m_is_first_token) { - // input_shape = ov::PartialShape{1, 1, m_max_token_len}; - // } else { - // input_shape = ov::PartialShape{1, 1, 1}; - // } + if (m_is_first_token) { + input_shape = ov::PartialShape{1, 1, m_max_token_len}; + } else { + input_shape = ov::PartialShape{1, 1, 1}; + } } else { input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; } - } else if (std::string(src->name).find("KQ_mask") == 0) { + } else if (std::string(src->name) == "KQ_mask") { if (m_is_static) { - input_shape = ov::PartialShape(get_shape(src)); + if (m_is_first_token) { + input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len}; + } else { + input_shape = ov::PartialShape{1, 1, m_max_token_len}; + } } else { - auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); + auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); input_shape = - ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)}; + ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)}; } } else { input_shape = ov::Shape{get_shape(src)}; @@ -208,6 +211,7 @@ void GgmlOvDecoder::set_max_token_len() { void GgmlOvDecoder::add_extra_inputs() { int64_t past_token_len; + // attention_size not used for NPU int64_t attention_size; for (const auto& node : m_nodes) { @@ -231,8 +235,7 @@ void GgmlOvDecoder::add_extra_inputs() { for (const auto& node : m_nodes) { if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) { int64_t total_token_len = node->src[1]->ne[0] + past_token_len; - attention_size = (total_token_len + 31) / 32 * 32; - + attention_size = GGML_PAD(total_token_len, 32); std::string name = "attention_size"; auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index b372cc80404bc..2c89d062676d8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -92,9 +92,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool is_static() const override { return m_is_static; } - virtual bool is_first_token() const { + virtual bool is_first_token() const override { return m_is_first_token; } + virtual int get_max_token_len() const override { + return m_max_token_len; + } private: void set_input_output(ggml_tensor* node); @@ -106,7 +109,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static std::shared_ptr create_weight_node(ggml_tensor* tensor); void set_max_token_len(); - int64_t m_max_token_len; + int m_max_token_len; void add_weight_const_parallel(std::map>& model_weights); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index a0b9509336349..62125683995fc 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -57,6 +58,8 @@ class GgmlDecoder : public DecoderBase { virtual const std::vector& get_model_output_names() const = 0; virtual bool is_static() const = 0; + virtual bool is_first_token() const = 0; + virtual int get_max_token_len() const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index f5940585a618e..f4e7c4e31f4e3 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include "decoder.hpp" @@ -87,6 +88,12 @@ class NodeContext : public frontend::NodeContext { bool is_static() const { return m_decoder->is_static(); } + bool is_first_token() const { + return m_decoder->is_first_token(); + } + int get_max_token_len() const { + return m_decoder->get_max_token_len(); + } private: std::shared_ptr m_decoder; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index fe755a5f647b4..75dd0e7d83ab0 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -34,18 +34,26 @@ OutputVector translate_cpy(const NodeContext& context) { auto src0 = context.get_input(0); auto src1 = context.get_input(1); - auto past_token_len = context.get_input("past_token_len"); + auto past_token_len_scalar = context.get_input("past_token_len"); + + src0 = std::make_shared(src0, context.get_input_type(1)); ov::Output res; + if (context.is_static() && context.is_first_token()) { + res = src0; + return rename_outputs_with_suffix({res}, context.get_name()); + } + auto src0_shape = context.get_input_shape(0).to_shape(); auto output_shape = context.get_output_shape(0).to_shape(); std::vector input0_strides = context.get_input_stride(0); std::vector output_strides = context.get_output_stride(0); - auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); - src0 = std::make_shared(src0, src1); if (op_case == 1) { // Write K to cache_k int64_t head_size = src0_shape[2]; @@ -56,32 +64,29 @@ OutputVector translate_cpy(const NodeContext& context) { auto reshaped_src1 = std::make_shared(src1, reshaped_src1_shape, false); auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0}); - token_len = std::make_shared(token_len, - ov::op::v0::Constant::create(ov::element::i64, {0}, {}), - false); + auto token_len_scalar = std::make_shared(token_len, zero); + std::shared_ptr indices; if (context.is_static()) { - int32_t* op_params = context.get_input_op_params(1); - int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size; - past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val}); + indices = past_token_len_scalar.get_node_shared_ptr(); + indices = std::make_shared( + indices, + ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{0, 1})); + } else { + auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); + indices = std::make_shared(past_token_len_scalar, + total_token_len_scalar, + one_scalar, + ov::element::i64); + indices = std::make_shared(indices, one); } - auto total_token_len = std::make_shared(past_token_len, token_len); - std::shared_ptr indices = - std::make_shared(past_token_len, total_token_len, one, ov::element::i64); - indices = std::make_shared( - indices, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); - res = std::make_shared(reshaped_src1, indices, src0); } else { // Write V to cache_v - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); - auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1}); int64_t total_head_size = src0_shape[1]; auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); @@ -89,36 +94,6 @@ OutputVector translate_cpy(const NodeContext& context) { auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); auto token_len_scalar = std::make_shared(token_len, zero); - if (context.is_static()) { - int32_t* op_params = context.get_input_op_params(1); - int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2]; - past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val}); - } - auto total_token_len_scalar = std::make_shared(past_token_len, token_len_scalar); - - // auto reshaped_src1 = std::make_shared( - // src1, - // ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), - // false); - - // auto src1_left = std::make_shared( - // reshaped_src1, - // ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}), - // std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0), - // ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); - - // auto src1_right = std::make_shared( - // reshaped_src1, - // std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0), - // ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}), - // ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); - - // auto reshaped_src0 = std::make_shared( - // src0, - // ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), - // false); - - // auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); // 1D tensor of shape [total_head_size], values starting from 0 auto range_row = @@ -131,8 +106,19 @@ OutputVector translate_cpy(const NodeContext& context) { std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); // 1D tensor of shape [token_len], values starting from past_token_len - auto range_col = - std::make_shared(past_token_len, total_token_len_scalar, one_scalar, element::i64); + std::shared_ptr range_col; + if (context.is_static()) { + range_col = past_token_len_scalar.get_node_shared_ptr(); + range_col = std::make_shared( + range_col, + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{0})); + } else { + auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); + range_col = std::make_shared(past_token_len_scalar, + total_token_len_scalar, + one_scalar, + ov::element::i64); + } auto range_col_reshaped = std::make_shared(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 65a609f1d7e60..3e49081515902 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,5 +1,7 @@ #include "utils.h" +#include +#include #include #include #include @@ -13,6 +15,7 @@ #include #include #include +#include #include "ggml-impl.h" #include "ggml.h" @@ -52,7 +55,6 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { static ov::Core core; - static bool is_first_token = true; static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; if (device.empty()) { @@ -66,12 +68,16 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c bool is_static = device == "NPU" ? true : false; ov::AnyMap config; - if (is_static) { + if (device == "NPU") { config = { {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"}, {"NPU_USE_NPUW", "YES"}, {"NPUW_DEVICES", "NPU"}, {"NPUW_FOLD", "YES"}, + {"NPUW_DQ", "YES"}, + {"NPUW_FUNCALL_ASYNC", "YES"}, + {"NPUW_HOST_GATHER", "YES"}, + {"NPUW_WEIGHTS_BANK", "shared"}, // {"NPU_COMPILER_TYPE", "MLIR"}, }; } @@ -83,69 +89,128 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c core.set_property(ov::cache_dir(cache_dir)); } - // For CPU and GPU, there is only one compiled model, so only use the first element of the pair - // For NPU, there are prefill model and kvcache model (This is the ideal approach, but not implemented yet, - // currently recompile for every token) - using CachedItem = std::pair, std::pair>; - static std::unordered_map compiled_cache; + // CPU and GPU will only use cache_prefill + using CachedItem = std::pair, ov::CompiledModel>; + static std::unordered_map compiled_cache_prefill; + static std::unordered_map compiled_cache_kvcache; + std::shared_ptr ggml_decoder; std::shared_ptr model; - ov::CompiledModel compiled_model_prefill; - ov::CompiledModel compiled_model_kvcache; + ov::CompiledModel compiled_model; + int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; - auto ggml_decoder = get_ggml_decoder(cgraph, is_static, is_first_token); - decoder_end_time = ggml_time_us(); + auto it = compiled_cache_prefill.find(cgraph); + bool is_first_token = it == compiled_cache_prefill.end(); + if (!is_first_token) { + ggml_decoder = get_ggml_decoder(cgraph, is_static, false); + decoder_end_time = ggml_time_us(); - auto it = compiled_cache.find(cgraph); - if (it != compiled_cache.end() && !is_static) { - model = it->second.first; + if (is_static) { + model = compiled_cache_kvcache[cgraph].first; + compiled_model = compiled_cache_kvcache[cgraph].second; + } else { + model = it->second.first; + compiled_model = it->second.second; + } conversion_end_time = ggml_time_us(); - - compiled_model_prefill = it->second.second.first; - compiled_model_kvcache = it->second.second.second; - compile_end_time = ggml_time_us(); + compile_end_time = conversion_end_time; } else { - ov::frontend::InputModel::Ptr input_model = std::make_shared(ggml_decoder); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - - conversion_end_time = ggml_time_us(); - - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); + if (is_static) { + ggml_decoder = get_ggml_decoder(cgraph, is_static, true); + auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false); + decoder_end_time = ggml_time_us(); + + auto input_model = std::make_shared(ggml_decoder); + auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache); + + model = ov::frontend::ggml::FrontEnd::convert(input_model); + auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); + conversion_end_time = ggml_time_us(); + + compiled_model = core.compile_model(model, device, config); + auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config); + compile_end_time = ggml_time_us(); + + compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model); + compiled_cache_kvcache[cgraph] = std::make_pair(model_kvcache, compiled_model_kvcache); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp); + ov::serialize(model_kvcache, timestamped_filename); + } + } else { + ggml_decoder = get_ggml_decoder(cgraph, is_static, true); + decoder_end_time = ggml_time_us(); + + auto input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); + conversion_end_time = ggml_time_us(); + + compiled_model = core.compile_model(model, device, config); + compile_end_time = ggml_time_us(); + compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } } - - compiled_model_prefill = core.compile_model(model, device, config); - compile_end_time = ggml_time_us(); - - compiled_cache[cgraph] = std::make_pair(model, std::make_pair(compiled_model_prefill, compiled_model_kvcache)); - } - - ov::InferRequest infer_request; - if (!is_static) { - infer_request = compiled_model_prefill.create_infer_request(); - } else { - infer_request = compiled_model_prefill.create_infer_request(); - // if (is_first_token) { - // infer_request = compiled_model_prefill.create_infer_request(); - // } else { - // infer_request = compiled_model_kvcache.create_infer_request(); - // } } + auto infer_request = compiled_model.create_infer_request(); auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); ov::Tensor input_tensor; + if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); - } else { + + } else if (!is_static) { input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); + + } else { + if (param_name == "inp_tokens" || param_name == "inp_pos") { + if (is_first_token) { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, 0); + input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len}); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); + } + + } else if (param_name == "KQ_mask") { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + if (is_first_token) { + std::vector padded_data = + pad_input(input_tensor_ggml, max_token_len, max_token_len, -INFINITY); + set_zero_diagonal(padded_data, max_token_len); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, max_token_len, max_token_len}); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len}); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } + + } else { + input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); + } } infer_request.set_input_tensor(i, input_tensor); @@ -234,3 +299,9 @@ void print_output_tensor_info(const std::string& name, break; } } + +void set_zero_diagonal(std::vector& matrix, size_t dim) { + for (size_t i = 0; i < dim; ++i) { + matrix[i * dim + i] = 0.0f; + } +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 88c182d9edb71..000c2b87c1b3f 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,12 +1,37 @@ +#include + #include "ggml-backend-impl.h" #include "ggml-decoder.h" enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); +std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); + +ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name); + +std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder); + size_t checksum(const void* data, size_t size); void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor); void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, - std::map& output_dst); \ No newline at end of file + std::map& output_dst); + +template +std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) { + std::vector padded_data(padded_rows * padded_cols, pad_value); + size_t rows = tensor->ne[1]; + size_t cols = tensor->ne[0]; + T* data = static_cast(tensor->data); + + for (size_t i = 0; i < std::min(rows, padded_rows); ++i) { + for (size_t j = 0; j < std::min(cols, padded_cols); ++j) { + padded_data[i * padded_cols + j] = data[i * cols + j]; + } + } + return padded_data; +} + +void set_zero_diagonal(std::vector& matrix, size_t dim); From 24d5c02805705b1d7ac22f9acd13c99f9db3485c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 3 Jun 2025 14:22:51 +0800 Subject: [PATCH 076/156] NPU support version 2: prefill + kvcache --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 16 +++--- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 3 +- ggml/src/ggml-openvino/utils.cpp | 54 +++++++++++++------ ggml/src/ggml-openvino/utils.h | 3 ++ 5 files changed, 52 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 29be4dbae883e..66f82773e30d4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -222,11 +222,11 @@ void GgmlOvDecoder::add_extra_inputs() { past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads); std::string name = "past_token_len"; - auto param_node = std::make_shared(ov::element::i64, ov::Shape{}); + auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); m_model_extra_inputs[name] = param_node; - auto tensor = std::make_shared(ov::element::i64, ov::Shape{}); + auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); *tensor->data() = past_token_len; m_model_extra_input_values[name] = tensor; break; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 75dd0e7d83ab0..49736450242a3 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -34,7 +34,7 @@ OutputVector translate_cpy(const NodeContext& context) { auto src0 = context.get_input(0); auto src1 = context.get_input(1); - auto past_token_len_scalar = context.get_input("past_token_len"); + auto past_token_len = context.get_input("past_token_len"); src0 = std::make_shared(src0, context.get_input_type(1)); ov::Output res; @@ -68,18 +68,16 @@ OutputVector translate_cpy(const NodeContext& context) { std::shared_ptr indices; if (context.is_static()) { - indices = past_token_len_scalar.get_node_shared_ptr(); - indices = std::make_shared( - indices, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{0, 1})); + indices = past_token_len.get_node_shared_ptr(); } else { + auto past_token_len_scalar = std::make_shared(past_token_len, zero); auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); indices = std::make_shared(past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64); - indices = std::make_shared(indices, one); } + indices = std::make_shared(indices, one); res = std::make_shared(reshaped_src1, indices, src0); } else { @@ -108,11 +106,9 @@ OutputVector translate_cpy(const NodeContext& context) { // 1D tensor of shape [token_len], values starting from past_token_len std::shared_ptr range_col; if (context.is_static()) { - range_col = past_token_len_scalar.get_node_shared_ptr(); - range_col = std::make_shared( - range_col, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{0})); + range_col = past_token_len.get_node_shared_ptr(); } else { + auto past_token_len_scalar = std::make_shared(past_token_len, zero); auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); range_col = std::make_shared(past_token_len_scalar, total_token_len_scalar, diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 20ad5683b8d45..0d3190f6c1f4e 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -68,7 +69,7 @@ OutputVector translate_mulmat(const NodeContext& context) { std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end()); if (context.is_static()) { - attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]}); + attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); } src0_original_shape[token_dim] = -1; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 3e49081515902..fe46b8a794108 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,6 +1,7 @@ #include "utils.h" #include +#include #include #include #include @@ -70,15 +71,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::AnyMap config; if (device == "NPU") { config = { - {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"}, - {"NPU_USE_NPUW", "YES"}, - {"NPUW_DEVICES", "NPU"}, - {"NPUW_FOLD", "YES"}, - {"NPUW_DQ", "YES"}, - {"NPUW_FUNCALL_ASYNC", "YES"}, - {"NPUW_HOST_GATHER", "YES"}, - {"NPUW_WEIGHTS_BANK", "shared"}, - // {"NPU_COMPILER_TYPE", "MLIR"}, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, + { "NPU_USE_NPUW", "YES" }, + { "NPUW_DEVICES", "NPU" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_HOST_GATHER", "YES" }, + { "NPUW_DQ", "YES" }, + { "NPUW_FUNCALL_ASYNC", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, + // Option 'CACHE_DIR' is not supported with MLIR compiler type + // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, + { "NPU_COMPILER_TYPE", "MLIR" }, }; } @@ -102,15 +105,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c int64_t conversion_end_time; int64_t compile_end_time; + bool is_first_token = is_prefill(cgraph); + auto it = compiled_cache_prefill.find(cgraph); - bool is_first_token = it == compiled_cache_prefill.end(); - if (!is_first_token) { + if (it != compiled_cache_prefill.end()) { ggml_decoder = get_ggml_decoder(cgraph, is_static, false); decoder_end_time = ggml_time_us(); if (is_static) { - model = compiled_cache_kvcache[cgraph].first; - compiled_model = compiled_cache_kvcache[cgraph].second; + if (is_first_token) { + model = compiled_cache_prefill[cgraph].first; + compiled_model = compiled_cache_prefill[cgraph].second; + } else { + model = compiled_cache_kvcache[cgraph].first; + compiled_model = compiled_cache_kvcache[cgraph].second; + } } else { model = it->second.first; compiled_model = it->second.second; @@ -235,8 +244,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c } auto end_time = ggml_time_us(); - is_first_token = false; - if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("GGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); @@ -305,3 +312,20 @@ void set_zero_diagonal(std::vector& matrix, size_t dim) { matrix[i * dim + i] = 0.0f; } } + +bool is_prefill(struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto * op = cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + auto* src = op->src[j]; + if (src == nullptr) { + break; + } + if (std::string(src->name) == "inp_tokens") { + return src->ne[0] != 1; + } + } + } + GGML_LOG_ERROR("is_prefill: inp_tokens not found in cgraph"); + throw std::runtime_error("is_prefill: inp_tokens not found in cgraph"); +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 000c2b87c1b3f..2427b0b1ce255 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -2,6 +2,7 @@ #include "ggml-backend-impl.h" #include "ggml-decoder.h" +#include "ggml-impl.h" enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); @@ -35,3 +36,5 @@ std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p } void set_zero_diagonal(std::vector& matrix, size_t dim); + +bool is_prefill(struct ggml_cgraph * cgraph); From 2dbd96c36b4234fc19e53ebb0aff1445c14357f3 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 4 Jun 2025 17:22:50 +0800 Subject: [PATCH 077/156] Change due to ggml cgraph changes, not correct yet --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 ++++++++++ ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 9 ++++----- ggml/src/ggml-openvino/openvino/op/permute.cpp | 17 +++++++++++++---- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 66f82773e30d4..2a95c894f4af5 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -187,6 +187,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { case GGML_OP_MUL_MAT: { if (node->src[0]->view_src == nullptr) { m_op_case = 1; + } else if (std::string(node->src[0]->name).find("cache_k") == 0) { + m_op_case = 2; + } else if (std::string(node->src[0]->name).find("cache_v") == 0) { + m_op_case = 3; + } + break; + } + case GGML_OP_PERMUTE: { + if (ggml_is_contiguous(node->src[0])) { + m_op_case = 1; } else { m_op_case = 2; } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 0d3190f6c1f4e..728ee5cb5ff17 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -24,7 +24,7 @@ OutputVector translate_mulmat(const NodeContext& context) { num_inputs_check(context, 2, 2); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported MULMAT case"); ov::Output res; @@ -59,8 +59,7 @@ OutputVector translate_mulmat(const NodeContext& context) { auto src0 = context.get_input(0); auto src0_shape = context.get_input_shape(0).to_shape(); auto src0_stride = context.get_input_stride(0); - auto permuted = is_permuted(src0_stride); - auto token_dim = permuted ? 0 : 2; + auto token_dim = op_case == 2 ? 0 : 2; auto attention_size = context.get_input("attention_size"); @@ -81,7 +80,7 @@ OutputVector translate_mulmat(const NodeContext& context) { auto src0_reshape = std::make_shared(src0, src0_reshape_shape, false); std::shared_ptr slice_end; - if (permuted) { + if (op_case == 2) { slice_end = std::make_shared( ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape)}, 0); @@ -94,7 +93,7 @@ OutputVector translate_mulmat(const NodeContext& context) { auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1)); auto src0_slice = std::make_shared(src0_reshape, slice_start, slice_end, slice_step); - if (permuted) { + if (op_case == 2) { B = std::make_shared( src0_slice, ov::op::v0::Constant::create(ov::element::i64, {src0_perm.size()}, src0_perm)); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 649cf8f3e1cc6..8e91b6120168e 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -12,10 +12,19 @@ namespace op { OutputVector translate_permute(const NodeContext& context) { num_inputs_check(context, 1, 1); - auto perm = argsort_descend(context.get_output_stride(0)); - auto res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); - return rename_outputs_with_suffix({res}, context.get_name()); + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + ov::Output res; + + if (op_case == 1) { + auto perm = argsort_descend(context.get_output_stride(0)); + auto res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); + return rename_outputs_with_suffix({res}, context.get_name()); + } else { + auto res = context.get_input(0); + return {res}; + } } } // namespace op From 00ed1e020cc68aa7e07bce8aa5fc212e7ab991d0 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 16 Jun 2025 11:46:40 +0800 Subject: [PATCH 078/156] Change due to ggml cgraph changes, llama-3.2 CPU work --- ggml/src/ggml-openvino/ggml-decoder.cpp | 9 ++- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 69 +------------------ .../src/ggml-openvino/openvino/op/permute.cpp | 53 +++++++++++++- ggml/src/ggml-openvino/utils.cpp | 1 + 4 files changed, 60 insertions(+), 72 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2a95c894f4af5..7b4456c8d08bf 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -195,10 +195,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { break; } case GGML_OP_PERMUTE: { - if (ggml_is_contiguous(node->src[0])) { + if (node->src[0]->view_src == nullptr) { + // Permute Qcur m_op_case = 1; - } else { + } else if (ggml_is_contiguous(node->src[0])) { + // Permute cache_k (view) m_op_case = 2; + } else { + // Permute cache_v (view) + m_op_case = 3; } break; } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 728ee5cb5ff17..b94f327a1f273 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -34,75 +34,10 @@ OutputVector translate_mulmat(const NodeContext& context) { auto result_lp = std::make_shared(src1, src0, false, true); res = std::make_shared(result_lp, context.get_output_type(0)); } else { - /* - Two cases here: - - 21: [ 96, 32, 32, 1] VIEW k-0 [ 2, 6144, 192, 6144] - [ 196608, 1, 1, 1] 0: NONE cache_k_l0 [ 2, 393216, 393216, 393216] - - 22: [ 96, 7, 32, 1] PERMUTE q-0 [ 4, 12288, 384, 86016] - [ 96, 32, 7, 1] 0: SCALE Qcur-0 [ 4, 384, 12288, 86016] - - 23: [ 32, 7, 32, 1] MUL_MAT kq-0 [ 4, 128, 896, 28672] - [ 96, 32, 32, 1] 0: VIEW k-0 [ 2, 6144, 192, 6144] - [ 96, 7, 32, 1] 1: PERMUTE q-0 [ 4, 12288, 384, 86016] + ov::Output B = context.get_input(0); + ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); - - 20: [ 32, 96, 32, 1] VIEW v-0 [ 2, 128, 12288, 393216] - [ 196608, 1, 1, 1] 0: NONE cache_v_l0 [ 2, 393216, 393216, 393216] - - 25: [ 96, 7, 32, 1] MUL_MAT kqv-0 [ 4, 384, 2688, 86016] - [ 32, 96, 32, 1] 0: VIEW v-0 [ 2, 128, 12288, 393216] - [ 32, 7, 32, 1] 1: SOFT_MAX kq_soft_max_ext-0 [ 4, 128, 896, 28672] - - For case 1, for src0, Reshape + Slice + Transpose - For case 2, for src0, Reshape + Slice - */ - ov::Output A; - ov::Output B; - - auto src0 = context.get_input(0); auto src0_shape = context.get_input_shape(0).to_shape(); - auto src0_stride = context.get_input_stride(0); - auto token_dim = op_case == 2 ? 0 : 2; - - auto attention_size = context.get_input("attention_size"); - - auto src0_perm = argsort_descend(src0_stride); - auto src0_original_shape_ = permute(src0_shape, src0_perm); - std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end()); - - if (context.is_static()) { - attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); - } - src0_original_shape[token_dim] = -1; - - auto src0_slice_shape = src0_original_shape; - src0_slice_shape.erase(src0_slice_shape.begin() + token_dim); - - auto src0_reshape_shape = - ov::op::v0::Constant::create(ov::element::i64, {src0_original_shape.size()}, src0_original_shape); - auto src0_reshape = std::make_shared(src0, src0_reshape_shape, false); - - std::shared_ptr slice_end; - if (op_case == 2) { - slice_end = std::make_shared( - ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape)}, - 0); - } else { - slice_end = std::make_shared( - ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape), attention_size}, - 0); - } - auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 0)); - auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1)); - auto src0_slice = std::make_shared(src0_reshape, slice_start, slice_end, slice_step); - - if (op_case == 2) { - B = std::make_shared( - src0_slice, - ov::op::v0::Constant::create(ov::element::i64, {src0_perm.size()}, src0_perm)); - } else { - B = src0_slice; - } - - A = std::make_shared(context.get_input(1), context.get_input_type(0)); - int64_t num_heads = context.get_input_shape(1).to_shape()[0]; int64_t num_heads_kv = src0_shape[0]; int64_t kv_num_heads_factor = num_heads / num_heads_kv; diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 8e91b6120168e..8b246f75cd137 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -1,4 +1,11 @@ +#include +#include +#include +#include +#include #include +#include +#include #include #include "../node_context.hpp" @@ -13,7 +20,7 @@ OutputVector translate_permute(const NodeContext& context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); ov::Output res; if (op_case == 1) { @@ -22,8 +29,48 @@ OutputVector translate_permute(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); return rename_outputs_with_suffix({res}, context.get_name()); } else { - auto res = context.get_input(0); - return {res}; + auto src = context.get_input(0); + auto attention_size = context.get_input("attention_size"); + if (context.is_static()) { + attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); + } + + auto src_shape_ = context.get_input_shape(0).to_shape(); + std::vector src_shape(src_shape_.begin(), src_shape_.end()); + + std::shared_ptr src_reshaped; + if (op_case == 2) { + src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), + false); + } else { + src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{src_shape[1], src_shape[0], -1}), + false); + } + + auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 0)); + auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1)); + std::shared_ptr slice_end; + if (op_case == 2) { + slice_end = std::make_shared( + ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[2]})}, + 0); + } else { + slice_end = std::make_shared( + ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[0]}), attention_size}, + 0); + } + auto src_slice = std::make_shared(src_reshaped, slice_start, slice_end, slice_step); + + if (op_case == 2) { + res = std::make_shared(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + } else { + res = src_slice; + } + return rename_outputs_with_suffix({res}, context.get_name()); } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index fe46b8a794108..44356209ceb87 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -262,6 +262,7 @@ size_t checksum(const void* data, size_t size) { const uint8_t* bytes = static_cast(data); size_t sum = 0; for (size_t i = 0; i < size; ++i) { + sum += (uint8_t)i; sum += bytes[i]; } return sum; From 1d8b1a2b247af86679f161750ef133c00f327943 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 16 Jun 2025 13:19:51 +0800 Subject: [PATCH 079/156] Add AMD64 to CMakeLists --- ggml/src/ggml-openvino/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt index 08712c1527a89..216aa756a7a96 100644 --- a/ggml/src/ggml-openvino/CMakeLists.txt +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -12,7 +12,7 @@ target_link_libraries(ggml-openvino PRIVATE openvino::runtime) if (GGML_OPENVINO) if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") - elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") else() message(FATAL_ERROR "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}") endif() From 0fe973d65cd6097a2acac8db3d8e5de4d348791b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 16 Jun 2025 13:20:11 +0800 Subject: [PATCH 080/156] Change due to ggml cgraph changes, all device work --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 7b4456c8d08bf..7b62f4487c73a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -216,9 +216,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { void GgmlOvDecoder::set_max_token_len() { for (int i = 0; i < m_cgraph->n_nodes; i++) { auto* node = m_cgraph->nodes[i]; - if (std::string(node->name) == "k-0") { + if (std::string(node->name) == "cache_k_l0 (view)") { auto* cache_k = node->src[0]; - m_max_token_len = cache_k->ne[0] / node->ne[0] / node->ne[2]; + m_max_token_len = cache_k->ne[1]; break; } } From 6b14c7a148016288e4625a90dfc7a6a15e05eaae Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 20 Jun 2025 16:41:42 +0800 Subject: [PATCH 081/156] Refactor: clean, fix warning --- examples/simple/simple.cpp | 2 +- ggml/CMakeLists.txt | 2 - ggml/src/ggml-openvino/.clang-format | 4 + ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +- ggml/src/ggml-openvino/ggml-decoder.h | 4 +- ggml/src/ggml-openvino/openvino/op/add.cpp | 22 -- ggml/src/ggml-openvino/openvino/op/cont.cpp | 1 + ggml/src/ggml-openvino/openvino/op/cpy.cpp | 1 + .../ggml-openvino/openvino/op/get_rows.cpp | 1 + ggml/src/ggml-openvino/openvino/op/mul.cpp | 21 -- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 1 + .../src/ggml-openvino/openvino/op/permute.cpp | 8 +- .../src/ggml-openvino/openvino/op/reshape.cpp | 1 + .../ggml-openvino/openvino/op/rms_norm.cpp | 1 + ggml/src/ggml-openvino/openvino/op/rope.cpp | 20 +- ggml/src/ggml-openvino/openvino/op/scale.cpp | 1 + .../ggml-openvino/openvino/op/soft_max.cpp | 11 +- .../ggml-openvino/openvino/op/transpose.cpp | 1 + .../ggml-openvino/openvino/op/unary_silu.cpp | 1 + ggml/src/ggml-openvino/openvino/op/view.cpp | 1 + ggml/src/ggml-openvino/openvino/op_table.cpp | 64 ++---- ggml/src/ggml-openvino/openvino/op_table.hpp | 23 ++ ggml/src/ggml-openvino/openvino/utils.hpp | 10 +- ggml/src/ggml-openvino/utils.cpp | 196 ++++++++++-------- ggml/src/ggml-openvino/utils.h | 6 +- setup.sh | 2 - 26 files changed, 213 insertions(+), 199 deletions(-) create mode 100644 ggml/src/ggml-openvino/.clang-format delete mode 100644 ggml/src/ggml-openvino/openvino/op/add.cpp delete mode 100644 ggml/src/ggml-openvino/openvino/op/mul.cpp delete mode 100755 setup.sh diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 9e6c678e830aa..d09771d10457f 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -140,7 +140,7 @@ int main(int argc, char ** argv) { std::string s(buf, n); printf("%s", s.c_str()); } - printf("\n"); + // prepare a batch for the prompt llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 8daee61276ebe..c8e31143e0e2a 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -245,8 +245,6 @@ set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING "ggml: sycl device architecture") option(GGML_OPENVINO "ggml: use OPENVINO" OFF) -option(GGML_OPENVINO_DEBUG "ggml: enable OPENVINO debugging" OFF) -option(GGML_OV_FRONTEND "ggml: OPENVINO frontend path" ON) option(GGML_OPENCL "ggml: use OpenCL" OFF) option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF) diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format new file mode 100644 index 0000000000000..8491f4e5c6d63 --- /dev/null +++ b/ggml/src/ggml-openvino/.clang-format @@ -0,0 +1,4 @@ +--- +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +ReferenceAlignment: Left diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 7b62f4487c73a..04f68a495020d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -225,9 +225,9 @@ void GgmlOvDecoder::set_max_token_len() { } void GgmlOvDecoder::add_extra_inputs() { - int64_t past_token_len; + int64_t past_token_len = -1; // attention_size not used for NPU - int64_t attention_size; + int64_t attention_size = -1; for (const auto& node : m_nodes) { if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { @@ -247,6 +247,9 @@ void GgmlOvDecoder::add_extra_inputs() { break; } } + if (past_token_len == -1) { + throw std::runtime_error("Failed to find input \"cache_k\" in the graph"); + } for (const auto& node : m_nodes) { if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) { int64_t total_token_len = node->src[1]->ne[0] + past_token_len; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 2c89d062676d8..b6b13d1f116c7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -61,11 +61,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual void visit_subgraph(std::function)> node_visitor) const override; - const ggml_tensor* get_input_ggml_tensor(std::string& name) const { + const ggml_tensor* get_input_ggml_tensor(const std::string& name) const { return m_inputs.at(name); } - const ggml_tensor* get_output_ggml_tensor(std::string& name) const { + const ggml_tensor* get_output_ggml_tensor(const std::string& name) const { return m_outputs.at(name); } diff --git a/ggml/src/ggml-openvino/openvino/op/add.cpp b/ggml/src/ggml-openvino/openvino/op/add.cpp deleted file mode 100644 index 5a75ff2148c83..0000000000000 --- a/ggml/src/ggml-openvino/openvino/op/add.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include - -#include "../node_context.hpp" -#include "../utils.hpp" - -namespace ov { -namespace frontend { -namespace ggml { -namespace op { - -OutputVector translate_add(const NodeContext& context) { - num_inputs_check(context, 2, 2); - - auto res = std::make_shared(context.get_input(0), context.get_input(1)); - - return rename_outputs_with_suffix({res}, context.get_name()); -} - -} // namespace op -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 7cdfba051ef50..5c6953caffe27 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -7,6 +7,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 49736450242a3..d27f4babb4c84 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -19,6 +19,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index ca36548d9fa5b..9ed5f4deaf047 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -7,6 +7,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op/mul.cpp b/ggml/src/ggml-openvino/openvino/op/mul.cpp deleted file mode 100644 index 40caf4331e6b5..0000000000000 --- a/ggml/src/ggml-openvino/openvino/op/mul.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include - -#include "../node_context.hpp" -#include "../utils.hpp" - -namespace ov { -namespace frontend { -namespace ggml { -namespace op { - -OutputVector translate_mul(const NodeContext& context) { - num_inputs_check(context, 2, 2); - - auto res = std::make_shared(context.get_input(0), context.get_input(1)); - return rename_outputs_with_suffix({res}, context.get_name()); -} - -} // namespace op -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index b94f327a1f273..d5a6ba2f0385f 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -13,6 +13,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 8b246f75cd137..09d15da42718c 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -9,6 +9,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { @@ -25,9 +26,8 @@ OutputVector translate_permute(const NodeContext& context) { if (op_case == 1) { auto perm = argsort_descend(context.get_output_stride(0)); - auto res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); - return rename_outputs_with_suffix({res}, context.get_name()); + res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, { 3 }, perm)); } else { auto src = context.get_input(0); auto attention_size = context.get_input("attention_size"); @@ -70,8 +70,8 @@ OutputVector translate_permute(const NodeContext& context) { } else { res = src_slice; } - return rename_outputs_with_suffix({res}, context.get_name()); } + return rename_outputs_with_suffix({ res }, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 49551eb81551c..3a695683bfafb 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -8,6 +8,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 4b230ad630bfa..211692a3c706c 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -7,6 +7,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index b47b8a6a54bde..78523e5781bfd 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -20,6 +20,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" #ifndef M_PI @@ -36,21 +37,19 @@ namespace frontend { namespace ggml { namespace op { -static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { - return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +namespace { +float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); } -void ggml_rope_yarn_corr_dims(int n_dims, - int n_ctx_orig, - float freq_base, - float beta_fast, - float beta_slow, +void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]) { float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); dims[0] = MAX(0, start); dims[1] = MIN(n_dims - 1, end); } +} // namespace OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); @@ -67,7 +66,12 @@ OutputVector translate_rope(const NodeContext& context) { auto output_shape = context.get_output_shape(0); - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; int32_t* op_params = context.get_output_op_params(0); const int n_dims = op_params[1]; const int mode = op_params[2]; diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index 8f0999432ce6f..783440ebd967e 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -3,6 +3,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index bb6b0023953f2..aeca9b3be59f3 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -1,5 +1,3 @@ - -#include #include #include #include @@ -13,6 +11,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { @@ -28,18 +27,18 @@ OutputVector translate_soft_max(const NodeContext& context) { float scale = 1.0f; float max_bias = 0.0f; - auto op_params = context.get_output_op_params(0); + auto * op_params = context.get_output_op_params(0); memcpy(&scale, (float*)op_params + 0, sizeof(float)); memcpy(&max_bias, (float*)op_params + 1, sizeof(float)); - const uint32_t n_head = context.get_input_shape(0)[0].get_length(); - const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head)); + // const uint32_t n_head = context.get_input_shape(0)[0].get_length(); + // const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head)); // const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); // const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const float slope = (max_bias > 0.0f) ? 1.0f : 1.0f; // const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) // : 1.0f; + const float slope = 1.0; if (scale != 1.0f) { auto scale_node = diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index 99178a1944bc0..b35f1fb8610ea 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -1,6 +1,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp index 6c73653ca4dba..2b27c0be1227c 100644 --- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -3,6 +3,7 @@ #include #include "../node_context.hpp" +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index fcfb9f732c581..58143e667cc6f 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -1,3 +1,4 @@ +#include "../op_table.hpp" #include "../utils.hpp" namespace ov { diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index d588b2bff0e9b..11d1c773c3476 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -9,55 +9,31 @@ #include "utils.hpp" -using namespace ov::op; namespace ov { namespace frontend { namespace ggml { -namespace op { - -#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& node) - -GGML_OP_CONVERTER(translate_add); -GGML_OP_CONVERTER(translate_cont); -GGML_OP_CONVERTER(translate_cpy); -GGML_OP_CONVERTER(translate_get_rows); -GGML_OP_CONVERTER(translate_mul); -GGML_OP_CONVERTER(translate_mulmat); -GGML_OP_CONVERTER(translate_permute); -GGML_OP_CONVERTER(translate_reshape); -GGML_OP_CONVERTER(translate_rms_norm); -GGML_OP_CONVERTER(translate_rope); -GGML_OP_CONVERTER(translate_scale); -GGML_OP_CONVERTER(translate_unary_silu); -GGML_OP_CONVERTER(translate_soft_max); -GGML_OP_CONVERTER(translate_transpose); -GGML_OP_CONVERTER(translate_unary); -GGML_OP_CONVERTER(translate_view); - -} // namespace op - std::unordered_map get_supported_ops() { - return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs}, - {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs}, - {"GGML_OP_CONT", op::translate_cont}, - {"GGML_OP_CPY", op::translate_cpy}, - {"GGML_OP_DIV", op::translate_1to1_match_2_inputs}, - {"GGML_OP_GET_ROWS", op::translate_get_rows}, - // {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, - {"GGML_OP_MUL", op::translate_mul}, - {"GGML_OP_MUL_MAT", op::translate_mulmat}, - {"GGML_OP_PERMUTE", op::translate_permute}, - {"GGML_OP_RESHAPE", op::translate_reshape}, - {"GGML_OP_RMS_NORM", op::translate_rms_norm}, - {"GGML_OP_ROPE", op::translate_rope}, - {"GGML_OP_SCALE", op::translate_scale}, - {"GGML_OP_SOFT_MAX", op::translate_soft_max}, - {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, - {"GGML_OP_TRANSPOSE", op::translate_transpose}, - {"GGML_UNARY_OP_SILU", op::translate_unary_silu}, - {"GGML_OP_VIEW", op::translate_view}}; -}; + using namespace ov::op; + return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs}, + {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs}, + {"GGML_OP_CONT", op::translate_cont}, + {"GGML_OP_CPY", op::translate_cpy}, + {"GGML_OP_DIV", op::translate_1to1_match_2_inputs}, + {"GGML_OP_GET_ROWS", op::translate_get_rows}, + {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, + {"GGML_OP_MUL_MAT", op::translate_mulmat}, + {"GGML_OP_PERMUTE", op::translate_permute}, + {"GGML_OP_RESHAPE", op::translate_reshape}, + {"GGML_OP_RMS_NORM", op::translate_rms_norm}, + {"GGML_OP_ROPE", op::translate_rope}, + {"GGML_OP_SCALE", op::translate_scale}, + {"GGML_OP_SOFT_MAX", op::translate_soft_max}, + {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, + {"GGML_OP_TRANSPOSE", op::translate_transpose}, + {"GGML_UNARY_OP_SILU", op::translate_unary_silu}, + {"GGML_OP_VIEW", op::translate_view}}; +} } // namespace ggml } // namespace frontend diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index 1a71a06c181ff..d576c2a1357e1 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -6,6 +6,29 @@ namespace ov { namespace frontend { namespace ggml { +namespace op { + +#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& context) + +GGML_OP_CONVERTER(translate_add); +GGML_OP_CONVERTER(translate_cont); +GGML_OP_CONVERTER(translate_cpy); +GGML_OP_CONVERTER(translate_get_rows); +GGML_OP_CONVERTER(translate_mul); +GGML_OP_CONVERTER(translate_mulmat); +GGML_OP_CONVERTER(translate_permute); +GGML_OP_CONVERTER(translate_reshape); +GGML_OP_CONVERTER(translate_rms_norm); +GGML_OP_CONVERTER(translate_rope); +GGML_OP_CONVERTER(translate_scale); +GGML_OP_CONVERTER(translate_unary_silu); +GGML_OP_CONVERTER(translate_soft_max); +GGML_OP_CONVERTER(translate_transpose); +GGML_OP_CONVERTER(translate_unary); +GGML_OP_CONVERTER(translate_view); + +} // namespace op + std::unordered_map get_supported_ops(); } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp index e0fe25078992d..1896f814273b2 100644 --- a/ggml/src/ggml-openvino/openvino/utils.hpp +++ b/ggml/src/ggml-openvino/openvino/utils.hpp @@ -8,7 +8,9 @@ namespace ov { namespace frontend { namespace ggml { -void dump_ov_model(const std::shared_ptr model); +std::string getCurrentTime(); + +void dump_ov_model(std::shared_ptr model); void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs); @@ -52,7 +54,8 @@ std::vector permute(const std::vector& x, const std::vector& perm) { return result; } -std::shared_ptr get_dimensions(const std::shared_ptr& shape, const std::vector& dims); +std::shared_ptr get_dimensions(const std::shared_ptr& shape, + const std::vector& dims); std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims); OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix); @@ -61,7 +64,8 @@ namespace op { template OutputVector translate_1to1_match_2_inputs(const NodeContext& context) { num_inputs_check(context, 2, 2); - return {std::make_shared(context.get_input(0), context.get_input(1))}; + auto res = std::make_shared(context.get_input(0), context.get_input(1)); + return rename_outputs_with_suffix({ res }, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 44356209ceb87..ebcf8fdd75d93 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -27,13 +27,15 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool return std::make_shared(nullptr, cgraph, is_static, is_first_token); } -ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) { - auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - ov::Tensor input_tensor; - ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - std::vector input_stride = ggml_decoder->get_input_stride(name); - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - return input_tensor; +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, + const std::string& name) { + auto *input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + ov::Tensor input_tensor; + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + input_tensor = + ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + return input_tensor; } std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { @@ -59,30 +61,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; if (device.empty()) { - // Prefer GPU over CPU - for (const auto& dev : core.get_available_devices()) { - device = dev; - if (device == "GPU") - break; + const std::vector preferred_device = {"GPU", "CPU", "NPU"}; + const auto available_devices = core.get_available_devices(); + for (const auto& dev : preferred_device) { + if (std::find(available_devices.begin(), available_devices.end(), + dev) != available_devices.end()) { + device = dev; + break; } + } } bool is_static = device == "NPU" ? true : false; ov::AnyMap config; if (device == "NPU") { - config = { - { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, - { "NPU_USE_NPUW", "YES" }, - { "NPUW_DEVICES", "NPU" }, - { "NPUW_FOLD", "YES" }, - { "NPUW_HOST_GATHER", "YES" }, - { "NPUW_DQ", "YES" }, - { "NPUW_FUNCALL_ASYNC", "YES" }, - { "NPUW_WEIGHTS_BANK", "shared" }, - // Option 'CACHE_DIR' is not supported with MLIR compiler type - // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, - { "NPU_COMPILER_TYPE", "MLIR" }, - }; + config = get_npu_config(); } auto start_time = ggml_time_us(); @@ -179,48 +172,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); - ov::Tensor input_tensor; - - if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { - input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); - - } else if (!is_static) { - input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); - - } else { - if (param_name == "inp_tokens" || param_name == "inp_pos") { - if (is_first_token) { - size_t max_token_len = ggml_decoder->get_max_token_len(); - const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); - std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, 0); - input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len}); - auto* data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { - input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); - } - - } else if (param_name == "KQ_mask") { - size_t max_token_len = ggml_decoder->get_max_token_len(); - const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); - if (is_first_token) { - std::vector padded_data = - pad_input(input_tensor_ggml, max_token_len, max_token_len, -INFINITY); - set_zero_diagonal(padded_data, max_token_len); - input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, max_token_len, max_token_len}); - auto* data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { - std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); - input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len}); - auto* data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } - - } else { - input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); - } - } + auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); infer_request.set_input_tensor(i, input_tensor); if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { @@ -258,6 +210,80 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_UNUSED(backend); } +ov::AnyMap get_npu_config() { + ov::AnyMap config = { + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, + { "NPU_USE_NPUW", "YES" }, + { "NPUW_DEVICES", "NPU" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_HOST_GATHER", "YES" }, + { "NPUW_DQ", "YES" }, + { "NPUW_FUNCALL_ASYNC", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, + // Option 'CACHE_DIR' is not supported with MLIR compiler type + // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, + { "NPU_COMPILER_TYPE", "MLIR" }, + }; + return config; +} + +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, + const std::string& param_name) { + bool is_static = ggml_decoder->is_static(); + bool is_first_token = ggml_decoder->is_first_token(); + + ov::Tensor input_tensor; + if (ggml_decoder->get_model_extra_inputs().find(param_name) != + ggml_decoder->get_model_extra_inputs().end()) { + input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); + + } else if (!is_static) { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + + } else { + if (param_name == "inp_tokens" || param_name == "inp_pos") { + if (is_first_token) { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto *input_tensor_ggml = + ggml_decoder->get_input_ggml_tensor(param_name); + std::vector padded_data = + pad_input(input_tensor_ggml, 1, max_token_len, 0); + input_tensor = + ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len}); + auto *data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + } + + } else if (param_name == "KQ_mask") { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto *input_tensor_ggml = + ggml_decoder->get_input_ggml_tensor(param_name); + if (is_first_token) { + std::vector padded_data = pad_input( + input_tensor_ggml, max_token_len, max_token_len, -INFINITY); + set_zero_diagonal(padded_data, max_token_len); + input_tensor = ov::Tensor(ov::element::f32, + ov::Shape{1, max_token_len, max_token_len}); + auto *data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + std::vector padded_data = + pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); + input_tensor = + ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len}); + auto *data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } + + } else { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + } + } + return input_tensor; +} + size_t checksum(const void* data, size_t size) { const uint8_t* bytes = static_cast(data); size_t sum = 0; @@ -268,22 +294,27 @@ size_t checksum(const void* data, size_t size) { return sum; } +// Suppress deprecation warning for ov::Tensor::data() +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) { std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { case ov::element::f32: - std::cout << *(float*)(tensor.data()) << std::endl; - break; + std::cout << *(tensor.data()) << std::endl; + break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl; - break; + std::cout << ov::float16::from_bits(*(tensor.data())) + << std::endl; + break; case ov::element::i32: - std::cout << *(int32_t*)(tensor.data()) << std::endl; - break; + std::cout << *(tensor.data()) << std::endl; + break; case ov::element::i64: - std::cout << *(int64_t*)(tensor.data()) << std::endl; - break; + std::cout << *(tensor.data()) << std::endl; + break; default: break; } @@ -296,18 +327,21 @@ void print_output_tensor_info(const std::string& name, << ", Address: " << output_dst[name] << std::endl; switch (tensor.get_element_type()) { case ov::element::f32: - std::cout << *(float*)(tensor.data()) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; + std::cout << *(tensor.data()) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; + std::cout << ov::float16::from_bits(*(tensor.data())) + << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; default: break; } } +#pragma GCC diagnostic pop + void set_zero_diagonal(std::vector& matrix, size_t dim) { for (size_t i = 0; i < dim; ++i) { matrix[i * dim + i] = 0.0f; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 2427b0b1ce255..1d23e285227e6 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -8,7 +8,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); -ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name); +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name); std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder); @@ -38,3 +38,7 @@ std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p void set_zero_diagonal(std::vector& matrix, size_t dim); bool is_prefill(struct ggml_cgraph * cgraph); + +ov::AnyMap get_npu_config(); + +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); diff --git a/setup.sh b/setup.sh deleted file mode 100755 index 697639dd143c3..0000000000000 --- a/setup.sh +++ /dev/null @@ -1,2 +0,0 @@ -cmake --build build --parallel $(nproc) - From fa18f09100f7566ca38ca48ba4996c95823018a3 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 23 Jun 2025 11:56:36 +0800 Subject: [PATCH 082/156] Update clang-format --- ggml/src/ggml-openvino/.clang-format | 157 +++++++++++++++ ggml/src/ggml-openvino/openvino/op_table.cpp | 40 ++-- ggml/src/ggml-openvino/utils.cpp | 193 +++++++++---------- 3 files changed, 267 insertions(+), 123 deletions(-) diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 8491f4e5c6d63..9382a117b86da 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -1,4 +1,161 @@ --- +# Override root .clang-format AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false ReferenceAlignment: Left +PointerAlignment: Left + +Language: Cpp +AlignAfterOpenBracket: Align +AlignArrayOfStructures: Left +AlignConsecutiveBitFields: AcrossComments +AlignConsecutiveMacros: AcrossComments +# AlignConsecutiveShortCaseStatements: AcrossComments +AlignEscapedNewlines: Left # LeftWithLastLine +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Inline +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: true +BinPackArguments: true +BinPackParameters: true # OnePerLine +BitFieldColonSpacing: Both +BreakBeforeBraces: Custom # Attach +BraceWrapping: + AfterCaseLabel: true + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +# BreakAdjacentStringLiterals: true +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: None +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: false +# BreakBinaryOperations: Never +BreakConstructorInitializers: AfterColon +# BreakFunctionDefinitionParameters: false +BreakInheritanceList: AfterComma +BreakStringLiterals: true +# BreakTemplateDeclarations: Yes +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: Leave +EmptyLineAfterAccessModifier: Never +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: true +IndentExternBlock: NoIndent +IndentGotoLabels: false +IndentPPDirectives: AfterHash +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: true # NOTE: may lead to incorrect formatting +InsertNewlineAtEOF: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PPIndentWidth: -1 +PackConstructorInitializers: CurrentLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +QualifierAlignment: Left +#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' +ReflowComments: false # IndentOnly +SeparateDefinitionBlocks: Always +SortIncludes: CaseInsensitive +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: c++17 +TabWidth: 4 +UseTab: Never +WhitespaceSensitiveMacros: ['STRINGIZE'] +... diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index 11d1c773c3476..bf7d54d9a161e 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -14,25 +14,27 @@ namespace frontend { namespace ggml { std::unordered_map get_supported_ops() { - using namespace ov::op; - return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs}, - {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs}, - {"GGML_OP_CONT", op::translate_cont}, - {"GGML_OP_CPY", op::translate_cpy}, - {"GGML_OP_DIV", op::translate_1to1_match_2_inputs}, - {"GGML_OP_GET_ROWS", op::translate_get_rows}, - {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, - {"GGML_OP_MUL_MAT", op::translate_mulmat}, - {"GGML_OP_PERMUTE", op::translate_permute}, - {"GGML_OP_RESHAPE", op::translate_reshape}, - {"GGML_OP_RMS_NORM", op::translate_rms_norm}, - {"GGML_OP_ROPE", op::translate_rope}, - {"GGML_OP_SCALE", op::translate_scale}, - {"GGML_OP_SOFT_MAX", op::translate_soft_max}, - {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, - {"GGML_OP_TRANSPOSE", op::translate_transpose}, - {"GGML_UNARY_OP_SILU", op::translate_unary_silu}, - {"GGML_OP_VIEW", op::translate_view}}; + using namespace ov::op; + return { + { "GGML_OP_ADD", op::translate_1to1_match_2_inputs }, + { "GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, + { "GGML_OP_CONT", op::translate_cont }, + { "GGML_OP_CPY", op::translate_cpy }, + { "GGML_OP_DIV", op::translate_1to1_match_2_inputs }, + { "GGML_OP_GET_ROWS", op::translate_get_rows }, + { "GGML_OP_MUL", op::translate_1to1_match_2_inputs }, + { "GGML_OP_MUL_MAT", op::translate_mulmat }, + { "GGML_OP_PERMUTE", op::translate_permute }, + { "GGML_OP_RESHAPE", op::translate_reshape }, + { "GGML_OP_RMS_NORM", op::translate_rms_norm }, + { "GGML_OP_ROPE", op::translate_rope }, + { "GGML_OP_SCALE", op::translate_scale }, + { "GGML_OP_SOFT_MAX", op::translate_soft_max }, + { "GGML_OP_SUB", op::translate_1to1_match_2_inputs }, + { "GGML_OP_TRANSPOSE", op::translate_transpose }, + { "GGML_UNARY_OP_SILU", op::translate_unary_silu }, + { "GGML_OP_VIEW", op::translate_view } + }; } } // namespace ggml diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index ebcf8fdd75d93..d20e671064cc2 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -27,15 +27,13 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool return std::make_shared(nullptr, cgraph, is_static, is_first_token); } -ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, - const std::string& name) { - auto *input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - ov::Tensor input_tensor; - ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - std::vector input_stride = ggml_decoder->get_input_stride(name); - input_tensor = - ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - return input_tensor; +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name) { + auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + ov::Tensor input_tensor; + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + return input_tensor; } std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { @@ -61,21 +59,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; if (device.empty()) { - const std::vector preferred_device = {"GPU", "CPU", "NPU"}; - const auto available_devices = core.get_available_devices(); - for (const auto& dev : preferred_device) { - if (std::find(available_devices.begin(), available_devices.end(), - dev) != available_devices.end()) { - device = dev; - break; + const std::vector preferred_device = { "GPU", "CPU", "NPU" }; + const auto available_devices = core.get_available_devices(); + for (const auto& dev : preferred_device) { + if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) { + device = dev; + break; + } } - } } bool is_static = device == "NPU" ? true : false; ov::AnyMap config; if (device == "NPU") { - config = get_npu_config(); + config = get_npu_config(); } auto start_time = ggml_time_us(); @@ -107,10 +104,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (is_static) { if (is_first_token) { - model = compiled_cache_prefill[cgraph].first; + model = compiled_cache_prefill[cgraph].first; compiled_model = compiled_cache_prefill[cgraph].second; } else { - model = compiled_cache_kvcache[cgraph].first; + model = compiled_cache_kvcache[cgraph].first; compiled_model = compiled_cache_kvcache[cgraph].second; } } else { @@ -141,7 +138,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); + auto timestamp = (long long) ggml_time_us(); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); ov::serialize(model, timestamped_filename); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp); @@ -161,7 +158,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); + auto timestamp = (long long) ggml_time_us(); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); ov::serialize(model, timestamped_filename); } @@ -227,68 +224,59 @@ ov::AnyMap get_npu_config() { return config; } -ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, - const std::string& param_name) { - bool is_static = ggml_decoder->is_static(); - bool is_first_token = ggml_decoder->is_first_token(); - - ov::Tensor input_tensor; - if (ggml_decoder->get_model_extra_inputs().find(param_name) != - ggml_decoder->get_model_extra_inputs().end()) { - input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); - - } else if (!is_static) { - input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); - - } else { - if (param_name == "inp_tokens" || param_name == "inp_pos") { - if (is_first_token) { - size_t max_token_len = ggml_decoder->get_max_token_len(); - const auto *input_tensor_ggml = - ggml_decoder->get_input_ggml_tensor(param_name); - std::vector padded_data = - pad_input(input_tensor_ggml, 1, max_token_len, 0); - input_tensor = - ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len}); - auto *data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name) { + bool is_static = ggml_decoder->is_static(); + bool is_first_token = ggml_decoder->is_first_token(); + + ov::Tensor input_tensor; + if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { + input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); + + } else if (!is_static) { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); - } - - } else if (param_name == "KQ_mask") { - size_t max_token_len = ggml_decoder->get_max_token_len(); - const auto *input_tensor_ggml = - ggml_decoder->get_input_ggml_tensor(param_name); - if (is_first_token) { - std::vector padded_data = pad_input( - input_tensor_ggml, max_token_len, max_token_len, -INFINITY); - set_zero_diagonal(padded_data, max_token_len); - input_tensor = ov::Tensor(ov::element::f32, - ov::Shape{1, max_token_len, max_token_len}); - auto *data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { - std::vector padded_data = - pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); - input_tensor = - ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len}); - auto *data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } } else { - input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + if (param_name == "inp_tokens" || param_name == "inp_pos") { + if (is_first_token) { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, 0); + input_tensor = ov::Tensor(ov::element::i32, ov::Shape{ 1, 1, max_token_len }); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + } + + } else if (param_name == "KQ_mask") { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + if (is_first_token) { + std::vector padded_data = + pad_input(input_tensor_ggml, max_token_len, max_token_len, -INFINITY); + set_zero_diagonal(padded_data, max_token_len); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{ 1, max_token_len, max_token_len }); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{ 1, 1, max_token_len }); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } + + } else { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + } } - } - return input_tensor; + return input_tensor; } size_t checksum(const void* data, size_t size) { const uint8_t* bytes = static_cast(data); size_t sum = 0; for (size_t i = 0; i < size; ++i) { - sum += (uint8_t)i; + sum += (uint8_t) i; sum += bytes[i]; } return sum; @@ -302,41 +290,38 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data()) << std::endl; - break; - case ov::element::f16: - std::cout << ov::float16::from_bits(*(tensor.data())) - << std::endl; - break; - case ov::element::i32: - std::cout << *(tensor.data()) << std::endl; - break; - case ov::element::i64: - std::cout << *(tensor.data()) << std::endl; - break; - default: - break; + case ov::element::f32: + std::cout << *(tensor.data()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(tensor.data())) << std::endl; + break; + case ov::element::i32: + std::cout << *(tensor.data()) << std::endl; + break; + case ov::element::i64: + std::cout << *(tensor.data()) << std::endl; + break; + default: + break; } } -void print_output_tensor_info(const std::string& name, - const ov::Tensor& tensor, +void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, std::map& output_dst) { std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst[name] << std::endl; switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data()) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; - case ov::element::f16: - std::cout << ov::float16::from_bits(*(tensor.data())) - << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; - default: - break; + case ov::element::f32: + std::cout << *(tensor.data()) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(tensor.data())) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + default: + break; } } @@ -348,9 +333,9 @@ void set_zero_diagonal(std::vector& matrix, size_t dim) { } } -bool is_prefill(struct ggml_cgraph * cgraph) { +bool is_prefill(struct ggml_cgraph* cgraph) { for (int i = 0; i < cgraph->n_nodes; ++i) { - auto * op = cgraph->nodes[i]; + auto* op = cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; ++j) { auto* src = op->src[j]; if (src == nullptr) { From e2021de473653dae54a0022808cfe81cb7ed7c36 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 26 Jun 2025 13:54:06 +0800 Subject: [PATCH 083/156] Statful transformation for CPU GPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 104 +++++++++++------- ggml/src/ggml-openvino/ggml-decoder.h | 38 ++++--- ggml/src/ggml-openvino/openvino/decoder.hpp | 6 + ggml/src/ggml-openvino/openvino/op/cpy.cpp | 13 +-- .../openvino/translate_session.cpp | 69 +++++++++--- .../openvino/translate_session.hpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 100 ++++++++++------- 7 files changed, 214 insertions(+), 118 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 04f68a495020d..e30f026e36435 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -26,12 +26,13 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) - : m_cgraph(cgraph), - m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), - m_is_static(is_static), - m_is_first_token(is_first_token) { +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* m_cgraph, bool is_static, + bool is_first_token) : + m_cgraph(m_cgraph), + m_node(node), + m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), + m_is_static(is_static), + m_is_first_token(is_first_token) { static std::map> model_weights; if (m_node) { @@ -44,10 +45,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - dump_cgraph(m_cgraph); + std::string filename = "cgraph.txt"; + dump_cgraph(m_cgraph, filename); } - set_max_token_len(); + set_llm_params(); static bool weight_created = false; if (!weight_created) { @@ -105,33 +107,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { if (m_model_inputs.find(src_name) != m_model_inputs.end()) { continue; } - ov::PartialShape input_shape; - if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { - if (m_is_static) { - if (m_is_first_token) { - input_shape = ov::PartialShape{1, 1, m_max_token_len}; - } else { - input_shape = ov::PartialShape{1, 1, 1}; - } - } else { - input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; - } - } else if (std::string(src->name) == "KQ_mask") { - if (m_is_static) { - if (m_is_first_token) { - input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len}; - } else { - input_shape = ov::PartialShape{1, 1, m_max_token_len}; - } - } else { - auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); - input_shape = - ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)}; - } - } else { - input_shape = ov::Shape{get_shape(src)}; - } - auto param_node = std::make_shared(get_ov_type(src), input_shape); + auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); param_node->set_friendly_name(src_name); m_model_inputs[src_name] = param_node; } @@ -150,6 +126,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); if (it == m_model_output_names.end()) { m_model_output_names.push_back(name); + m_kv_names.push_back(name); } } } @@ -213,15 +190,52 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { } } -void GgmlOvDecoder::set_max_token_len() { +void GgmlOvDecoder::set_llm_params() { for (int i = 0; i < m_cgraph->n_nodes; i++) { auto* node = m_cgraph->nodes[i]; - if (std::string(node->name) == "cache_k_l0 (view)") { + if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") { auto* cache_k = node->src[0]; m_max_token_len = cache_k->ne[1]; - break; + } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") { + m_head_size = node->ne[0]; + m_num_heads = node->ne[1]; + } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") { + m_num_heads_kv = node->ne[1]; + } + } +} + +ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const { + ov::PartialShape input_shape; + if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { + if (m_is_static) { + if (m_is_first_token) { + input_shape = ov::PartialShape{ 1, 1, m_max_token_len }; + } else { + input_shape = ov::PartialShape{ 1, 1, 1 }; + } + } else { + input_shape = ov::PartialShape{ 1, 1, ov::Dimension(1, m_max_token_len) }; + } + } else if (std::string(src->name) == "KQ_mask") { + if (m_is_static) { + if (m_is_first_token) { + input_shape = ov::PartialShape{ 1, m_max_token_len, m_max_token_len }; + } else { + input_shape = ov::PartialShape{ 1, 1, m_max_token_len }; + } + } else { + auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); + input_shape = ov::PartialShape{ 1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size) }; } + } else if (std::string(src->name).find("cache_k") == 0) { + input_shape = ov::PartialShape{ m_max_token_len, m_num_heads_kv, m_head_size }; + } else if (std::string(src->name).find("cache_v") == 0) { + input_shape = ov::PartialShape{ m_num_heads_kv, m_head_size, m_max_token_len }; + } else { + input_shape = ov::PartialShape{ get_shape(src) }; } + return input_shape; } void GgmlOvDecoder::add_extra_inputs() { @@ -267,6 +281,16 @@ void GgmlOvDecoder::add_extra_inputs() { } } +std::map GgmlOvDecoder::get_kv_param_res_names() const { + std::map kv_param_res_names; + for (const auto& name : m_kv_names) { + if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { + kv_param_res_names[name] = name; + } + } + return kv_param_res_names; +} + void GgmlOvDecoder::add_weight_const_parallel(std::map>& model_weights) { static std::mutex weights_mutex; auto* nodes = m_cgraph->nodes; @@ -344,8 +368,8 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) return weight_node; } -void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { - std::ofstream file("cgraph.txt"); +void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) { + std::ofstream file(filename); if (!file.is_open()) { std::cerr << "Failed to open file" << std::endl; return; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index b6b13d1f116c7..6d3f24b093a59 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include "ggml.h" @@ -89,28 +90,34 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_model_output_names; } - virtual bool is_static() const override { - return m_is_static; - } - virtual bool is_first_token() const override { - return m_is_first_token; - } - virtual int get_max_token_len() const override { - return m_max_token_len; - } + virtual int get_max_token_len() const override { return m_max_token_len; } + + virtual int get_num_heads() const override { return m_num_heads; } + + virtual int get_num_heads_kv() const override { return m_num_heads_kv; } + + virtual int get_head_size() const override { return m_head_size; } + + virtual std::map get_kv_param_res_names() const override; + + virtual bool is_static() const override { return m_is_static; } + + virtual bool is_first_token() const override { return m_is_first_token; } + + ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; private: void set_input_output(ggml_tensor* node); void add_extra_inputs(); - static void dump_cgraph(const struct ggml_cgraph* cgraph); + static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); static ov::element::Type get_ov_type(const ggml_tensor* tensor); - static std::shared_ptr create_weight_node(ggml_tensor* tensor); - void set_max_token_len(); - int m_max_token_len; + // set max_token_len, num_heads, etc + void set_llm_params(); + static std::shared_ptr create_weight_node(ggml_tensor* tensor); void add_weight_const_parallel(std::map>& model_weights); struct ggml_cgraph* m_cgraph; @@ -129,6 +136,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> m_model_extra_input_values; std::map> m_model_weights; std::vector m_model_output_names; + int m_max_token_len; + int m_num_heads; + int m_num_heads_kv; + int m_head_size; + std::vector m_kv_names; bool m_is_static; bool m_is_first_token; }; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 62125683995fc..3105d0f16f74a 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace ov { namespace frontend { @@ -57,6 +58,11 @@ class GgmlDecoder : public DecoderBase { virtual const std::map>& get_model_weights() const = 0; virtual const std::vector& get_model_output_names() const = 0; + virtual int get_num_heads() const = 0; + virtual int get_num_heads_kv() const = 0; + virtual int get_head_size() const = 0; + virtual std::map get_kv_param_res_names() const = 0; + virtual bool is_static() const = 0; virtual bool is_first_token() const = 0; virtual int get_max_token_len() const = 0; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index d27f4babb4c84..b183b97f23f95 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -57,13 +58,6 @@ OutputVector translate_cpy(const NodeContext& context) { if (op_case == 1) { // Write K to cache_k - int64_t head_size = src0_shape[2]; - int64_t num_heads = src0_shape[1]; - - auto reshaped_src1_shape = - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, num_heads, head_size}); - auto reshaped_src1 = std::make_shared(src1, reshaped_src1_shape, false); - auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0}); auto token_len_scalar = std::make_shared(token_len, zero); @@ -80,7 +74,8 @@ OutputVector translate_cpy(const NodeContext& context) { } indices = std::make_shared(indices, one); - res = std::make_shared(reshaped_src1, indices, src0); + auto updated = std::make_shared(src1, indices, src0); + res = std::make_shared(updated, std::make_shared(src1), false); } else { // Write V to cache_v auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); @@ -140,7 +135,7 @@ OutputVector translate_cpy(const NodeContext& context) { false); auto updated = std::make_shared(reshaped_src1, indices_final, flattend_src0); - res = std::make_shared(updated, zero); + res = std::make_shared(updated, std::make_shared(src1), false); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 8eda23c1c532f..3bf0403a64a96 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,7 +1,12 @@ #include "translate_session.hpp" #include +#include +#include +#include +#include #include +#include #include "input_model.hpp" @@ -11,6 +16,41 @@ namespace ggml { using namespace ov::op; +namespace { +ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( + const std::shared_ptr& model, const std::map& kv_param_res_names) { + ov::pass::MakeStateful::ParamResPairs pairs; + const auto& params = model->get_parameters(); + const auto& results = model->get_results(); + + for (const auto& param_res : kv_param_res_names) { + const auto& param_name = param_res.first; + const auto& res_name = param_res.second; + + auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr& node) { + return node->get_friendly_name() == param_name; + }); + + OPENVINO_ASSERT(param_it != params.end(), "The tensor name ", param_name, + " is not associated with any of " + "Parameters in the network."); + + auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr& node) { + return node->get_friendly_name() == res_name; + }); + + OPENVINO_ASSERT(res_it != results.end(), "The tensor name ", res_name, + " is not associated with any of " + "Results in the network."); + + std::shared_ptr param = *param_it; + std::shared_ptr res = *res_it; + pairs.emplace_back(param, res); + } + return pairs; +} +} // namespace + TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, const std::unordered_map& translator_map) : m_input_model(input_model), @@ -88,25 +128,26 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo results.push_back(result); } - ov::ParameterVector used_params; - for (const auto& param : params) { - if (!param->output(0).get_target_inputs().empty()) { - used_params.push_back(param); - } - } - if (getenv("GGML_OPENVINO_PROFILING")) { - if (auto diff = params.size() - used_params.size()) { - std::cout << diff << " parameters are not used in the model." << std::endl; - } - } - resulting_model = std::make_shared(results, used_params); + resulting_model = std::make_shared(results, params); + + apply_transformations(resulting_model); + return resulting_model; +} + +void TranslateSession::apply_transformations(const std::shared_ptr& model) { + auto ggml_model_decoder = std::dynamic_pointer_cast(m_input_model)->get_model_decoder(); ov::pass::Manager manager; manager.set_per_pass_validation(true); manager.register_pass(); - manager.run_passes(resulting_model); - return resulting_model; + if (!ggml_model_decoder->is_static()) { + const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); + const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); + manager.register_pass(kv_param_res_pairs); + } + + manager.run_passes(model); } } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp index 5c7a9d464d786..9167b55fe52ea 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.hpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -16,7 +16,7 @@ class TranslateSession { std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); private: - void print_model_topology(); + void apply_transformations(const std::shared_ptr& model); const frontend::InputModel::Ptr m_input_model; const std::unordered_map& m_translator_map; std::shared_ptr m_ov_model; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index d20e671064cc2..2620fa561508a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -9,10 +9,13 @@ #include #include #include +#include #include #include +#include #include #include +#include #include #include #include @@ -28,11 +31,15 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool } ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name) { - auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - ov::Tensor input_tensor; - ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - std::vector input_stride = ggml_decoder->get_input_stride(name); - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); + auto* input_data = ggml_tensor->data; + ov::Shape input_shape; + if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { + input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape(); + } else { + input_shape = ggml_decoder->get_input_shape(name).to_shape(); + } + auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); return input_tensor; } @@ -82,41 +89,37 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c core.set_property(ov::cache_dir(cache_dir)); } - // CPU and GPU will only use cache_prefill - using CachedItem = std::pair, ov::CompiledModel>; - static std::unordered_map compiled_cache_prefill; - static std::unordered_map compiled_cache_kvcache; + static std::unordered_map> infer_request_cache; + static std::unordered_map> ov_input_names_cache; + static std::unordered_map> ov_output_names_cache; + // For NPU, store the kvcache model, since we cannot create two infer_request + static std::unordered_map compiled_model_cache; std::shared_ptr ggml_decoder; - std::shared_ptr model; - ov::CompiledModel compiled_model; + ov::InferRequest infer_request; int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; - bool is_first_token = is_prefill(cgraph); - - auto it = compiled_cache_prefill.find(cgraph); - if (it != compiled_cache_prefill.end()) { + auto it = infer_request_cache.find(cgraph); + if (it != infer_request_cache.end()) { ggml_decoder = get_ggml_decoder(cgraph, is_static, false); decoder_end_time = ggml_time_us(); - if (is_static) { - if (is_first_token) { - model = compiled_cache_prefill[cgraph].first; - compiled_model = compiled_cache_prefill[cgraph].second; - } else { - model = compiled_cache_kvcache[cgraph].first; - compiled_model = compiled_cache_kvcache[cgraph].second; - } - } else { - model = it->second.first; - compiled_model = it->second.second; + // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache + if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) { + infer_request_cache[cgraph] = + std::make_shared(compiled_model_cache[cgraph].create_infer_request()); + compiled_model_cache.erase(cgraph); } + infer_request = *infer_request_cache[cgraph]; + conversion_end_time = ggml_time_us(); compile_end_time = conversion_end_time; } else { + std::shared_ptr model; + if (is_static) { ggml_decoder = get_ggml_decoder(cgraph, is_static, true); auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false); @@ -129,12 +132,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); conversion_end_time = ggml_time_us(); - compiled_model = core.compile_model(model, device, config); + auto compiled_model = core.compile_model(model, device, config); auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config); + compiled_model_cache[cgraph] = compiled_model_kvcache; compile_end_time = ggml_time_us(); - compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model); - compiled_cache_kvcache[cgraph] = std::make_pair(model_kvcache, compiled_model_kvcache); + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = *infer_request_cache[cgraph]; + compiled_model_cache[cgraph] = compiled_model_kvcache; if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; @@ -152,9 +157,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c model = ov::frontend::ggml::FrontEnd::convert(input_model); conversion_end_time = ggml_time_us(); - compiled_model = core.compile_model(model, device, config); + auto compiled_model = core.compile_model(model, device, config); compile_end_time = ggml_time_us(); - compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model); + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = *infer_request_cache[cgraph]; if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; @@ -163,12 +169,23 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model, timestamped_filename); } } + + std::vector ov_input_names; + std::vector ov_output_names; + for (const auto& ov_param : model->get_parameters()) { + ov_input_names.push_back(ov_param->get_friendly_name()); + } + for (const auto& ov_output : model->get_results()) { + ov_output_names.push_back(ov_output->get_friendly_name()); + } + ov_input_names_cache[cgraph] = ov_input_names; + ov_output_names_cache[cgraph] = ov_output_names; } - auto infer_request = compiled_model.create_infer_request(); - auto ov_params = model->get_parameters(); - for (size_t i = 0; i < ov_params.size(); i++) { - auto param_name = ov_params[i]->get_friendly_name(); + auto ov_input_names = ov_input_names_cache[cgraph]; + auto ov_output_names = ov_output_names_cache[cgraph]; + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); infer_request.set_input_tensor(i, input_tensor); @@ -181,14 +198,15 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c infer_request.infer(); auto infer_end_time = ggml_time_us(); - auto output_names = ggml_decoder->get_model_output_names(); - auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); - for (size_t i = 0; i < output_names.size(); i++) { - auto output_tensor = infer_request.get_output_tensor(i); - std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder); + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto result_name = ov_output_names[i]; + const auto output_tensor = infer_request.get_output_tensor(i); + + std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { - print_output_tensor_info(output_names[i], output_tensor, output_tensors); + print_output_tensor_info(result_name, output_tensor, gguf_tensor_addrs); } } auto end_time = ggml_time_us(); From 7cd2e60ea22f1dce2b56aff9eb04c6a582a2f818 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 3 Jul 2025 11:03:40 +0800 Subject: [PATCH 084/156] Add SwiGLU --- ggml/src/ggml-openvino/.clang-format | 2 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 87 +++++++++++-------- ggml/src/ggml-openvino/ggml-openvino.cpp | 38 ++++---- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 29 +++++++ ggml/src/ggml-openvino/openvino/op_table.cpp | 37 ++++---- ggml/src/ggml-openvino/openvino/op_table.hpp | 2 +- 6 files changed, 124 insertions(+), 71 deletions(-) create mode 100644 ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 9382a117b86da..6d77ecea3cc0a 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -4,6 +4,7 @@ AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false ReferenceAlignment: Left PointerAlignment: Left +Cpp11BracedListStyle: true Language: Cpp AlignAfterOpenBracket: Align @@ -65,7 +66,6 @@ CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 -Cpp11BracedListStyle: false DerivePointerAlignment: false DisableFormat: false EmptyLineBeforeAccessModifier: Leave diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index e30f026e36435..61c0fe48339f5 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -563,43 +563,58 @@ void GgmlOvDecoder::visit_subgraph(std::function opTypeMap = { - {GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"}, - {GGML_OP_ADD1, "GGML_OP_ADD1"}, {GGML_OP_CONT, "GGML_OP_CONT"}, - {GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"}, - {GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, - {GGML_OP_MUL, "GGML_OP_MUL"}, {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, - {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, - {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, {GGML_OP_ROPE, "GGML_OP_ROPE"}, - {GGML_OP_SCALE, "GGML_OP_SCALE"}, {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, - {GGML_OP_SUB, "GGML_OP_SUB"}, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, - {GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"}}; - static const std::map unaryOpTypeMap = { - {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"}, - {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"}, - {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG"}, - {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP"}, - {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH"}, - {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU"}, - {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU"}, - {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID"}, - {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU"}, - {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK"}, - {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU"}, - {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"}, + static const std::map ops = { + {GGML_OP_ACC, "GGML_OP_ACC" }, + {GGML_OP_ADD, "GGML_OP_ADD" }, + {GGML_OP_ADD1, "GGML_OP_ADD1" }, + {GGML_OP_CONT, "GGML_OP_CONT" }, + {GGML_OP_CPY, "GGML_OP_CPY" }, + {GGML_OP_DIV, "GGML_OP_DIV" }, + {GGML_OP_DUP, "GGML_OP_DUP" }, + {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, + {GGML_OP_MUL, "GGML_OP_MUL" }, + {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" }, + {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" }, + {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" }, + {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" }, + {GGML_OP_ROPE, "GGML_OP_ROPE" }, + {GGML_OP_SCALE, "GGML_OP_SCALE" }, + {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, + {GGML_OP_SUB, "GGML_OP_SUB" }, + {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, + {GGML_OP_VIEW, "GGML_OP_VIEW" } + }; + static const std::map unary_ops = { + {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" }, + {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" }, + {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" }, + {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" }, + {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" }, + {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" }, + {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" }, + {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" }, + {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" }, + {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" }, + {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" }, + {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" }, {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"}, - {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"}, - {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}}; - auto it = opTypeMap.find(m_node->op); - if (it != opTypeMap.end()) { - if (it->first == GGML_OP_UNARY) { - auto unary_it = unaryOpTypeMap.find(ggml_get_unary_op(m_node)); - if (unary_it != unaryOpTypeMap.end()) { - return unary_it->second; - } - } - return it->second; + {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" }, + {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" } + }; + static const std::map glu_ops = { + {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"}, + {GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" }, + {GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" } + }; + + switch (m_node->op) { + case GGML_OP_UNARY: + return unary_ops.at(ggml_get_unary_op(m_node)); + case GGML_OP_GLU: + return glu_ops.at(ggml_get_glu_op(m_node)); + default: + return ops.at(m_node->op); } - static const std::string unknown_op = "UNKNOWN_OP"; + static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 19e4ed5b77872..167453b215657 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -237,21 +237,29 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); - static const std::set supported_ops{ - GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, - GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, - GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, - GGML_OP_SCALE, GGML_OP_SOFT_MAX, - }; - static const std::set supported_unary_ops{ - GGML_UNARY_OP_SILU, - }; - - if (op->op == GGML_OP_UNARY) { - return supported_unary_ops.find(ggml_get_unary_op(op)) != - supported_unary_ops.end(); - } - return supported_ops.find(op->op) != supported_ops.end(); + static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, + GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, + GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, + GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX}; + static const std::set supported_unary_ops{ + GGML_UNARY_OP_SILU, + }; + static const std::set supported_glu_ops{ + GGML_GLU_OP_SWIGLU, + }; + + auto res = false; + switch (op->op) { + case GGML_OP_UNARY: + res = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); + break; + case GGML_OP_GLU: + res = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); + break; + default: + res = supported_ops.find(op->op) != supported_ops.end(); + } + return res; } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp new file mode 100644 index 0000000000000..28013fbaa00aa --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -0,0 +1,29 @@ +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_glu_swiglu(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto src1 = context.get_input(0); + auto src2 = context.get_input(1); + auto sigmoid = std::make_shared(src1); + auto silu = std::make_shared(src1, sigmoid); + auto res = std::make_shared(silu, src2); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index bf7d54d9a161e..a99450ea95643 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -16,24 +16,25 @@ namespace ggml { std::unordered_map get_supported_ops() { using namespace ov::op; return { - { "GGML_OP_ADD", op::translate_1to1_match_2_inputs }, - { "GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, - { "GGML_OP_CONT", op::translate_cont }, - { "GGML_OP_CPY", op::translate_cpy }, - { "GGML_OP_DIV", op::translate_1to1_match_2_inputs }, - { "GGML_OP_GET_ROWS", op::translate_get_rows }, - { "GGML_OP_MUL", op::translate_1to1_match_2_inputs }, - { "GGML_OP_MUL_MAT", op::translate_mulmat }, - { "GGML_OP_PERMUTE", op::translate_permute }, - { "GGML_OP_RESHAPE", op::translate_reshape }, - { "GGML_OP_RMS_NORM", op::translate_rms_norm }, - { "GGML_OP_ROPE", op::translate_rope }, - { "GGML_OP_SCALE", op::translate_scale }, - { "GGML_OP_SOFT_MAX", op::translate_soft_max }, - { "GGML_OP_SUB", op::translate_1to1_match_2_inputs }, - { "GGML_OP_TRANSPOSE", op::translate_transpose }, - { "GGML_UNARY_OP_SILU", op::translate_unary_silu }, - { "GGML_OP_VIEW", op::translate_view } + {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, + {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, + {"GGML_OP_CONT", op::translate_cont }, + {"GGML_OP_CPY", op::translate_cpy }, + {"GGML_OP_DIV", op::translate_1to1_match_2_inputs }, + {"GGML_OP_GET_ROWS", op::translate_get_rows }, + {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, + {"GGML_OP_MUL_MAT", op::translate_mulmat }, + {"GGML_OP_PERMUTE", op::translate_permute }, + {"GGML_OP_RESHAPE", op::translate_reshape }, + {"GGML_OP_RMS_NORM", op::translate_rms_norm }, + {"GGML_OP_ROPE", op::translate_rope }, + {"GGML_OP_SCALE", op::translate_scale }, + {"GGML_OP_SOFT_MAX", op::translate_soft_max }, + {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, + {"GGML_OP_TRANSPOSE", op::translate_transpose }, + {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, + {"GGML_OP_VIEW", op::translate_view }, + {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, }; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index d576c2a1357e1..9b141d6d20149 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -24,8 +24,8 @@ GGML_OP_CONVERTER(translate_scale); GGML_OP_CONVERTER(translate_unary_silu); GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_transpose); -GGML_OP_CONVERTER(translate_unary); GGML_OP_CONVERTER(translate_view); +GGML_OP_CONVERTER(translate_glu_swiglu); } // namespace op From 1c75a4d4e66d887bf5fc853eb56f35ad0a8a0e40 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 3 Jul 2025 13:22:39 +0800 Subject: [PATCH 085/156] Fuse to SDPA --- ggml/src/ggml-openvino/ggml-decoder.cpp | 48 ++++++----- ggml/src/ggml-openvino/ggml-decoder.h | 10 +-- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 +- .../ggml-openvino/openvino/node_context.hpp | 13 ++- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 5 +- .../src/ggml-openvino/openvino/op/permute.cpp | 21 ++--- .../ggml-openvino/openvino/op/soft_max.cpp | 82 ++++++++++--------- .../openvino/pass/fuse_to_sdpa.cpp | 61 ++++++++++++++ .../openvino/pass/fuse_to_sdpa.hpp | 17 ++++ .../openvino/translate_session.cpp | 3 + ggml/src/ggml-openvino/openvino/utils.hpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 20 +++-- 12 files changed, 190 insertions(+), 94 deletions(-) create mode 100644 ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp create mode 100644 ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 61c0fe48339f5..4a45aa2140b56 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -26,27 +26,36 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* m_cgraph, bool is_static, +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, + int context_size, int num_heads, int num_heads_kv, int head_size) : + GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) { + m_context_size = context_size; + m_num_heads = num_heads; + m_num_heads_kv = num_heads_kv; + m_head_size = head_size; +} + +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) : - m_cgraph(m_cgraph), + m_cgraph(cgraph), m_node(node), m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), m_is_static(is_static), m_is_first_token(is_first_token) { + // TODO avoid static static std::map> model_weights; - if (m_node) { set_input_output(m_node); } else { static bool printed = false; if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { - print_tensor_address_map(m_cgraph); + print_tensor_address_map(cgraph); printed = true; } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph.txt"; - dump_cgraph(m_cgraph, filename); + dump_cgraph(cgraph, filename); } set_llm_params(); @@ -57,8 +66,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* m_cgr weight_created = true; } - for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { - auto* cur_node = m_cgraph->nodes[node_n]; + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + auto* cur_node = cgraph->nodes[node_n]; m_nodes.push_back(cur_node); set_input_output(cur_node); } @@ -195,7 +204,7 @@ void GgmlOvDecoder::set_llm_params() { auto* node = m_cgraph->nodes[i]; if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") { auto* cache_k = node->src[0]; - m_max_token_len = cache_k->ne[1]; + m_context_size = cache_k->ne[1]; } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") { m_head_size = node->ne[0]; m_num_heads = node->ne[1]; @@ -210,30 +219,30 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { if (m_is_static) { if (m_is_first_token) { - input_shape = ov::PartialShape{ 1, 1, m_max_token_len }; + input_shape = ov::PartialShape{1, 1, m_context_size}; } else { - input_shape = ov::PartialShape{ 1, 1, 1 }; + input_shape = ov::PartialShape{1, 1, 1}; } } else { - input_shape = ov::PartialShape{ 1, 1, ov::Dimension(1, m_max_token_len) }; + input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)}; } } else if (std::string(src->name) == "KQ_mask") { if (m_is_static) { if (m_is_first_token) { - input_shape = ov::PartialShape{ 1, m_max_token_len, m_max_token_len }; + input_shape = ov::PartialShape{1, m_context_size, m_context_size}; } else { - input_shape = ov::PartialShape{ 1, 1, m_max_token_len }; + input_shape = ov::PartialShape{1, 1, m_context_size}; } } else { - auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); - input_shape = ov::PartialShape{ 1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size) }; + auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD); + input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)}; } } else if (std::string(src->name).find("cache_k") == 0) { - input_shape = ov::PartialShape{ m_max_token_len, m_num_heads_kv, m_head_size }; + input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (std::string(src->name).find("cache_v") == 0) { - input_shape = ov::PartialShape{ m_num_heads_kv, m_head_size, m_max_token_len }; + input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; } else { - input_shape = ov::PartialShape{ get_shape(src) }; + input_shape = ov::PartialShape{get_shape(src)}; } return input_shape; } @@ -557,7 +566,8 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { - auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_is_first_token); + auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_is_first_token, m_context_size, + m_num_heads, m_num_heads_kv, m_head_size); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 6d3f24b093a59..171300b40611b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -11,9 +11,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: - using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; - GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); + GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, + int context_size, int num_heads, int num_heads_kv, int head_size); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; @@ -90,7 +90,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_model_output_names; } - virtual int get_max_token_len() const override { return m_max_token_len; } + virtual int get_context_size() const override { return m_context_size; } virtual int get_num_heads() const override { return m_num_heads; } @@ -114,7 +114,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static std::vector get_stride(const ggml_tensor* tensor); static ov::element::Type get_ov_type(const ggml_tensor* tensor); - // set max_token_len, num_heads, etc + // set context_size, num_heads, etc void set_llm_params(); static std::shared_ptr create_weight_node(ggml_tensor* tensor); @@ -136,7 +136,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> m_model_extra_input_values; std::map> m_model_weights; std::vector m_model_output_names; - int m_max_token_len; + int m_context_size; int m_num_heads; int m_num_heads_kv; int m_head_size; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 3105d0f16f74a..8d2e06c0e5461 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -65,7 +65,7 @@ class GgmlDecoder : public DecoderBase { virtual bool is_static() const = 0; virtual bool is_first_token() const = 0; - virtual int get_max_token_len() const = 0; + virtual int get_context_size() const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index f4e7c4e31f4e3..62aa7d1fc58f8 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -91,11 +91,16 @@ class NodeContext : public frontend::NodeContext { bool is_first_token() const { return m_decoder->is_first_token(); } - int get_max_token_len() const { - return m_decoder->get_max_token_len(); - } -private: + int get_num_heads() const { return m_decoder->get_num_heads(); } + + int get_num_heads_kv() const { return m_decoder->get_num_heads_kv(); } + + int get_head_size() const { return m_decoder->get_head_size(); } + + int get_context_size() const { return m_decoder->get_context_size(); } + + private: std::shared_ptr m_decoder; std::shared_ptr& m_tensor_map; TranslateSession* m_translate_session; diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index d5a6ba2f0385f..cd027d28946ee 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -38,9 +38,8 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output B = context.get_input(0); ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); - auto src0_shape = context.get_input_shape(0).to_shape(); - int64_t num_heads = context.get_input_shape(1).to_shape()[0]; - int64_t num_heads_kv = src0_shape[0]; + int64_t num_heads = context.get_num_heads(); + int64_t num_heads_kv = context.get_num_heads_kv(); int64_t kv_num_heads_factor = num_heads / num_heads_kv; if (kv_num_heads_factor > 1) { auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads}); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 09d15da42718c..978b5377fb514 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -27,7 +27,7 @@ OutputVector translate_permute(const NodeContext& context) { if (op_case == 1) { auto perm = argsort_descend(context.get_output_stride(0)); res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, { 3 }, perm)); + ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); } else { auto src = context.get_input(0); auto attention_size = context.get_input("attention_size"); @@ -51,19 +51,16 @@ OutputVector translate_permute(const NodeContext& context) { false); } - auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 0)); - auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1)); - std::shared_ptr slice_end; + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + std::shared_ptr slice_axis; if (op_case == 2) { - slice_end = std::make_shared( - ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[2]})}, - 0); + slice_axis = zero; } else { - slice_end = std::make_shared( - ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[0]}), attention_size}, - 0); + slice_axis = two; } - auto src_slice = std::make_shared(src_reshaped, slice_start, slice_end, slice_step); + auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, slice_axis); if (op_case == 2) { res = std::make_shared(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); @@ -71,7 +68,7 @@ OutputVector translate_permute(const NodeContext& context) { res = src_slice; } } - return rename_outputs_with_suffix({ res }, context.get_name()); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index aeca9b3be59f3..81d43c37fefae 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -1,3 +1,5 @@ +#include +#include #include #include #include @@ -5,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -22,62 +25,61 @@ namespace op { OutputVector translate_soft_max(const NodeContext& context) { num_inputs_check(context, 1, 2); - auto input_node = context.get_input(0); + auto input_node = context.get_input(0).get_node_shared_ptr(); ov::Output res; float scale = 1.0f; float max_bias = 0.0f; - auto * op_params = context.get_output_op_params(0); - memcpy(&scale, (float*)op_params + 0, sizeof(float)); - memcpy(&max_bias, (float*)op_params + 1, sizeof(float)); + auto* op_params = context.get_output_op_params(0); + memcpy(&scale, (float*) op_params + 0, sizeof(float)); + memcpy(&max_bias, (float*) op_params + 1, sizeof(float)); + const uint32_t h = context.get_head_size(); - // const uint32_t n_head = context.get_input_shape(0)[0].get_length(); - // const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head)); + const uint32_t n_head = context.get_input_shape(0)[0].get_length(); + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); - // const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - // const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - // const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) - // : 1.0f; - const float slope = 1.0; + const float m0 = powf(2.0f, -(max_bias) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + const float slope = + (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; + std::shared_ptr scaled_input; if (scale != 1.0f) { auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); - input_node = std::make_shared(input_node, scale_node); + scaled_input = std::make_shared(input_node, scale_node); } - if (context.get_input_size() == 2) { - // Calculate mask then softmax - auto mask_node = context.get_input(1); - ov::element::Type mask_type = context.get_input_type(1); - if (mask_type == ov::element::f16) { - // Convert f16 to f32 - mask_node = std::make_shared(mask_node, ov::element::f32); - } - - // Stride slice mask node - Output slice_start = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}); - auto token_len = get_dimensions(input_node.get_node_shared_ptr(), {1}); - auto total_token_len = get_dimensions(mask_node.get_node_shared_ptr(), {2}); - auto slice_end = std::make_shared(ov::NodeVector{one, token_len, total_token_len}, 0); - Output slice_stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 1}); - auto mask_node_sliced = std::make_shared(mask_node, slice_start, slice_end, slice_stride); - - // slope * mask + auto mask_node = context.get_input(1); + + // Use Q-cur to retrieve the token length, so that the translation of SOFT_MAX + // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul + // can be fused into SDPA. + if (input_node->get_type_info() != ov::op::v0::Convert::get_type_info_static()) { + throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert"); + } + auto qk = input_node->get_input_node_shared_ptr(0); + if (qk->get_type_info() != ov::op::v0::MatMul::get_type_info_static()) { + throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert"); + } + auto token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); + + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); + + Output slope_mask; + if (slope != 1.0f) { auto slope_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{slope}); - auto slope_mask_node = std::make_shared(mask_node_sliced, slope_node); + slope_mask = std::make_shared(mask_node_sliced, slope_node); + throw std::runtime_error("Slope != 1.0f in softmax has not been tested, verify it before use."); + } + slope_mask = mask_node_sliced; - // input + slope * mask - auto input_slope_mask_node = std::make_shared(input_node, slope_mask_node); + auto input_slope_mask_node = std::make_shared(scaled_input, slope_mask); - // Calculate softmax - res = std::make_shared(input_slope_mask_node, 2); - } else { - // Directly softmax - res = std::make_shared(input_node, 0); - } + res = std::make_shared(input_slope_mask_node, 2); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp new file mode 100644 index 0000000000000..1b7ac602716ad --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -0,0 +1,61 @@ +#include "fuse_to_sdpa.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace pass { + +FuseToSDPA::FuseToSDPA() { + const auto m_k = ov::pass::pattern::any_input(); + const auto m_q = ov::pass::pattern::any_input(); + const auto m_qk = ov::pass::pattern::wrap_type({m_q, m_k}); + const auto m_qk_f32 = ov::pass::pattern::wrap_type({m_qk}); + const auto m_scale = ov::pass::pattern::any_input(); + const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk_f32, m_scale}); + const auto m_mask = ov::pass::pattern::any_input(); + const auto m_masked_qk = ov::pass::pattern::wrap_type({m_scaled_qk, m_mask}); + const auto m_softmax_qk = ov::pass::pattern::wrap_type({m_masked_qk}); + const auto m_softmax_qk_f16 = ov::pass::pattern::wrap_type({m_softmax_qk}); + const auto m_v = ov::pass::pattern::any_input(); + const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk_f16, m_v}); + + const auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + auto k = pattern_to_output[m_k]; + auto q = pattern_to_output[m_q]; + auto v = pattern_to_output[m_v]; + auto mask = pattern_to_output[m_mask]; + auto scale = pattern_to_output[m_scale]; + + auto v_trans = + register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); + auto mask_f16 = register_new_node(mask, ov::element::f16); + auto scale_f16 = register_new_node(scale, ov::element::f16); + auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); + + ov::replace_node(m.get_match_root(), sdpa); + ov::copy_runtime_info(m.get_matched_nodes(), sdpa); + + return true; + }; + register_matcher(std::make_shared(m_qkv, "ov::frontend::ggml::pass::FuseToSDPA"), + callback); +} + +} // namespace pass +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp new file mode 100644 index 0000000000000..8b5164d232932 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp @@ -0,0 +1,17 @@ +#include "openvino/pass/matcher_pass.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace pass { + +class FuseToSDPA : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::FuseToSDPA") + FuseToSDPA(); +}; + +} // namespace pass +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 3bf0403a64a96..1f311b4a40c2a 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -9,6 +9,7 @@ #include #include "input_model.hpp" +#include "pass/fuse_to_sdpa.hpp" namespace ov { namespace frontend { @@ -145,6 +146,8 @@ void TranslateSession::apply_transformations(const std::shared_ptr& model const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); manager.register_pass(kv_param_res_pairs); + + manager.register_pass(); } manager.run_passes(model); diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp index 1896f814273b2..b54b2b92c9dac 100644 --- a/ggml/src/ggml-openvino/openvino/utils.hpp +++ b/ggml/src/ggml-openvino/openvino/utils.hpp @@ -65,7 +65,7 @@ template OutputVector translate_1to1_match_2_inputs(const NodeContext& context) { num_inputs_check(context, 2, 2); auto res = std::make_shared(context.get_input(0), context.get_input(1)); - return rename_outputs_with_suffix({ res }, context.get_name()); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2620fa561508a..2c4f0afe58105 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -88,6 +89,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (cache_dir && !is_static) { core.set_property(ov::cache_dir(cache_dir)); } + // core.set_property(ov::enable_profiling(true)); static std::unordered_map> infer_request_cache; static std::unordered_map> ov_input_names_cache; @@ -256,10 +258,10 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons } else { if (param_name == "inp_tokens" || param_name == "inp_pos") { if (is_first_token) { - size_t max_token_len = ggml_decoder->get_max_token_len(); + size_t context_size = ggml_decoder->get_context_size(); const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); - std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, 0); - input_tensor = ov::Tensor(ov::element::i32, ov::Shape{ 1, 1, max_token_len }); + std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, 0); + input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size}); auto* data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } else { @@ -267,18 +269,18 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons } } else if (param_name == "KQ_mask") { - size_t max_token_len = ggml_decoder->get_max_token_len(); + size_t context_size = ggml_decoder->get_context_size(); const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); if (is_first_token) { std::vector padded_data = - pad_input(input_tensor_ggml, max_token_len, max_token_len, -INFINITY); - set_zero_diagonal(padded_data, max_token_len); - input_tensor = ov::Tensor(ov::element::f32, ov::Shape{ 1, max_token_len, max_token_len }); + pad_input(input_tensor_ggml, context_size, context_size, -INFINITY); + set_zero_diagonal(padded_data, context_size); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size}); auto* data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } else { - std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); - input_tensor = ov::Tensor(ov::element::f32, ov::Shape{ 1, 1, max_token_len }); + std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size}); auto* data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } From 30fe39f46f4af37d59869ca9738889866fa66c92 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 4 Jul 2025 14:38:15 +0800 Subject: [PATCH 086/156] Replace Concat with Broadcast in MulMat for GQA --- ggml/src/ggml-openvino/ggml-decoder.cpp | 3 +++ ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 20 ++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4a45aa2140b56..b731b26a9add3 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -118,6 +118,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { } auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); m_model_inputs[src_name] = param_node; } } @@ -262,6 +263,7 @@ void GgmlOvDecoder::add_extra_inputs() { std::string name = "past_token_len"; auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); + param_node->output(0).get_tensor().set_names({name}); m_model_extra_inputs[name] = param_node; auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); @@ -280,6 +282,7 @@ void GgmlOvDecoder::add_extra_inputs() { std::string name = "attention_size"; auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); + param_node->output(0).get_tensor().set_names({name}); m_model_extra_inputs[name] = param_node; auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index cd027d28946ee..139498939542e 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -10,6 +11,7 @@ #include #include #include +#include #include #include "../node_context.hpp" @@ -45,16 +47,20 @@ OutputVector translate_mulmat(const NodeContext& context) { auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads}); auto num_heads_kv_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads_kv}); + auto factor_node = + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_num_heads_factor}); auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - std::shared_ptr new_B_shape = - std::make_shared(ov::OutputVector{num_heads_kv_node, one, B_shape_last_two}, 0); - B = std::make_shared(B, new_B_shape, false); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto B_unsqueezed = std::make_shared(B, unsqueeze_axes); - B = std::make_shared(ov::OutputVector(kv_num_heads_factor, B), 1); - new_B_shape = std::make_shared(ov::OutputVector{num_heads_node, B_shape_last_two}, 0); - B = std::make_shared(B, new_B_shape, false); + auto broadcast_shape = std::make_shared( + ov::OutputVector{num_heads_kv_node, factor_node, B_shape_last_two}, 0); + auto B_broadcasted = std::make_shared(B_unsqueezed, broadcast_shape); + + auto new_B_shape = + std::make_shared(ov::OutputVector{num_heads_node, B_shape_last_two}, 0); + B = std::make_shared(B_broadcasted, new_B_shape, false); } auto result_lp = std::make_shared(A, B, false, true); From 53dade3f51c3d1bdc2f1f170dee7a86a228557d0 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Sun, 6 Jul 2025 21:59:30 +0800 Subject: [PATCH 087/156] Pull out indices creation for kv cache update --- .../ggml-openvino/openvino/node_context.hpp | 3 + ggml/src/ggml-openvino/openvino/op/cpy.cpp | 86 ++---------------- .../openvino/translate_session.cpp | 87 +++++++++++++++++++ 3 files changed, 99 insertions(+), 77 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index 62aa7d1fc58f8..b5f0f37406ac8 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -71,6 +71,9 @@ class NodeContext : public frontend::NodeContext { } Output get_input(const std::string& name) const override { + if (m_tensor_map->find(name) == m_tensor_map->end()) { + throw std::runtime_error("'" + name + "' not found in tensor map."); + } return m_tensor_map->at(name); } diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index b183b97f23f95..a70c62d9a8fa7 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -4,19 +4,11 @@ #include #include #include -#include -#include -#include #include #include -#include #include #include -#include -#include #include -#include -#include #include #include "../node_context.hpp" @@ -36,8 +28,13 @@ OutputVector translate_cpy(const NodeContext& context) { auto src0 = context.get_input(0); auto src1 = context.get_input(1); + auto token_len = context.get_input("token_len"); auto past_token_len = context.get_input("past_token_len"); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto token_len_scalar = std::make_shared(token_len, zero); + auto past_token_len_scalar = std::make_shared(past_token_len, zero); + src0 = std::make_shared(src0, context.get_input_type(1)); ov::Output res; @@ -52,89 +49,24 @@ OutputVector translate_cpy(const NodeContext& context) { std::vector input0_strides = context.get_input_stride(0); std::vector output_strides = context.get_output_stride(0); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); - if (op_case == 1) { // Write K to cache_k - auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0}); - auto token_len_scalar = std::make_shared(token_len, zero); - - std::shared_ptr indices; - if (context.is_static()) { - indices = past_token_len.get_node_shared_ptr(); - } else { - auto past_token_len_scalar = std::make_shared(past_token_len, zero); - auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); - indices = std::make_shared(past_token_len_scalar, - total_token_len_scalar, - one_scalar, - ov::element::i64); - } - indices = std::make_shared(indices, one); - + auto indices = context.get_input("update_indices_k"); auto updated = std::make_shared(src1, indices, src0); res = std::make_shared(updated, std::make_shared(src1), false); } else { // Write V to cache_v - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); - - int64_t total_head_size = src0_shape[1]; - auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); - auto total_head_size_scalar = std::make_shared(total_head_size_node, zero); - - auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); - auto token_len_scalar = std::make_shared(token_len, zero); - - // 1D tensor of shape [total_head_size], values starting from 0 - auto range_row = - std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64); - auto range_row_reshaped = - std::make_shared(range_row, - ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2})); - auto row_indices = std::make_shared( - range_row_reshaped, - std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); - - // 1D tensor of shape [token_len], values starting from past_token_len - std::shared_ptr range_col; - if (context.is_static()) { - range_col = past_token_len.get_node_shared_ptr(); - } else { - auto past_token_len_scalar = std::make_shared(past_token_len, zero); - auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); - range_col = std::make_shared(past_token_len_scalar, - total_token_len_scalar, - one_scalar, - ov::element::i64); - } - auto range_col_reshaped = - std::make_shared(range_col, - ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); - auto col_indices = std::make_shared( - range_col_reshaped, - std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); - - // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2] - auto indices = std::make_shared(OutputVector{row_indices, col_indices}, 2); - auto indices_final = std::make_shared( - indices, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), - false); - auto flattend_src0 = std::make_shared(src0, ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}), false); + int64_t total_head_size = src0_shape[1]; auto reshaped_src1 = std::make_shared( src1, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), false); - - auto updated = std::make_shared(reshaped_src1, indices_final, flattend_src0); + auto indices = context.get_input("update_indices_v"); + auto updated = std::make_shared(reshaped_src1, indices, flattend_src0); res = std::make_shared(updated, std::make_shared(src1), false); } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 1f311b4a40c2a..31325a0c115ed 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -3,11 +3,20 @@ #include #include #include +#include +#include +#include #include +#include +#include #include +#include +#include #include #include +#include "ggml-openvino/openvino/node_context.hpp" +#include "ggml-openvino/openvino/utils.hpp" #include "input_model.hpp" #include "pass/fuse_to_sdpa.hpp" @@ -50,6 +59,83 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( } return pairs; } + +void add_token_len(TensorMap& tensor_map) { + auto inp_tokens = tensor_map.at("inp_tokens").get_node_shared_ptr(); + auto token_len = get_dimensions(inp_tokens, {2}); + token_len->set_friendly_name("token_len"); + tensor_map.insert({"token_len", token_len->output(0)}); +} + +void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { + // cache_k layout: [S, N, H] (seq, num_heads, head_size) + // cache_v layout: [N, H, S] (num_heads, head_size, seq) + // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr(); + auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); + + std::shared_ptr update_indices_k; + std::shared_ptr update_indices_v; + + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + + if (ggml_model_decoder.is_static()) { + update_indices_k = past_token_len; + } else { + update_indices_k = + std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + } + update_indices_k = std::make_shared(update_indices_k, one); + update_indices_k->set_friendly_name("update_indices_k"); + tensor_map.insert({"update_indices_k", update_indices_k->output(0)}); + + auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size(); + auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); + auto total_head_size_scalar = std::make_shared(total_head_size_node, zero); + + // 1D tensor of shape [total_head_size], values starting from 0 + auto range_row = + std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i32); + auto range_row_reshaped = + std::make_shared(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2})); + auto row_indices = std::make_shared( + range_row_reshaped, + std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); + + // 1D tensor of shape [token_len], values starting from past_token_len + std::shared_ptr range_col; + if (ggml_model_decoder.is_static()) { + // aka inp_pos + range_col = past_token_len; + } else { + range_col = + std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + } + auto range_col_reshaped = + std::make_shared(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); + auto col_indices = std::make_shared( + range_col_reshaped, + std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); + + // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2] + auto indices = std::make_shared(OutputVector{row_indices, col_indices}, 2); + update_indices_v = std::make_shared( + indices, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), false); + update_indices_v->set_friendly_name("update_indices_v"); + tensor_map.insert({"update_indices_v", update_indices_v->output(0)}); +} + +// Create common patterns +void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { + add_token_len(tensor_map); + add_kv_update_indices(tensor_map, ggml_model_decoder); +} + } // namespace TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, @@ -118,6 +204,7 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo } }; + preprocess(*tensor_map, *ggml_model_decoder); ggml_model_decoder->visit_subgraph(node_visitor); for (const auto& name : ggml_model_decoder->get_model_output_names()) { From 5b707bb8c853caae0becfd2685fe38fdcad9ee98 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 9 Jul 2025 10:15:17 +0800 Subject: [PATCH 088/156] Refactor: remove past_token_len from extra_inputs --- ggml/src/ggml-openvino/ggml-decoder.cpp | 14 ++---------- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 13 +---------- .../openvino/translate_session.cpp | 22 +++++-------------- 3 files changed, 8 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b731b26a9add3..19152a5e6dbe1 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -249,26 +249,16 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co } void GgmlOvDecoder::add_extra_inputs() { - int64_t past_token_len = -1; // attention_size not used for NPU int64_t attention_size = -1; + int64_t past_token_len = -1; for (const auto& node : m_nodes) { if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { assert(std::string(node->view_src->name).find("cache_k") == 0); int64_t head_size = node->src[0]->ne[0]; int64_t num_heads = node->src[0]->ne[1]; - past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads); - - std::string name = "past_token_len"; - auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); - param_node->set_friendly_name(name); - param_node->output(0).get_tensor().set_names({name}); - m_model_extra_inputs[name] = param_node; - - auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); - *tensor->data() = past_token_len; - m_model_extra_input_values[name] = tensor; + past_token_len = (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads); break; } } diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index a70c62d9a8fa7..e85094bb1870e 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -28,12 +28,6 @@ OutputVector translate_cpy(const NodeContext& context) { auto src0 = context.get_input(0); auto src1 = context.get_input(1); - auto token_len = context.get_input("token_len"); - auto past_token_len = context.get_input("past_token_len"); - - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto token_len_scalar = std::make_shared(token_len, zero); - auto past_token_len_scalar = std::make_shared(past_token_len, zero); src0 = std::make_shared(src0, context.get_input_type(1)); ov::Output res; @@ -43,12 +37,6 @@ OutputVector translate_cpy(const NodeContext& context) { return rename_outputs_with_suffix({res}, context.get_name()); } - auto src0_shape = context.get_input_shape(0).to_shape(); - auto output_shape = context.get_output_shape(0).to_shape(); - - std::vector input0_strides = context.get_input_stride(0); - std::vector output_strides = context.get_output_stride(0); - if (op_case == 1) { // Write K to cache_k auto indices = context.get_input("update_indices_k"); @@ -60,6 +48,7 @@ OutputVector translate_cpy(const NodeContext& context) { std::make_shared(src0, ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}), false); + auto src0_shape = context.get_input_shape(0).to_shape(); int64_t total_head_size = src0_shape[1]; auto reshaped_src1 = std::make_shared( src1, diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 31325a0c115ed..95805866847ee 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -72,7 +72,6 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode // cache_v layout: [N, H, S] (num_heads, head_size, seq) // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); - auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr(); auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); std::shared_ptr update_indices_k; @@ -84,12 +83,8 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - if (ggml_model_decoder.is_static()) { - update_indices_k = past_token_len; - } else { - update_indices_k = - std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - } + update_indices_k = + std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); update_indices_k = std::make_shared(update_indices_k, one); update_indices_k->set_friendly_name("update_indices_k"); tensor_map.insert({"update_indices_k", update_indices_k->output(0)}); @@ -108,14 +103,8 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); // 1D tensor of shape [token_len], values starting from past_token_len - std::shared_ptr range_col; - if (ggml_model_decoder.is_static()) { - // aka inp_pos - range_col = past_token_len; - } else { - range_col = - std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - } + auto range_col = + std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); auto range_col_reshaped = std::make_shared(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); auto col_indices = std::make_shared( @@ -233,10 +222,9 @@ void TranslateSession::apply_transformations(const std::shared_ptr& model const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); manager.register_pass(kv_param_res_pairs); - - manager.register_pass(); } + manager.register_pass(); manager.run_passes(model); } From 7598ad079bd831f112149f7f328ebc8562748448 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 9 Jul 2025 10:16:06 +0800 Subject: [PATCH 089/156] Fix Phi3 SwiGLU and SoftMax --- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 27 ++++++++++++++----- .../ggml-openvino/openvino/op/soft_max.cpp | 8 ++---- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 28013fbaa00aa..138ef650901fd 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -1,6 +1,11 @@ +#include +#include #include +#include #include #include +#include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -12,13 +17,23 @@ namespace ggml { namespace op { OutputVector translate_glu_swiglu(const NodeContext& context) { - num_inputs_check(context, 2, 2); + num_inputs_check(context, 1, 2); - auto src1 = context.get_input(0); - auto src2 = context.get_input(1); - auto sigmoid = std::make_shared(src1); - auto silu = std::make_shared(src1, sigmoid); - auto res = std::make_shared(silu, src2); + ov::Output src0; + ov::Output src1; + if (context.get_input_size() == 2) { + src0 = context.get_input(0); + src1 = context.get_input(1); + } else { + auto combined = context.get_input(0); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {2}); + auto split = std::make_shared(combined, split_axis, 2); + src0 = split->output(0); + src1 = split->output(1); + } + auto sigmoid = std::make_shared(src0); + auto silu = std::make_shared(src0, sigmoid); + auto res = std::make_shared(silu, src1); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 81d43c37fefae..d59f4499a3592 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -43,12 +43,8 @@ OutputVector translate_soft_max(const NodeContext& context) { const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; - std::shared_ptr scaled_input; - if (scale != 1.0f) { - auto scale_node = - std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); - scaled_input = std::make_shared(input_node, scale_node); - } + auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + auto scaled_input = std::make_shared(input_node, scale_node); auto mask_node = context.get_input(1); From 86c368e26fac5c4d1eb9a5d892eb4e18fd9343a4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 9 Jul 2025 15:14:10 +0800 Subject: [PATCH 090/156] Pull out sin cos from rope --- ggml/src/ggml-openvino/ggml-decoder.cpp | 1 + ggml/src/ggml-openvino/ggml-decoder.h | 3 + ggml/src/ggml-openvino/openvino/decoder.hpp | 1 + ggml/src/ggml-openvino/openvino/op/rope.cpp | 116 ++---------------- .../openvino/translate_session.cpp | 92 ++++++++++++++ 5 files changed, 106 insertions(+), 107 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 19152a5e6dbe1..ae4beca23ead7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -209,6 +209,7 @@ void GgmlOvDecoder::set_llm_params() { } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") { m_head_size = node->ne[0]; m_num_heads = node->ne[1]; + m_rope_params = node->op_params; } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") { m_num_heads_kv = node->ne[1]; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 171300b40611b..8b507438c547e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -98,6 +98,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual int get_head_size() const override { return m_head_size; } + virtual int32_t* get_rope_params() const override { return m_rope_params; } + virtual std::map get_kv_param_res_names() const override; virtual bool is_static() const override { return m_is_static; } @@ -140,6 +142,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { int m_num_heads; int m_num_heads_kv; int m_head_size; + int32_t* m_rope_params; std::vector m_kv_names; bool m_is_static; bool m_is_first_token; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 8d2e06c0e5461..a3387ba3947a2 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -61,6 +61,7 @@ class GgmlDecoder : public DecoderBase { virtual int get_num_heads() const = 0; virtual int get_num_heads_kv() const = 0; virtual int get_head_size() const = 0; + virtual int32_t* get_rope_params() const = 0; virtual std::map get_kv_param_res_names() const = 0; virtual bool is_static() const = 0; diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 78523e5781bfd..f5736fefc87f5 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -3,131 +3,39 @@ #include #include #include -#include #include #include -#include -#include -#include #include #include #include -#include #include #include #include -#include #include #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" -#ifndef M_PI -# define M_PI 3.14159265358979323846 -#endif - -#define GGML_ROPE_TYPE_NEOX 2 - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - namespace ov { namespace frontend { namespace ggml { namespace op { -namespace { -float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { - return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); -} - -void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, - float dims[2]) { - float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); - float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); - dims[0] = MAX(0, start); - dims[1] = MIN(n_dims - 1, end); -} -} // namespace - OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); ov::Output res; - auto data_node = context.get_input(0); - auto pos_node = context.get_input(1); - pos_node = std::make_shared(pos_node, ov::element::f32); - - auto permutation_node = - std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); - Output pos_node_reshaped = std::make_shared(pos_node, permutation_node); + auto data_node = context.get_input(0).get_node_shared_ptr(); + auto cos_theta_node = context.get_input("rope_cos"); + auto sin_theta_node = context.get_input("rope_sin"); - auto output_shape = context.get_output_shape(0); - - float freq_base; - float freq_scale; - float ext_factor; - float attn_factor; - float beta_fast; - float beta_slow; int32_t* op_params = context.get_output_op_params(0); - const int n_dims = op_params[1]; const int mode = op_params[2]; - const int n_ctx_orig = op_params[4]; - memcpy(&freq_base, op_params + 5, sizeof(float)); - memcpy(&freq_scale, op_params + 6, sizeof(float)); - memcpy(&ext_factor, op_params + 7, sizeof(float)); - memcpy(&attn_factor, op_params + 8, sizeof(float)); - memcpy(&beta_fast, op_params + 9, sizeof(float)); - memcpy(&beta_slow, op_params + 10, sizeof(float)); - - const float theta_scale = powf(freq_base, -2.0f / n_dims); - - // TODO: corr_dims is not used in the current implementation - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - + constexpr int GGML_ROPE_TYPE_NEOX = 2; const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - // TODO: GGML_OP_ROPE_BACK -> false - bool forward = true; - const float sin_sign = forward ? 1.0f : -1.0f; - - const int64_t ne0 = output_shape[2].get_length(); - std::vector factor(ne0 / 2); - factor[0] = freq_scale; - for (int64_t i = 1; i < ne0 / 2; i++) { - factor[i] = theta_scale * factor[i - 1]; - } - - Output factor_node = - std::make_shared(ov::element::f32, ov::Shape{factor.size()}, factor); - if (context.get_input_size() == 3) { - auto freq_factors_node = context.get_input(2); - factor_node = std::make_shared(factor_node, freq_factors_node); - } - - auto half_last_dim = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {output_shape[2].get_length() / 2}); - Output input_shape_node = std::make_shared( - OutputVector{get_dimensions(data_node.get_node_shared_ptr(), {0, 1}), half_last_dim}, - 0); - Output factor_broadcasted_node = std::make_shared(factor_node, input_shape_node); - - Output cos_factor_broadcasted_node = std::make_shared( - std::make_shared(factor_broadcasted_node, pos_node_reshaped)); - Output sin_factor_broadcasted_node = std::make_shared( - std::make_shared(factor_broadcasted_node, pos_node_reshaped)); - - float mscale = attn_factor; - Output mscale_node = - std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale}); - Output mscale_sin_sign_node = - std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale * sin_sign}); - Output cos_theta_node = std::make_shared(cos_factor_broadcasted_node, mscale_node); - Output sin_theta_node = std::make_shared(sin_factor_broadcasted_node, mscale_node); - if (!is_neox) { auto input_shape = context.get_input_shape(0); @@ -146,18 +54,12 @@ OutputVector translate_rope(const NodeContext& context) { std::make_shared(odd_slice, cos_theta_node)); auto stack = std::make_shared(OutputVector{first_half, second_half}, 2); - auto shape_const = ov::op::v0::Constant::create( - ov::element::i64, - Shape{3}, - std::vector{-1, input_shape[1].get_length(), input_shape[2].get_length()}); - res = std::make_shared(stack, shape_const, false); + res = std::make_shared(stack, std::make_shared(data_node), false); } else { - auto slice_node = - std::make_shared(data_node, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), - 2); - Output slice_data_node_0 = slice_node->outputs()[0]; - Output slice_data_node_1 = slice_node->outputs()[1]; + auto data_split = std::make_shared( + data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); + Output slice_data_node_0 = data_split->outputs()[0]; + Output slice_data_node_1 = data_split->outputs()[1]; auto first_half_node = std::make_shared( std::make_shared(slice_data_node_0, cos_theta_node), diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 95805866847ee..d122497e63d6f 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,16 +1,23 @@ #include "translate_session.hpp" +#include #include #include #include #include #include #include +#include +#include +#include +#include #include #include #include #include +#include #include +#include #include #include #include @@ -119,10 +126,95 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode tensor_map.insert({"update_indices_v", update_indices_v->output(0)}); } +float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, + float dims[2]) { + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = std::max(0.0f, start); + dims[1] = std::min(static_cast(n_dims - 1), end); +} + +void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { + int32_t* rope_params = ggml_model_decoder.get_rope_params(); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + std::shared_ptr rope_freqs_weight; + + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_perm = + std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + inp_pos = std::make_shared(inp_pos, pos_perm); + if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) { + rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr(); + } + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + const int n_dims = rope_params[1]; + const int n_ctx_orig = rope_params[4]; + memcpy(&freq_base, rope_params + 5, sizeof(float)); + memcpy(&freq_scale, rope_params + 6, sizeof(float)); + memcpy(&ext_factor, rope_params + 7, sizeof(float)); + memcpy(&attn_factor, rope_params + 8, sizeof(float)); + memcpy(&beta_fast, rope_params + 9, sizeof(float)); + memcpy(&beta_slow, rope_params + 10, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + // TODO: corr_dims is not used in the current implementation + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + // TODO: GGML_OP_ROPE_BACK -> false + // bool forward = true; + // const float sin_sign = forward ? 1.0f : -1.0f; + + const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2; + std::vector factor(half_head_size); + factor[0] = freq_scale; + for (int64_t i = 1; i < half_head_size; i++) { + factor[i] = theta_scale * factor[i - 1]; + } + + Output factor_node = + std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); + if (rope_freqs_weight) { + factor_node = std::make_shared(factor_node, rope_freqs_weight); + } + + auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size}); + Output cos_factor = + std::make_shared(std::make_shared(factor_node, inp_pos)); + Output sin_factor = + std::make_shared(std::make_shared(factor_node, inp_pos)); + + float mscale = attn_factor; + Output mscale_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale}); + + auto cos_theta = std::make_shared(cos_factor, mscale_node); + auto sin_theta = std::make_shared(sin_factor, mscale_node); + cos_theta->set_friendly_name("rope_cos"); + sin_theta->set_friendly_name("rope_sin"); + tensor_map.insert({"rope_cos", cos_theta->output(0)}); + tensor_map.insert({"rope_sin", sin_theta->output(0)}); +} + // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); add_kv_update_indices(tensor_map, ggml_model_decoder); + add_rope_sin_cos(tensor_map, ggml_model_decoder); } } // namespace From 9b24b8b2e6bff5b2c4967cc7c7c00811037bc10b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 11 Jul 2025 15:44:19 +0800 Subject: [PATCH 091/156] Reduce memory: free ov weights node after graph conversion --- ggml/src/ggml-openvino/ggml-decoder.cpp | 16 +++++----------- ggml/src/ggml-openvino/ggml-decoder.h | 2 ++ ggml/src/ggml-openvino/utils.cpp | 4 +--- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index ae4beca23ead7..20d8c1b7fe538 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -42,28 +42,23 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), m_is_static(is_static), m_is_first_token(is_first_token) { - // TODO avoid static - static std::map> model_weights; if (m_node) { set_input_output(m_node); } else { - static bool printed = false; - if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { print_tensor_address_map(cgraph); - printed = true; } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; + auto timestamp = (long long) ggml_time_us(); + std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; dump_cgraph(cgraph, filename); } set_llm_params(); - static bool weight_created = false; - if (!weight_created) { - add_weight_const_parallel(model_weights); - weight_created = true; + if (is_first_token) { + add_weight_const_parallel(m_model_weights); } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -71,7 +66,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap m_nodes.push_back(cur_node); set_input_output(cur_node); } - m_model_weights = model_weights; add_extra_inputs(); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 8b507438c547e..428edef3ae628 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -108,6 +108,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; + void clear_model_weights() { m_model_weights.clear(); } + private: void set_input_output(ggml_tensor* node); void add_extra_inputs(); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2c4f0afe58105..e5a4401fec2b9 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -9,10 +9,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -89,7 +87,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (cache_dir && !is_static) { core.set_property(ov::cache_dir(cache_dir)); } - // core.set_property(ov::enable_profiling(true)); static std::unordered_map> infer_request_cache; static std::unordered_map> ov_input_names_cache; @@ -157,6 +154,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto input_model = std::make_shared(ggml_decoder); model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); conversion_end_time = ggml_time_us(); auto compiled_model = core.compile_model(model, device, config); From 0d069501008a13a4a52068be1af9fd9f2e5d999a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 17 Jul 2025 13:43:33 +0800 Subject: [PATCH 092/156] Fix CPY due to cgraph change --- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 5 +++++ src/llama-graph.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index e85094bb1870e..553f3c79666ca 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -39,6 +39,11 @@ OutputVector translate_cpy(const NodeContext& context) { if (op_case == 1) { // Write K to cache_k + int64_t head_size = context.get_head_size(); + int64_t num_heads_kv = context.get_num_heads_kv(); + auto src0_reshape_shape = + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, num_heads_kv, head_size}); + src0 = std::make_shared(src0, src0_reshape_shape, false); auto indices = context.get_input("update_indices_k"); auto updated = std::make_shared(src1, indices, src0); res = std::make_shared(updated, std::make_shared(src1), false); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 65ac8744e9012..d6a9f8a0c530f 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1499,7 +1499,7 @@ static std::unique_ptr build_attn_inp_kv_impl( inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream); - cb(inp->self_kq_mask, "KQ_mask", -1); + ggml_set_name(inp->self_kq_mask, "KQ_mask"); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; From 22b9037dd984d7792af1bcd9ec44c6c5aefcc58f Mon Sep 17 00:00:00 2001 From: ravi9 Date: Thu, 17 Jul 2025 17:51:10 -0700 Subject: [PATCH 093/156] Added OpenVINO CI/CD. Updated docs --- .devops/openvino.Dockerfile | 134 ++++++++++++++++++++++++++++++++++ .github/workflows/build.yml | 39 ++++++++++ .github/workflows/docker.yml | 13 ++-- .github/workflows/release.yml | 57 +++++++++++++++ ci/run.sh | 12 +++ docs/build.md | 112 ++++++++++++++++++---------- 6 files changed, 321 insertions(+), 46 deletions(-) create mode 100644 .devops/openvino.Dockerfile diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile new file mode 100644 index 0000000000000..16924e3937c90 --- /dev/null +++ b/.devops/openvino.Dockerfile @@ -0,0 +1,134 @@ +ARG OPENVINO_VERSION_MAJOR=2025.2 +ARG OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d +ARG UBUNTU_VERSION=24.04 + +# Optional proxy build arguments - empty by default +ARG http_proxy= +ARG https_proxy= + +## Build Image +FROM ubuntu:${UBUNTU_VERSION} AS build + +# Pass proxy args to build stage +ARG http_proxy +ARG https_proxy + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + gnupg \ + wget \ + git \ + cmake \ + ninja-build \ + build-essential \ + libtbb12 \ + libcurl4-openssl-dev && \ + rm -rf /var/lib/apt/lists/* + +# Install OpenVINO for Ubuntu 24.04 +ARG OPENVINO_VERSION_MAJOR +ARG OPENVINO_VERSION_FULL +RUN mkdir -p /opt/intel && \ + wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \ + tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \ + mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \ + cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \ + echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \ + cd - && \ + ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino + +ENV OpenVINO_DIR=/opt/intel/openvino + +WORKDIR /app + +COPY . . + +# Build Stage +RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \ + cmake -B build/ReleaseOV -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENVINO=ON && \ + cmake --build build/ReleaseOV -j$(nproc)" + +# Copy all necessary libraries +RUN mkdir -p /app/lib && \ + find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \ + find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \ + find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; + +# Create runtime directories and copy binaries +RUN mkdir -p /app/full \ + && cp build/ReleaseOV/bin/* /app/full/ \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base Runtime Image +FROM ubuntu:${UBUNTU_VERSION} AS base + +# Pass proxy args to runtime stage +ARG http_proxy +ARG https_proxy + +RUN apt-get update \ + && apt-get install -y libgomp1 libtbb12 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app/ + +### Full (all binaries) +FROM base AS full + +ARG http_proxy +ARG https_proxy + +COPY --from=build /app/full /app/ + +WORKDIR /app + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + python3 \ + python3-venv \ + python3-pip && \ + python3 -m venv /ov-venv && \ + /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \ + /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /tmp/* /var/tmp/* && \ + find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ + find /var/cache -type f -delete + +ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"] + + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app/ + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app/ + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8d6ba5f9f366f..a64bbf565b50d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -667,6 +667,45 @@ jobs: -DGGML_SYCL_F16=ON cmake --build build --config Release -j $(nproc) + ubuntu-24-cmake-openvino: + runs-on: ubuntu-24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-24-cmake-openvino-no-preset-v1 + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + export OPENVINO_VERSION_MAJOR=2025.2 + export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d + sudo apt-get update + sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar + sudo mkdir -p /opt/intel + wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz + tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz + sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} + rm openvino_${OPENVINO_VERSION_MAJOR}.tgz + cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} + echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd - + sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino + + - name: Build + id: cmake_build + run: | + source /opt/intel/openvino/setupvars.sh + cmake -B build/ReleaseOV -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENVINO=ON + cmake --build build/ReleaseOV --config Release -j $(nproc) + build-linux-cross: uses: ./.github/workflows/build-linux-cross.yml diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f73a2bc9f458b..d6fd098c6d52d 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -39,12 +39,13 @@ jobs: # Note: the arm64 images are failing, which prevents the amd64 images from being built # https://github.com/ggml-org/llama.cpp/issues/11888 #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false } - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } - - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } - - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } - - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } - - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } - - { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" } + - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } + - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } + - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } + - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } + - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } + - { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" } + - { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false } # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true } steps: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2ad381159409c..759d89ac1ec67 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -241,6 +241,63 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip name: llama-bin-ubuntu-vulkan-x64.zip + ubuntu-24-openvino: + runs-on: ubuntu-24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-24-cmake-openvino-release-no-preset-v1 + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + export OPENVINO_VERSION_MAJOR=2025.2 + export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d + sudo apt-get update + sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar + sudo mkdir -p /opt/intel + wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz + tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz + sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} + rm openvino_${OPENVINO_VERSION_MAJOR}.tgz + cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} + echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd - + sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino + + - name: Build + id: cmake_build + run: | + source /opt/intel/openvino/setupvars.sh + cmake -B build/ReleaseOV -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENVINO=ON + cmake --build build/ReleaseOV --config Release -j $(nproc) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/ReleaseOV/bin/ + zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip ./build/ReleaseOV/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip + name: llama-bin-ubuntu-openvino-x64.zip + windows-cpu: runs-on: windows-2025 diff --git a/ci/run.sh b/ci/run.sh index bf0d53f20af56..a00cbfdbdcc04 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -25,6 +25,9 @@ # # with KLEIDIAI support # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with OPENVINO support +# GG_BUILD_OPENVINO=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# if [ -z "$2" ]; then echo "usage: $0 " @@ -146,6 +149,15 @@ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then -DBUILD_SHARED_LIBS=OFF" fi +if [ ! -z ${GG_BUILD_OPENVINO} ]; then + if [ -z ${OpenVINO_DIR} ]; then + echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:" + echo "source /opt/intel/openvino/setupvars.sh" + exit 1 + fi + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON" +fi + ## helpers # download a file if it does not exist or if it is outdated diff --git a/docs/build.md b/docs/build.md index 4ee3666eab88c..2112ae0f9e8c2 100644 --- a/docs/build.md +++ b/docs/build.md @@ -25,7 +25,7 @@ The following sections describe how to build with different backends and options * [Arm® KleidiAI™](#arm-kleidiai) * [OpenCL](#opencl) * [Android](#android-1) -* [OPENVINO](#openvino) +* [OpenVINO](#openvino) * [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends) ## CPU Build @@ -590,20 +590,48 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md) -## OPENVINO +## OpenVINO -[OpenVINO](https://docs.openvino.ai/2025/index.html) is a open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp. +[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. +The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp. Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support. +### Prerequisites + +- Linux or Windows system with Intel hardware (CPU, GPU, or NPU) +- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html). +- Git, CMake, and Ninja software tools are needed for building +```bash + sudo apt-get update + sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar +``` + ### 1. Install OpenVINO Runtime - Follow the guide to install OpenVINO Runtime from an archive file: **[Install OpenVINO™ Runtime on Linux from an Archive File.](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html)** -- After installation, make sure to [source the environment setup script](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html#step-2-configure-the-environment): +
+📦 Click to expand OpenVINO 2025.2 installation commands +
+ ```bash -source /opt/intel/openvino_2025.1.0/setupvars.sh -``` +export OPENVINO_VERSION_MAJOR=2025.2 +export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d +sudo apt-get update +sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar +sudo mkdir -p /opt/intel +wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz +tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz +sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} +rm openvino_${OPENVINO_VERSION_MAJOR}.tgz +cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} +echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd - +sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino +source /opt/intel/openvino/setupvars.sh +``` +
+ - Verify OpenVINO is initialized properly ```bash echo $OpenVINO_DIR @@ -619,23 +647,26 @@ cd llama.cpp git switch dev_backend_openvino # Build with OpenVINO support -cmake --preset ReleaseOV -cmake --build build/ReleaseOV --parallel - +source /opt/intel/openvino/setupvars.sh +cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON +cmake --build build/ReleaseOV --config Release -j $(nproc) ``` ### 3. Download Sample Model -Download the Phi-3 mini model for testing: +Download models for testing: ```bash # Create models directory -mkdir -p ~/models/Phi-3-mini-4k-instruct-gguf +mkdir -p ~/models/ -# Download model file -wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \ - -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf +# Download model file: Llama-3.2-1B-Instruct.fp16.gguf +wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \ + -O ~/models/Llama-3.2-1B-Instruct.fp16.gguf +# Download model file: Phi-3-mini-4k-instruct-fp16.gguf +wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \ + -O ~/models/Phi-3-mini-4k-instruct-fp16.gguf ``` ### 4. Run inference with OpenVINO backend: @@ -644,28 +675,19 @@ When using the OpenVINO backend, the first inference token may have slightly hig ```bash export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache +# Default device is GPU. +# If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. +export GGML_OPENVINO_DEVICE=GPU -./build/ReleaseOV/bin/llama-simple \ - -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \ - -n 50 \ - "Hello, my name is " +./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` -### Using Llama.cpp's Built-in CPU Backend (for Comparison) - -To compare performance with the deafult CPU backend: - +To run in chat mode: ```bash -# Build CPU-only version -cmake --preset ReleaseCPU -cmake --build build/ReleaseCPU --parallel +export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache -# Run with Default CPU backend -./build/ReleaseCPU/bin/llama-simple \ - -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \ - -n 50 \ - "Hello, my name is " +./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` @@ -673,13 +695,14 @@ cmake --build build/ReleaseCPU --parallel Control OpenVINO behavior using these environment variables: -- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. +- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance. +- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: Not supported when using NPU devices yet. - **`GGML_OPENVINO_WEIGHT_AS_INPUT`**: Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them. -- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling -- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt` -- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps -- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging -- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging +- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling. +- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`. +- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps. +- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging. +- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging. ### Example with Profiling @@ -687,11 +710,20 @@ Control OpenVINO behavior using these environment variables: export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache export GGML_OPENVINO_PROFILING=1 -./build/ReleaseOV/bin/llama-simple \ - -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \ - -n 50 \ - "Hello, my name is " +./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " +``` + +### Using Llama.cpp's Built-in CPU Backend (for Comparison) + +To compare performance with the default CPU backend: + +```bash +# Build CPU-only version +cmake --preset ReleaseCPU +cmake --build build/ReleaseCPU --parallel +# Run with the default CPU backend +./build/ReleaseCPU/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` ## Notes about GPU-accelerated backends From d0e6df470fb3d0c055bf0753dba1fe37067d486f Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 23 Jul 2025 11:19:56 +0800 Subject: [PATCH 094/156] Fix llama-cli --- ggml/src/ggml-openvino/ggml-decoder.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 20d8c1b7fe538..a94a7ddf9c1c5 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -244,22 +244,36 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co } void GgmlOvDecoder::add_extra_inputs() { - // attention_size not used for NPU + // Extra inputs: + // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, + // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. + // Not used for NPU int64_t attention_size = -1; int64_t past_token_len = -1; + int64_t past_token_len_from_inp_pos = -1; for (const auto& node : m_nodes) { + if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") { + if (node->src[1]->type != GGML_TYPE_I32) { + throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32"); + } + past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0]; + } if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { assert(std::string(node->view_src->name).find("cache_k") == 0); - int64_t head_size = node->src[0]->ne[0]; - int64_t num_heads = node->src[0]->ne[1]; - past_token_len = (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads); + past_token_len = + (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / m_head_size / m_num_heads_kv); break; } } if (past_token_len == -1) { throw std::runtime_error("Failed to find input \"cache_k\" in the graph"); } + if (past_token_len != past_token_len_from_inp_pos) { + throw std::runtime_error("Mismatch between past_token_len from cache_k and inp_pos: " + + std::to_string(past_token_len) + " vs " + std::to_string(past_token_len_from_inp_pos)); + } + for (const auto& node : m_nodes) { if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) { int64_t total_token_len = node->src[1]->ne[0] + past_token_len; From 1fb239bc8df610cdeea77248bd142c252c97f1f6 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 21 Jul 2025 21:52:39 +0800 Subject: [PATCH 095/156] Fix Phi3 ROPE; Add test-backend-ops --- ggml/src/ggml-openvino/.clang-format | 26 +--- ggml/src/ggml-openvino/ggml-decoder.cpp | 77 ++++++++-- ggml/src/ggml-openvino/ggml-decoder.h | 10 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 142 ++++++++++++++++-- ggml/src/ggml-openvino/openvino/frontend.cpp | 4 +- ggml/src/ggml-openvino/openvino/frontend.hpp | 2 +- .../ggml-openvino/openvino/node_context.hpp | 4 + ggml/src/ggml-openvino/openvino/op/cont.cpp | 14 +- .../ggml-openvino/openvino/op/get_rows.cpp | 31 +++- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 60 ++++---- ggml/src/ggml-openvino/openvino/op/rope.cpp | 66 +++++--- .../ggml-openvino/openvino/op/soft_max.cpp | 33 ++-- .../openvino/translate_session.cpp | 83 +++------- .../openvino/translate_session.hpp | 3 +- ggml/src/ggml-openvino/openvino/utils.cpp | 139 +++++++++++++++++ ggml/src/ggml-openvino/openvino/utils.hpp | 10 ++ ggml/src/ggml-openvino/utils.cpp | 44 ++++++ ggml/src/ggml-openvino/utils.h | 4 + 18 files changed, 550 insertions(+), 202 deletions(-) diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 6d77ecea3cc0a..d631bc6c01d1e 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -5,6 +5,10 @@ AlignConsecutiveDeclarations: false ReferenceAlignment: Left PointerAlignment: Left Cpp11BracedListStyle: true +AccessModifierOffset: -4 +BinPackArguments: false +BinPackParameters: false +BreakBeforeBraces: Attach Language: Cpp AlignAfterOpenBracket: Align @@ -27,29 +31,7 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true -BinPackArguments: true -BinPackParameters: true # OnePerLine BitFieldColonSpacing: Both -BreakBeforeBraces: Custom # Attach -BraceWrapping: - AfterCaseLabel: true - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false # BreakAdjacentStringLiterals: true BreakAfterAttributes: Never BreakBeforeBinaryOperators: None diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a94a7ddf9c1c5..8ce9354c69ecc 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -15,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -71,9 +74,19 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap } } +GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { + m_cgraph = cgraph; + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + auto* cur_node = cgraph->nodes[node_n]; + m_nodes.push_back(cur_node); + set_input_output(cur_node, true); + } +} + // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; -// 2. constructing a decoder for a node. -void GgmlOvDecoder::set_input_output(ggml_tensor* node) { +// 2. constructing a decoder for a node; +// 3. constructing a decoder for the whole graph naively (op test case) +void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -98,8 +111,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { m_inputs[src_name] = src; m_op_node_name.emplace_back(src_name, ggml_op_name(node->op)); - // If called for the whole graph, create constant nodes for weights and param nodes for inputs - if (!m_node && !src->view_src) { + // Add model inputs and weights constants, if called for the whole graph + if (naive) { + auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + + } else if (!m_node && !src->view_src) { ggml_backend_buffer* buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { @@ -118,7 +137,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { } } - if (!m_node) { + // Add model outputs, if called for the whole graph + if (naive) { + m_model_output_names.push_back(node->name); + } else if (!m_node) { static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || @@ -164,17 +186,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { m_op_case = 2; } break; - } - case GGML_OP_MUL_MAT: { - if (node->src[0]->view_src == nullptr) { - m_op_case = 1; - } else if (std::string(node->src[0]->name).find("cache_k") == 0) { - m_op_case = 2; - } else if (std::string(node->src[0]->name).find("cache_v") == 0) { - m_op_case = 3; } - break; - } case GGML_OP_PERMUTE: { if (node->src[0]->view_src == nullptr) { // Permute Qcur @@ -188,6 +200,23 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { } break; } + case GGML_OP_GET_ROWS: + { + if (node->src[1]->op == GGML_OP_VIEW) { + m_op_case = 2; + } else { + m_op_case = 1; + } + break; + } + case GGML_OP_ROPE: + { + if (node->src[0]->op == GGML_OP_VIEW) { + m_op_case = 2; + } else { + m_op_case = 1; + } + } default: break; } @@ -237,6 +266,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (std::string(src->name).find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; + } else if (src->op == GGML_OP_VIEW) { + // This case is added to make test-backend-ops work + input_shape = ov::PartialShape{get_shape(src->view_src)}; } else { input_shape = ov::PartialShape{get_shape(src)}; } @@ -373,6 +405,17 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) weight_node = std::make_shared(node_type, node_shape, data_f16); break; } + case GGML_TYPE_BF16: + { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data_bf16; + data_bf16.reserve(ne_total); + for (int i = 0; i < ne_total; ++i) { + data_bf16.push_back(ov::bfloat16::from_bits(ptr[i])); + } + weight_node = std::make_shared(node_type, node_shape, data_bf16); + break; + } default: throw std::invalid_argument("Unsupported tensor type"); } @@ -496,6 +539,9 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { case GGML_TYPE_F16: type = ov::element::f16; break; + case GGML_TYPE_BF16: + type = ov::element::bf16; + break; case GGML_TYPE_I64: type = ov::element::i64; break; @@ -576,6 +622,7 @@ void GgmlOvDecoder::visit_subgraph(std::function ops = { + {GGML_OP_NONE, "GGML_OP_NONE" }, {GGML_OP_ACC, "GGML_OP_ACC" }, {GGML_OP_ADD, "GGML_OP_ADD" }, {GGML_OP_ADD1, "GGML_OP_ADD1" }, diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 428edef3ae628..f4fe9c402d53b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -15,6 +15,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size); + // Naive decoder + GgmlOvDecoder(struct ggml_cgraph* cgraph); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; GGML_UNUSED(name); @@ -111,7 +113,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void clear_model_weights() { m_model_weights.clear(); } private: - void set_input_output(ggml_tensor* node); + void set_input_output(ggml_tensor* node, bool naive = false); void add_extra_inputs(); static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); static std::vector get_shape(const ggml_tensor* tensor); @@ -124,13 +126,13 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static std::shared_ptr create_weight_node(ggml_tensor* tensor); void add_weight_const_parallel(std::map>& model_weights); - struct ggml_cgraph* m_cgraph; + struct ggml_cgraph* m_cgraph = nullptr; + ggml_tensor* m_node = nullptr; + std::vector m_nodes; std::map m_inputs; std::vector m_input_names; std::map m_outputs; std::vector m_output_names; - ggml_tensor* m_node; - std::vector m_nodes; std::string m_op_name; mutable std::string m_name; int m_op_case; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 167453b215657..2bc9d5199c6df 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -1,15 +1,17 @@ -#include "ggml-backend-impl.h" -#include "ggml-impl.h" #include "ggml-openvino.h" -#include "ggml-openvino/utils.h" -#include "ggml.h" +#include #include #include #include #include #include +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "ggml-openvino/utils.h" +#include "ggml.h" + #define GGML_OPENVINO_MAX_STREAMS 8 struct ggml_backend_openvino_context { @@ -234,9 +236,85 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g return nullptr; } +static bool is_op_unsupported_case(const ggml_tensor* op) { + if (op->op == GGML_OP_SOFT_MAX) { + float scale = 1.0f; + float max_bias = 0.0f; + const auto* op_params = op->op_params; + memcpy(&scale, (const float*) op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); + const uint32_t h = op->src[0]->ne[2]; + const uint32_t n_head = op->src[0]->ne[0]; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + const float slope = + (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; + + if (slope != 1.0f) { + GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n"); + return true; + } + } + + if (op->op == GGML_OP_MUL_MAT) { + if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) || + (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) { + GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n"); + return true; + } + } + + if (op->op == GGML_OP_ROPE) { + const int32_t* op_params = op->op_params; + const int n_dims = op_params[1]; + const int mode = op_params[2]; + if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); + return true; + } + if (n_dims != op->src[0]->ne[0]) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", + n_dims, + op->src[0]->ne[0]); + return true; + } + if (op->type != GGML_TYPE_F32) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); + return true; + } + float freq_scale; + memcpy(&freq_scale, op_params + 6, sizeof(float)); + if (freq_scale != 1.0f) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale); + return true; + } + float ext_factor; + memcpy(&ext_factor, op_params + 7, sizeof(float)); + if (ext_factor != 0.0f) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor); + return true; + } + if (op->src[0]->op == GGML_OP_VIEW) { + if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) { + GGML_LOG_WARN( + "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] %ld\n", + op->src[0]->view_src->ne[1], + op->src[0]->ne[2]); + return true; + } + } + } + return false; +} + static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); + static const std::set supported_types{ + GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32}; + static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, @@ -248,18 +326,60 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_GLU_OP_SWIGLU, }; - auto res = false; switch (op->op) { case GGML_OP_UNARY: - res = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); - break; + { + auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", + ggml_unary_op_name(ggml_get_unary_op(op))); + return false; + } + break; + } case GGML_OP_GLU: - res = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); - break; + { + auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", + ggml_glu_op_name(ggml_get_glu_op(op))); + return false; + } + break; + } default: - res = supported_ops.find(op->op) != supported_ops.end(); + { + auto supported = supported_ops.find(op->op) != supported_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + return false; + } + } + } + + if (supported_types.find(op->type) == supported_types.end()) { + GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); + return false; + } + if (op->ne[3] != 1) { + GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); + return false; + } + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (supported_types.find(op->type) == supported_types.end()) { + GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); + return false; + } + if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) { + GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); + return false; + } + } + + if (is_op_unsupported_case(op)) { + return false; } - return res; + return true; } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/openvino/frontend.cpp b/ggml/src/ggml-openvino/openvino/frontend.cpp index ff7f0e8392b0f..dbdae1ed45ca1 100644 --- a/ggml/src/ggml-openvino/openvino/frontend.cpp +++ b/ggml/src/ggml-openvino/openvino/frontend.cpp @@ -10,13 +10,13 @@ namespace ggml { FrontEnd::FrontEnd() {} -std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model) { +std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model, bool naive) { auto ggml_model = std::dynamic_pointer_cast(model); FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model"); std::shared_ptr converted_model; const auto& supported_ops = get_supported_ops(); { - TranslateSession translate_session(model, supported_ops); + TranslateSession translate_session(model, supported_ops, naive); converted_model = translate_session.get_converted_model(); } return converted_model; diff --git a/ggml/src/ggml-openvino/openvino/frontend.hpp b/ggml/src/ggml-openvino/openvino/frontend.hpp index 5cc7ff1773216..f1c6f0c3e3ce3 100644 --- a/ggml/src/ggml-openvino/openvino/frontend.hpp +++ b/ggml/src/ggml-openvino/openvino/frontend.hpp @@ -15,7 +15,7 @@ class FrontEnd { using Ptr = std::shared_ptr; FrontEnd(); - static std::shared_ptr convert(const InputModel::Ptr& model); + static std::shared_ptr convert(const InputModel::Ptr& model, bool naive = false); }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index b5f0f37406ac8..ceba64227523b 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -77,6 +77,10 @@ class NodeContext : public frontend::NodeContext { return m_tensor_map->at(name); } + bool has_input(const std::string& name) const { + return m_tensor_map->find(name) != m_tensor_map->end(); + } + const std::string& get_name() const override { return m_decoder->get_op_name(); } diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 5c6953caffe27..f83c0e62df77b 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -34,19 +34,7 @@ OutputVector translate_cont(const NodeContext& context) { false); } else { // The input comes from a VIEW - // Currently all cases are slicing at lowest dim - int32_t* op_params = context.get_input_op_params(0); - auto output_stride = context.get_output_stride(0); - - int64_t split_addr = op_params[0] / output_stride[2]; - std::vector begin = {0, 0, split_addr}; - std::vector end = {(int64_t)src_shape[0], INT_MAX, split_addr + (int64_t)src_shape[2]}; - std::vector strides = {1, 1, 1}; - - auto begin_const = ov::op::v0::Constant::create(element::i64, {begin.size()}, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, {strides.size()}, strides); - res = std::make_shared(context.get_input(0), begin_const, end_const, strides_const); + res = process_view_input(context, 0); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 9ed5f4deaf047..c97bbbf5a3657 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,10 +1,12 @@ +#include #include #include #include #include #include #include -#include +#include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -18,19 +20,32 @@ namespace op { OutputVector translate_get_rows(const NodeContext& context) { num_inputs_check(context, 2, 2); - auto data_node = context.get_input(0); - auto indices_node = context.get_input(1); + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); - auto indices_shape = get_dimensions(indices_node.get_node_shared_ptr(), {2}); - Output indice_reshaped = std::make_shared(indices_node, indices_shape, false); + Output res; + auto data = context.get_input(0); + auto indices = context.get_input(1); - auto axis_node = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + if (op_case == 2) { + // The input comes from a VIEW + indices = process_view_input(context, 1); + } + + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + if (indices.get_partial_shape()[1].get_length() == 1) { + indices = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + res = std::make_shared(data, indices, axis); + } else { + indices = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + res = std::make_shared(data, indices, axis, 1); + } - Output res = std::make_shared(data_node, indice_reshaped, axis_node); if (res.get_element_type() != context.get_output_type(0)) { res = std::make_shared(res, context.get_output_type(0)); } - return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 139498939542e..52d1e575dbd65 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -26,48 +26,46 @@ namespace op { OutputVector translate_mulmat(const NodeContext& context) { num_inputs_check(context, 2, 2); - int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported MULMAT case"); - ov::Output res; + ov::Output B = context.get_input(0); + ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); - if (op_case == 1) { - auto src0 = context.get_input(0); - auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0)); - auto result_lp = std::make_shared(src1, src0, false, true); - res = std::make_shared(result_lp, context.get_output_type(0)); - } else { - ov::Output B = context.get_input(0); - ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); + auto B_shape = context.get_input_shape(0).to_shape(); + auto A_shape = context.get_input_shape(1).to_shape(); + int64_t A_batch = A_shape[0]; + int64_t B_batch = B_shape[0]; + auto A_batch_larger = A_batch > B_batch; + Output Z = A_batch_larger ? B : A; + int64_t factor = A_batch_larger ? A_batch / B_batch : B_batch / A_batch; + if (factor > 1) { + auto A_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{A_batch}); + auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{B_batch}); + auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); - int64_t num_heads = context.get_num_heads(); - int64_t num_heads_kv = context.get_num_heads_kv(); - int64_t kv_num_heads_factor = num_heads / num_heads_kv; - if (kv_num_heads_factor > 1) { - auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads}); - auto num_heads_kv_node = - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads_kv}); - auto factor_node = - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_num_heads_factor}); - auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2}); + auto Z_last_two_dim = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - auto B_unsqueezed = std::make_shared(B, unsqueeze_axes); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); - auto broadcast_shape = std::make_shared( - ov::OutputVector{num_heads_kv_node, factor_node, B_shape_last_two}, 0); - auto B_broadcasted = std::make_shared(B_unsqueezed, broadcast_shape); + Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; + Output batch_large = A_batch_larger ? A_batch_node : B_batch_node; + auto broadcast_shape = + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dim}, 0); + auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); - auto new_B_shape = - std::make_shared(ov::OutputVector{num_heads_node, B_shape_last_two}, 0); - B = std::make_shared(B_broadcasted, new_B_shape, false); + auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dim}, 0); + Z = std::make_shared(Z_broadcasted, new_Z_shape, false); + } + if (A_batch_larger) { + B = Z; + } else { + A = Z; } auto result_lp = std::make_shared(A, B, false, true); res = std::make_shared(result_lp, context.get_output_type(0)); - } - return rename_outputs_with_suffix({res}, context.get_name()); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index f5736fefc87f5..7951a1e012c54 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "../node_context.hpp" @@ -25,37 +26,66 @@ namespace op { OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + ov::Output res; auto data_node = context.get_input(0).get_node_shared_ptr(); - auto cos_theta_node = context.get_input("rope_cos"); - auto sin_theta_node = context.get_input("rope_sin"); - + auto output_shape = context.get_output_shape(0).to_shape(); int32_t* op_params = context.get_output_op_params(0); - const int mode = op_params[2]; - constexpr int GGML_ROPE_TYPE_NEOX = 2; - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - if (!is_neox) { - auto input_shape = context.get_input_shape(0); + Output cos_theta_node; + Output sin_theta_node; + if (context.has_input("rope_cos")) { + cos_theta_node = context.get_input("rope_cos"); + sin_theta_node = context.get_input("rope_sin"); + } else { + auto inp_pos = context.get_input(1).get_node_shared_ptr(); + std::shared_ptr rope_freqs_weight; + if (context.get_input_size() == 3) { + rope_freqs_weight = context.get_input(2).get_node_shared_ptr(); + } + auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight); + sin_theta_node = sin_cos.first; + cos_theta_node = sin_cos.second; + } + + if (op_case == 2) { + // The input comes from a VIEW + int slice_len = output_shape[1] * output_shape[2]; + data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr(); + auto data_shape = ov::op::v0::Constant::create( + ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]}); + data_node = std::make_shared(data_node, data_shape, false); + } - auto begin_even = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); - auto begin_odd = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 1}); - auto end = std::make_shared(data_node); - auto stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 2}); - auto even_slice = std::make_shared(data_node, begin_even, end, stride); - auto odd_slice = std::make_shared(data_node, begin_odd, end, stride); + const int mode = op_params[2]; + constexpr int ROPE_TYPE_NEOX = 2; + constexpr int ROPE_TYPE_NORM = 0; - auto first_half = + if (mode == ROPE_TYPE_NORM) { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]}); + auto even_slice = std::make_shared(data_node, zero, end, two, two); + auto odd_slice = std::make_shared(data_node, one, end, two, two); + + Output first_half = std::make_shared(std::make_shared(even_slice, cos_theta_node), std::make_shared(odd_slice, sin_theta_node)); - auto second_half = + Output second_half = std::make_shared(std::make_shared(even_slice, sin_theta_node), std::make_shared(odd_slice, cos_theta_node)); - auto stack = std::make_shared(OutputVector{first_half, second_half}, 2); + first_half = std::make_shared(first_half, + ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + second_half = std::make_shared(second_half, + ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); res = std::make_shared(stack, std::make_shared(data_node), false); - } else { + } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); Output slice_data_node_0 = data_split->outputs()[0]; diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index d59f4499a3592..001a62be8b5e2 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -33,9 +33,9 @@ OutputVector translate_soft_max(const NodeContext& context) { auto* op_params = context.get_output_op_params(0); memcpy(&scale, (float*) op_params + 0, sizeof(float)); memcpy(&max_bias, (float*) op_params + 1, sizeof(float)); - const uint32_t h = context.get_head_size(); - - const uint32_t n_head = context.get_input_shape(0)[0].get_length(); + auto src0_shape = context.get_input_shape(0).get_shape(); + const uint32_t h = src0_shape[2]; + const uint32_t n_head = src0_shape[0]; const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); const float m0 = powf(2.0f, -(max_bias) / n_head_log2); @@ -46,23 +46,30 @@ OutputVector translate_soft_max(const NodeContext& context) { auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); auto scaled_input = std::make_shared(input_node, scale_node); + if (context.get_input_size() < 2) { + res = std::make_shared(scaled_input, 2); + return rename_outputs_with_suffix({res}, context.get_name()); + } + auto mask_node = context.get_input(1); - // Use Q-cur to retrieve the token length, so that the translation of SOFT_MAX + std::shared_ptr token_len = get_dimensions(input_node, {1}); + // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul // can be fused into SDPA. - if (input_node->get_type_info() != ov::op::v0::Convert::get_type_info_static()) { - throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert"); + if (input_node->get_type_info() == ov::op::v0::Convert::get_type_info_static()) { + auto qk = input_node->get_input_node_shared_ptr(0); + if (qk->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { + token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); + } } - auto qk = input_node->get_input_node_shared_ptr(0); - if (qk->get_type_info() != ov::op::v0::MatMul::get_type_info_static()) { - throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert"); - } - auto token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); + std::shared_ptr mask_node_sliced = + std::make_shared(mask_node, zero, token_len, one, one); + if (mask_node_sliced->get_element_type() != context.get_output_type(0)) { + mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type(0)); + } Output slope_mask; if (slope != 1.0f) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index d122497e63d6f..129c3592c903c 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -145,69 +145,18 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { int32_t* rope_params = ggml_model_decoder.get_rope_params(); auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); std::shared_ptr rope_freqs_weight; - - inp_pos = std::make_shared(inp_pos, ov::element::f32); - auto pos_perm = - std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); - inp_pos = std::make_shared(inp_pos, pos_perm); if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) { rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr(); } - float freq_base; - float freq_scale; - float ext_factor; - float attn_factor; - float beta_fast; - float beta_slow; - const int n_dims = rope_params[1]; - const int n_ctx_orig = rope_params[4]; - memcpy(&freq_base, rope_params + 5, sizeof(float)); - memcpy(&freq_scale, rope_params + 6, sizeof(float)); - memcpy(&ext_factor, rope_params + 7, sizeof(float)); - memcpy(&attn_factor, rope_params + 8, sizeof(float)); - memcpy(&beta_fast, rope_params + 9, sizeof(float)); - memcpy(&beta_slow, rope_params + 10, sizeof(float)); - - const float theta_scale = powf(freq_base, -2.0f / n_dims); - - // TODO: corr_dims is not used in the current implementation - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - - // TODO: GGML_OP_ROPE_BACK -> false - // bool forward = true; - // const float sin_sign = forward ? 1.0f : -1.0f; - - const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2; - std::vector factor(half_head_size); - factor[0] = freq_scale; - for (int64_t i = 1; i < half_head_size; i++) { - factor[i] = theta_scale * factor[i - 1]; - } - - Output factor_node = - std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); - if (rope_freqs_weight) { - factor_node = std::make_shared(factor_node, rope_freqs_weight); - } + auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight); + auto sin_theta = sin_cos.first; + auto cos_theta = sin_cos.second; - auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size}); - Output cos_factor = - std::make_shared(std::make_shared(factor_node, inp_pos)); - Output sin_factor = - std::make_shared(std::make_shared(factor_node, inp_pos)); - - float mscale = attn_factor; - Output mscale_node = - std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale}); - - auto cos_theta = std::make_shared(cos_factor, mscale_node); - auto sin_theta = std::make_shared(sin_factor, mscale_node); - cos_theta->set_friendly_name("rope_cos"); - sin_theta->set_friendly_name("rope_sin"); - tensor_map.insert({"rope_cos", cos_theta->output(0)}); - tensor_map.insert({"rope_sin", sin_theta->output(0)}); + cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos"); + sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin"); + tensor_map.insert({"rope_cos", cos_theta}); + tensor_map.insert({"rope_sin", sin_theta}); } // Create common patterns @@ -220,10 +169,12 @@ void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { } // namespace TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, - const std::unordered_map& translator_map) - : m_input_model(input_model), - m_translator_map(translator_map), - m_ov_model(nullptr) {} + const std::unordered_map& translator_map, + bool naive) : + m_input_model(input_model), + m_translator_map(translator_map), + m_ov_model(nullptr), + m_naive(naive) {} std::shared_ptr TranslateSession::get_converted_model() { if (m_ov_model) { @@ -258,6 +209,10 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo auto node_visitor = [&](std::shared_ptr node) { auto operation_type = node->get_op_type(); + if (operation_type == "GGML_OP_NONE") { + return; + } + ov::OutputVector converted_outputs; auto it = m_translator_map.find(operation_type); FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), @@ -285,7 +240,9 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo } }; - preprocess(*tensor_map, *ggml_model_decoder); + if (!m_naive) { + preprocess(*tensor_map, *ggml_model_decoder); + } ggml_model_decoder->visit_subgraph(node_visitor); for (const auto& name : ggml_model_decoder->get_model_output_names()) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp index 9167b55fe52ea..9eea5fd11cb01 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.hpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -10,7 +10,7 @@ namespace ggml { class TranslateSession { public: TranslateSession(const frontend::InputModel::Ptr& input_model, - const std::unordered_map& translator_map); + const std::unordered_map& translator_map, bool naive = false); std::shared_ptr get_converted_model(); std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); @@ -20,6 +20,7 @@ class TranslateSession { const frontend::InputModel::Ptr m_input_model; const std::unordered_map& m_translator_map; std::shared_ptr m_ov_model; + bool m_naive; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 69e26f05ca095..9634900753224 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -1,9 +1,20 @@ #include "utils.hpp" +#include #include #include +#include +#include +#include +#include +#include #include +#include +#include #include +#include +#include +#include #include namespace ov { @@ -58,6 +69,134 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std:: return outputs; } +namespace { +ov::Output rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], float ext_factor) { + int half_n_dims = n_dims / 2; + std::vector dim_ids_vec(half_n_dims); + std::iota(dim_ids_vec.begin(), dim_ids_vec.end(), 0); + auto dim_ids = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, (size_t) half_n_dims}, dim_ids_vec); + auto corr_low = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[0]}); + auto corr_high = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[1]}); + auto denom = + std::make_shared(std::make_shared(corr_high, corr_low), + ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {0.001f})); + auto ramp_y = + std::make_shared(std::make_shared(dim_ids, corr_low), denom); + auto ramp_clamped = std::make_shared(ramp_y, 0.0f, 1.0f); + auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor}); + auto ramp_mix = std::make_shared(ramp_clamped, ext_factor_node); + return ramp_mix; +} + +float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims(int n_dims, + int n_ctx_orig, + float freq_base, + float beta_fast, + float beta_slow, + float dims[2]) { + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = std::max(0.0f, start); + dims[1] = std::min(static_cast(n_dims - 1), end); +} +} // namespace + +std::pair, ov::Output> make_sin_cos(int32_t* rope_params, + std::shared_ptr inp_pos, + std::shared_ptr rope_freqs_weight) { + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_perm = + std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + inp_pos = std::make_shared(inp_pos, pos_perm); + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + const int n_dims = rope_params[1]; + const int n_ctx_orig = rope_params[4]; + memcpy(&freq_base, rope_params + 5, sizeof(float)); + memcpy(&freq_scale, rope_params + 6, sizeof(float)); + memcpy(&ext_factor, rope_params + 7, sizeof(float)); + memcpy(&attn_factor, rope_params + 8, sizeof(float)); + memcpy(&beta_fast, rope_params + 9, sizeof(float)); + memcpy(&beta_slow, rope_params + 10, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + std::vector factor(n_dims / 2); + factor[0] = freq_scale; + for (size_t i = 1; i < factor.size(); i++) { + factor[i] = theta_scale * factor[i - 1]; + } + + Output freq_factors = + std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); + if (rope_freqs_weight) { + freq_factors = std::make_shared(freq_factors, rope_freqs_weight); + } + + auto theta_extrap = std::make_shared(freq_factors, inp_pos); + auto theta_interp = std::make_shared( + theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale})); + + Output theta; + float mscale = attn_factor; + if (ext_factor == 0.0f) { + theta = theta_interp; + } else { + auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor); + auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f}); + auto one_minus_ramp = std::make_shared(one, ramp_mix); + + theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), + std::make_shared(theta_extrap, ramp_mix)); + mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale)); + } + + Output cos_theta = std::make_shared(theta); + Output sin_theta = std::make_shared(theta); + + auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale}); + + cos_theta = std::make_shared(cos_theta, mscale_node); + sin_theta = std::make_shared(sin_theta, mscale_node); + return std::make_pair(sin_theta, cos_theta); +} + +ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len) { + // Only works for VIEW operations that slice at the lowest dimension + // If the VIEW also reshape the result, `slice_len` should be provided + auto input = context.get_input(input_index); + int32_t* op_params = context.get_input_op_params(input_index); + auto src1_stride = context.get_input_stride(input_index); + + int64_t split_addr = op_params[0] / src1_stride[2]; + if (slice_len == 0) { + slice_len = context.get_input_shape(input_index)[2].get_length(); + } + int64_t slice_end = split_addr + slice_len; + + auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end}); + auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto sliced = std::make_shared(input, begin, end, stride, axes); + return sliced; +} + } // namespace ggml } // namespace frontend } // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp index b54b2b92c9dac..6c6d2ae8d4f23 100644 --- a/ggml/src/ggml-openvino/openvino/utils.hpp +++ b/ggml/src/ggml-openvino/openvino/utils.hpp @@ -1,6 +1,10 @@ #pragma once +#include +#include #include +#include +#include #include "node_context.hpp" @@ -60,6 +64,12 @@ std::shared_ptr get_dimensions(const std::shared_ptr& node, OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix); +std::pair, ov::Output> make_sin_cos(int32_t* rope_params, + std::shared_ptr inp_pos, + std::shared_ptr rope_freqs_weight = nullptr); + +ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len = 0); + namespace op { template OutputVector translate_1to1_match_2_inputs(const NodeContext& context) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index e5a4401fec2b9..fcfd3639a7136 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -21,6 +21,7 @@ #include #include "ggml-impl.h" +#include "ggml-openvino/ggml-decoder.h" #include "ggml.h" #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" @@ -35,6 +36,9 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, ov::Shape input_shape; if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape(); + } else if (ggml_tensor->op == GGML_OP_VIEW) { + // This case is added to make test-backend-ops work + input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape(); } else { input_shape = ggml_decoder->get_input_shape(name).to_shape(); } @@ -81,6 +85,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c config = get_npu_config(); } + if (cgraph->n_nodes == 1) { + return naive_compute(cgraph, core, device, config); + } + auto start_time = ggml_time_us(); auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); @@ -242,6 +250,42 @@ ov::AnyMap get_npu_config() { return config; } +enum ggml_status naive_compute(struct ggml_cgraph* cgraph, + ov::Core& core, + const std::string& device, + const ov::AnyMap& config) { + if (cgraph->nodes[0]->op == GGML_OP_NONE) { + return GGML_STATUS_SUCCESS; + } + + auto decoder = std::make_shared(cgraph); + auto input_model = std::make_shared(decoder); + auto naive = true; + auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); + auto infer_request = core.compile_model(model, device, config).create_infer_request(); + + ov::serialize(model, "IR.xml"); + + auto ov_params = model->get_parameters(); + for (size_t i = 0; i < ov_params.size(); i++) { + auto param_name = ov_params[i]->get_friendly_name(); + auto input_tensor = get_ov_input_tensor(decoder, param_name); + infer_request.set_input_tensor(i, input_tensor); + } + + infer_request.infer(); + + auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder); + auto ov_results = model->get_results(); + for (size_t i = 0; i < ov_results.size(); i++) { + auto result_name = ov_results[i]->get_friendly_name(); + const auto output_tensor = infer_request.get_output_tensor(i); + + std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); + } + return GGML_STATUS_SUCCESS; +} + ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name) { bool is_static = ggml_decoder->is_static(); bool is_first_token = ggml_decoder->is_first_token(); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 1d23e285227e6..367b2829bec3b 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,5 @@ #include +#include #include "ggml-backend-impl.h" #include "ggml-decoder.h" @@ -42,3 +43,6 @@ bool is_prefill(struct ggml_cgraph * cgraph); ov::AnyMap get_npu_config(); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); + +enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, + const ov::AnyMap& config); From dcbbe7f0578d5b66b032115920737bc8085ccb69 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 23 Jul 2025 15:37:58 +0800 Subject: [PATCH 096/156] Fix NPU --- ggml/src/ggml-openvino/.clang-format | 2 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 68 ++++++++++++------------- ggml/src/ggml-openvino/ggml-decoder.h | 14 +++-- ggml/src/ggml-openvino/utils.cpp | 16 +++--- 4 files changed, 52 insertions(+), 48 deletions(-) diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index d631bc6c01d1e..18280772b6341 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -7,7 +7,6 @@ PointerAlignment: Left Cpp11BracedListStyle: true AccessModifierOffset: -4 BinPackArguments: false -BinPackParameters: false BreakBeforeBraces: Attach Language: Cpp @@ -31,6 +30,7 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true +BinPackParameters: true BitFieldColonSpacing: Both # BreakAdjacentStringLiterals: true BreakAfterAttributes: Never diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 8ce9354c69ecc..b233ff8ebd3f8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -31,47 +31,45 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size) : - GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) { - m_context_size = context_size; - m_num_heads = num_heads; - m_num_heads_kv = num_heads_kv; - m_head_size = head_size; + m_cgraph(cgraph), + m_node(node), + m_op_name(std::string(node->name)), + m_context_size(context_size), + m_num_heads(num_heads), + m_num_heads_kv(num_heads_kv), + m_head_size(head_size), + m_is_static(is_static), + m_is_first_token(is_first_token) { + set_input_output(node); } -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, +GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, + std::map>& model_weights, bool is_static, bool is_first_token) : m_cgraph(cgraph), - m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), + m_op_name(m_node ? std::string(m_node->name) : ""), + m_model_weights(model_weights), m_is_static(is_static), m_is_first_token(is_first_token) { - if (m_node) { - set_input_output(m_node); - } else { - if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { - print_tensor_address_map(cgraph); - } - - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - auto timestamp = (long long) ggml_time_us(); - std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; - dump_cgraph(cgraph, filename); - } - - set_llm_params(); + if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + print_tensor_address_map(cgraph); + } - if (is_first_token) { - add_weight_const_parallel(m_model_weights); - } + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + auto timestamp = (long long) ggml_time_us(); + std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; + dump_cgraph(cgraph, filename); + } - for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - auto* cur_node = cgraph->nodes[node_n]; - m_nodes.push_back(cur_node); - set_input_output(cur_node); - } + set_llm_params(); - add_extra_inputs(); + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + auto* cur_node = cgraph->nodes[node_n]; + m_nodes.push_back(cur_node); + set_input_output(cur_node); } + + add_extra_inputs(); } GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { @@ -334,10 +332,11 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const return kv_param_res_names; } -void GgmlOvDecoder::add_weight_const_parallel(std::map>& model_weights) { +std::map> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) { + std::map> model_weights; static std::mutex weights_mutex; - auto* nodes = m_cgraph->nodes; - auto n_nodes = m_cgraph->n_nodes; + auto* nodes = cgraph->nodes; + auto n_nodes = cgraph->n_nodes; std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) { for (int i = 0; i < GGML_MAX_SRC; i++) { auto* src = node->src[i]; @@ -369,6 +368,7 @@ void GgmlOvDecoder::add_weight_const_parallel(std::map GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f4fe9c402d53b..78422afaf7d2c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -11,12 +11,17 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: - GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); + // Graph decoder + GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights, + bool is_static, bool is_first_token); + + // Node decoder, called in GgmlOvDecoder::visit_subgraph GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size); - // Naive decoder + // Naive graph decoder GgmlOvDecoder(struct ggml_cgraph* cgraph); + virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; GGML_UNUSED(name); @@ -110,6 +115,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; + static std::shared_ptr create_weight_node(ggml_tensor* tensor); + static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); void clear_model_weights() { m_model_weights.clear(); } private: @@ -123,9 +130,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { // set context_size, num_heads, etc void set_llm_params(); - static std::shared_ptr create_weight_node(ggml_tensor* tensor); - void add_weight_const_parallel(std::map>& model_weights); - struct ggml_cgraph* m_cgraph = nullptr; ggml_tensor* m_node = nullptr; std::vector m_nodes; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index fcfd3639a7136..be06c54e8b938 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -26,10 +26,6 @@ #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" -std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) { - return std::make_shared(nullptr, cgraph, is_static, is_first_token); -} - ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name) { const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); auto* input_data = ggml_tensor->data; @@ -111,7 +107,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto it = infer_request_cache.find(cgraph); if (it != infer_request_cache.end()) { - ggml_decoder = get_ggml_decoder(cgraph, is_static, false); + std::map> model_weights; + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false); decoder_end_time = ggml_time_us(); // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache @@ -126,17 +123,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compile_end_time = conversion_end_time; } else { std::shared_ptr model; + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); if (is_static) { - ggml_decoder = get_ggml_decoder(cgraph, is_static, true); - auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false); + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); + auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache); model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); + ggml_decoder_kvcache->clear_model_weights(); conversion_end_time = ggml_time_us(); auto compiled_model = core.compile_model(model, device, config); @@ -157,7 +157,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model_kvcache, timestamped_filename); } } else { - ggml_decoder = get_ggml_decoder(cgraph, is_static, true); + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); From c0bf84e34da41a623d03713b692cd99aec9683b1 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 24 Jul 2025 11:56:25 +0800 Subject: [PATCH 097/156] Fix llama-bench; Clang-format --- ggml/src/ggml-openvino/.clang-format | 4 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 67 +++++++++++------------- ggml/src/ggml-openvino/ggml-openvino.cpp | 53 +++++++++---------- 3 files changed, 58 insertions(+), 66 deletions(-) diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 18280772b6341..63dc2c472a95d 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -8,6 +8,8 @@ Cpp11BracedListStyle: true AccessModifierOffset: -4 BinPackArguments: false BreakBeforeBraces: Attach +IndentCaseBlocks: false +IndentCaseLabels: false Language: Cpp AlignAfterOpenBracket: Align @@ -68,8 +70,6 @@ IncludeCategories: IncludeIsMainRegex: '([-_](test|unittest))?$' IncludeIsMainSourceRegex: '' IndentAccessModifiers: false -IndentCaseBlocks: true -IndentCaseLabels: true IndentExternBlock: NoIndent IndentGotoLabels: false IndentPPDirectives: AfterHash diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b233ff8ebd3f8..3dc2a3eeac177 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -176,7 +176,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { break; } case GGML_OP_CPY: { - if (ggml_is_contiguous(node)) { + if (std::string(node->src[1]->name).find("cache_k") == 0) { // Write K to cache_k m_op_case = 1; } else { @@ -184,7 +184,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { m_op_case = 2; } break; - } + } case GGML_OP_PERMUTE: { if (node->src[0]->view_src == nullptr) { // Permute Qcur @@ -198,23 +198,21 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } break; } - case GGML_OP_GET_ROWS: - { - if (node->src[1]->op == GGML_OP_VIEW) { - m_op_case = 2; - } else { - m_op_case = 1; - } - break; + case GGML_OP_GET_ROWS: { + if (node->src[1]->op == GGML_OP_VIEW) { + m_op_case = 2; + } else { + m_op_case = 1; } - case GGML_OP_ROPE: - { - if (node->src[0]->op == GGML_OP_VIEW) { - m_op_case = 2; - } else { - m_op_case = 1; - } + break; + } + case GGML_OP_ROPE: { + if (node->src[0]->op == GGML_OP_VIEW) { + m_op_case = 2; + } else { + m_op_case = 1; } + } default: break; } @@ -405,17 +403,16 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) weight_node = std::make_shared(node_type, node_shape, data_f16); break; } - case GGML_TYPE_BF16: - { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data_bf16; - data_bf16.reserve(ne_total); - for (int i = 0; i < ne_total; ++i) { - data_bf16.push_back(ov::bfloat16::from_bits(ptr[i])); - } - weight_node = std::make_shared(node_type, node_shape, data_bf16); - break; + case GGML_TYPE_BF16: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data_bf16; + data_bf16.reserve(ne_total); + for (int i = 0; i < ne_total; ++i) { + data_bf16.push_back(ov::bfloat16::from_bits(ptr[i])); } + weight_node = std::make_shared(node_type, node_shape, data_bf16); + break; + } default: throw std::invalid_argument("Unsupported tensor type"); } @@ -614,8 +611,8 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { - auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_is_first_token, m_context_size, - m_num_heads, m_num_heads_kv, m_head_size); + auto decoder = std::make_shared( + node, m_cgraph, m_is_static, m_is_first_token, m_context_size, m_num_heads, m_num_heads_kv, m_head_size); node_visitor(decoder); } } @@ -667,12 +664,12 @@ const std::string& GgmlOvDecoder::get_op_type() const { }; switch (m_node->op) { - case GGML_OP_UNARY: - return unary_ops.at(ggml_get_unary_op(m_node)); - case GGML_OP_GLU: - return glu_ops.at(ggml_get_glu_op(m_node)); - default: - return ops.at(m_node->op); + case GGML_OP_UNARY: + return unary_ops.at(ggml_get_unary_op(m_node)); + case GGML_OP_GLU: + return glu_ops.at(ggml_get_glu_op(m_node)); + default: + return ops.at(m_node->op); } static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 2bc9d5199c6df..7edd4667d9b56 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -309,7 +309,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { return false; } -static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) { GGML_ASSERT(dev->reg != nullptr); static const std::set supported_types{ @@ -327,34 +327,29 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con }; switch (op->op) { - case GGML_OP_UNARY: - { - auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); - if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", - ggml_unary_op_name(ggml_get_unary_op(op))); - return false; - } - break; - } - case GGML_OP_GLU: - { - auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); - if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", - ggml_glu_op_name(ggml_get_glu_op(op))); - return false; - } - break; - } - default: - { - auto supported = supported_ops.find(op->op) != supported_ops.end(); - if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); - return false; - } - } + case GGML_OP_UNARY: { + auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op))); + return false; + } + break; + } + case GGML_OP_GLU: { + auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", ggml_glu_op_name(ggml_get_glu_op(op))); + return false; + } + break; + } + default: { + auto supported = supported_ops.find(op->op) != supported_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + return false; + } + } } if (supported_types.find(op->type) == supported_types.end()) { From 3651cd6b03be6af65daab1ef2f4773ad9ad9e07e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 24 Jul 2025 17:44:32 +0800 Subject: [PATCH 098/156] Fix llama-perplexity --- ggml/src/ggml-openvino/ggml-decoder.cpp | 67 ++++++++++++------- .../openvino/translate_session.cpp | 53 +++++++-------- ggml/src/ggml-openvino/utils.cpp | 9 ++- 3 files changed, 71 insertions(+), 58 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3dc2a3eeac177..b43f45dbbdd2b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -236,8 +236,9 @@ void GgmlOvDecoder::set_llm_params() { } ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const { + auto name = std::string(src->name); ov::PartialShape input_shape; - if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { + if (name == "inp_tokens" || name == "inp_pos") { if (m_is_static) { if (m_is_first_token) { input_shape = ov::PartialShape{1, 1, m_context_size}; @@ -247,7 +248,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co } else { input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)}; } - } else if (std::string(src->name) == "KQ_mask") { + } else if (name == "inp_out_ids" && !m_is_static) { + input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)}; + } else if (name == "KQ_mask") { if (m_is_static) { if (m_is_first_token) { input_shape = ov::PartialShape{1, m_context_size, m_context_size}; @@ -258,9 +261,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD); input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)}; } - } else if (std::string(src->name).find("cache_k") == 0) { + } else if (name.find("cache_k") == 0) { input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; - } else if (std::string(src->name).find("cache_v") == 0) { + } else if (name.find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work @@ -273,18 +276,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: - // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, - // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. - // Not used for NPU + // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for + // llama-perplexity. + // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, + // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. + // Not used for NPU + int64_t past_token_len = -1; int64_t attention_size = -1; - int64_t past_token_len = -1; + int64_t token_len = -1; int64_t past_token_len_from_inp_pos = -1; for (const auto& node : m_nodes) { if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") { if (node->src[1]->type != GGML_TYPE_I32) { throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32"); } + token_len = node->src[1]->ne[0]; past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0]; } if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { @@ -294,29 +301,39 @@ void GgmlOvDecoder::add_extra_inputs() { break; } } + if (past_token_len == -1) { throw std::runtime_error("Failed to find input \"cache_k\" in the graph"); } if (past_token_len != past_token_len_from_inp_pos) { - throw std::runtime_error("Mismatch between past_token_len from cache_k and inp_pos: " + - std::to_string(past_token_len) + " vs " + std::to_string(past_token_len_from_inp_pos)); + GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n", + past_token_len, + past_token_len_from_inp_pos); } - for (const auto& node : m_nodes) { - if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) { - int64_t total_token_len = node->src[1]->ne[0] + past_token_len; - attention_size = GGML_PAD(total_token_len, 32); - std::string name = "attention_size"; - auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); - param_node->set_friendly_name(name); - param_node->output(0).get_tensor().set_names({name}); - m_model_extra_inputs[name] = param_node; - - auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); - *tensor->data() = attention_size; - m_model_extra_input_values[name] = tensor; - break; - } + { + std::string name = "past_token_len"; + auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); + param_node->set_friendly_name(name); + param_node->output(0).get_tensor().set_names({name}); + m_model_extra_inputs[name] = param_node; + + auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); + *tensor->data() = past_token_len; + m_model_extra_input_values[name] = tensor; + } + { + int64_t total_token_len = token_len + past_token_len; + attention_size = GGML_PAD(total_token_len, 32); + std::string name = "attention_size"; + auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); + param_node->set_friendly_name(name); + param_node->output(0).get_tensor().set_names({name}); + m_model_extra_inputs[name] = param_node; + + auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); + *tensor->data() = attention_size; + m_model_extra_input_values[name] = tensor; } } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 129c3592c903c..83581ec5a84ad 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -78,11 +79,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode // cache_k layout: [S, N, H] (seq, num_heads, head_size) // cache_v layout: [N, H, S] (num_heads, head_size, seq) // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened - auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr(); auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - std::shared_ptr update_indices_k; - std::shared_ptr update_indices_v; + Output update_indices_k; + Output update_indices_v; auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); @@ -90,11 +91,19 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - update_indices_k = - std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - update_indices_k = std::make_shared(update_indices_k, one); - update_indices_k->set_friendly_name("update_indices_k"); - tensor_map.insert({"update_indices_k", update_indices_k->output(0)}); + auto past_token_len_scalar = std::make_shared(past_token_len, zero); + auto token_len_scalar = std::make_shared(token_len, zero); + auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); + + Output update_indices = std::make_shared( + past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64); + if (ggml_model_decoder.is_static()) { + update_indices = past_token_len; + } + + update_indices_k = std::make_shared(update_indices, one); + update_indices_k.get_node_shared_ptr()->set_friendly_name("update_indices_k"); + tensor_map.insert({"update_indices_k", update_indices_k}); auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size(); auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); @@ -102,7 +111,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode // 1D tensor of shape [total_head_size], values starting from 0 auto range_row = - std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i32); + std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64); auto range_row_reshaped = std::make_shared(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2})); auto row_indices = std::make_shared( @@ -110,8 +119,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); // 1D tensor of shape [token_len], values starting from past_token_len - auto range_col = - std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + auto range_col = update_indices; auto range_col_reshaped = std::make_shared(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); auto col_indices = std::make_shared( @@ -119,26 +127,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2] - auto indices = std::make_shared(OutputVector{row_indices, col_indices}, 2); + update_indices_v = std::make_shared(OutputVector{row_indices, col_indices}, 2); update_indices_v = std::make_shared( - indices, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), false); - update_indices_v->set_friendly_name("update_indices_v"); - tensor_map.insert({"update_indices_v", update_indices_v->output(0)}); -} - -float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { -#ifndef M_PI -# define M_PI 3.14159265358979323846 -#endif - return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); -} - -void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, - float dims[2]) { - float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); - float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); - dims[0] = std::max(0.0f, start); - dims[1] = std::min(static_cast(n_dims - 1), end); + update_indices_v, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), false); + update_indices_v.get_node_shared_ptr()->set_friendly_name("update_indices_v"); + tensor_map.insert({"update_indices_v", update_indices_v}); } void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index be06c54e8b938..45ed73499f451 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -356,10 +356,13 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) std::cout << *(tensor.data()) << std::endl; break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(tensor.data())) << std::endl; + std::cout << *(tensor.data()) << std::endl; break; case ov::element::i32: - std::cout << *(tensor.data()) << std::endl; + for (size_t i = 0; i < tensor.get_size(); ++i) { + std::cout << tensor.data()[i] << " "; + } + std::cout << std::endl; break; case ov::element::i64: std::cout << *(tensor.data()) << std::endl; @@ -379,7 +382,7 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(tensor.data())) << std::endl; + std::cout << *(tensor.data()) << std::endl; std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; break; default: From 33bad119e1e6b39bbc1af11f281566d64ebfb34e Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Mon, 28 Jul 2025 17:14:20 -0700 Subject: [PATCH 099/156] temp. changes for mark decomp --- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 12 +++++++++++- .../src/ggml-openvino/openvino/translate_session.cpp | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 52d1e575dbd65..aa230550a42bd 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -28,7 +28,17 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output res; ov::Output B = context.get_input(0); - ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); + ov::Output A = context.get_input(1); + if (context.get_op_case() == 1) { + if (context.get_input_type(0) == ov::element::f16) { + B = std::make_shared(context.get_input(0), ov::element::f32); + } + if (context.get_input_type(1) == ov::element::f16) { + A = std::make_shared(context.get_input(1), ov::element::f32); + } + } else { + A = std::make_shared(context.get_input(1), context.get_input_type(0)); + } auto B_shape = context.get_input_shape(0).to_shape(); auto A_shape = context.get_input_shape(1).to_shape(); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 83581ec5a84ad..563613aa7f56d 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "ggml-openvino/openvino/node_context.hpp" #include "ggml-openvino/openvino/utils.hpp" @@ -258,6 +259,7 @@ void TranslateSession::apply_transformations(const std::shared_ptr& model ov::pass::Manager manager; manager.set_per_pass_validation(true); + manager.register_pass(); manager.register_pass(); if (!ggml_model_decoder->is_static()) { From eee61b422200cba78eb0d4317db60d2a992dc3c4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 29 Jul 2025 14:07:03 +0800 Subject: [PATCH 100/156] matmul in fp32 --- ggml/src/ggml-openvino/ggml-decoder.cpp | 1 + ggml/src/ggml-openvino/ggml-decoder.h | 2 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 14 ++------- .../ggml-openvino/openvino/op/soft_max.cpp | 7 ++--- .../openvino/pass/fuse_to_sdpa.cpp | 11 +++---- .../openvino/translate_session.cpp | 31 ++++++++++--------- .../openvino/translate_session.hpp | 2 +- 7 files changed, 29 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b43f45dbbdd2b..f7846382b9734 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -212,6 +212,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } else { m_op_case = 1; } + break; } default: break; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 78422afaf7d2c..c1970af53aee6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -139,7 +139,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::vector m_output_names; std::string m_op_name; mutable std::string m_name; - int m_op_case; + int m_op_case = 0; std::vector> m_op_node_name; std::map> m_model_inputs; std::map> m_model_extra_inputs; diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index aa230550a42bd..57fd476f0abaa 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -29,15 +29,8 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output res; ov::Output B = context.get_input(0); ov::Output A = context.get_input(1); - if (context.get_op_case() == 1) { - if (context.get_input_type(0) == ov::element::f16) { - B = std::make_shared(context.get_input(0), ov::element::f32); - } - if (context.get_input_type(1) == ov::element::f16) { - A = std::make_shared(context.get_input(1), ov::element::f32); - } - } else { - A = std::make_shared(context.get_input(1), context.get_input_type(0)); + if (context.get_input_type(0) != context.get_input_type(1)) { + B = std::make_shared(context.get_input(0), context.get_input_type(1)); } auto B_shape = context.get_input_shape(0).to_shape(); @@ -72,8 +65,7 @@ OutputVector translate_mulmat(const NodeContext& context) { A = Z; } - auto result_lp = std::make_shared(A, B, false, true); - res = std::make_shared(result_lp, context.get_output_type(0)); + res = std::make_shared(A, B, false, true); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 001a62be8b5e2..401acaf86530d 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -57,11 +57,8 @@ OutputVector translate_soft_max(const NodeContext& context) { // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul // can be fused into SDPA. - if (input_node->get_type_info() == ov::op::v0::Convert::get_type_info_static()) { - auto qk = input_node->get_input_node_shared_ptr(0); - if (qk->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { - token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); - } + if (input_node->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { + token_len = get_dimensions(input_node->get_input_node_shared_ptr(0), {1}); } auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index 1b7ac602716ad..aa6e28b627fff 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -22,15 +23,13 @@ FuseToSDPA::FuseToSDPA() { const auto m_k = ov::pass::pattern::any_input(); const auto m_q = ov::pass::pattern::any_input(); const auto m_qk = ov::pass::pattern::wrap_type({m_q, m_k}); - const auto m_qk_f32 = ov::pass::pattern::wrap_type({m_qk}); const auto m_scale = ov::pass::pattern::any_input(); - const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk_f32, m_scale}); + const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk, m_scale}); const auto m_mask = ov::pass::pattern::any_input(); const auto m_masked_qk = ov::pass::pattern::wrap_type({m_scaled_qk, m_mask}); const auto m_softmax_qk = ov::pass::pattern::wrap_type({m_masked_qk}); - const auto m_softmax_qk_f16 = ov::pass::pattern::wrap_type({m_softmax_qk}); const auto m_v = ov::pass::pattern::any_input(); - const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk_f16, m_v}); + const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk, m_v}); const auto callback = [=](ov::pass::pattern::Matcher& m) { auto& pattern_to_output = m.get_pattern_value_map(); @@ -42,9 +41,7 @@ FuseToSDPA::FuseToSDPA() { auto v_trans = register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - auto mask_f16 = register_new_node(mask, ov::element::f16); - auto scale_f16 = register_new_node(scale, ov::element::f16); - auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); + auto sdpa = std::make_shared(q, k, v_trans, mask, scale, false); ov::replace_node(m.get_match_root(), sdpa); ov::copy_runtime_info(m.get_matched_nodes(), sdpa); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 563613aa7f56d..c4fe8c88ee22a 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include "ggml-openvino/openvino/node_context.hpp" #include "ggml-openvino/openvino/utils.hpp" @@ -254,22 +254,25 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo return resulting_model; } -void TranslateSession::apply_transformations(const std::shared_ptr& model) { +std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr model) { auto ggml_model_decoder = std::dynamic_pointer_cast(m_input_model)->get_model_decoder(); + { + ov::pass::Manager manager; + manager.set_per_pass_validation(true); + + if (!ggml_model_decoder->is_static()) { + const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); + const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); + manager.register_pass(kv_param_res_pairs); + } - ov::pass::Manager manager; - manager.set_per_pass_validation(true); - manager.register_pass(); - manager.register_pass(); - - if (!ggml_model_decoder->is_static()) { - const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); - const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); - manager.register_pass(kv_param_res_pairs); + // SDPA is even worse on performance + // manager.register_pass(); + manager.run_passes(model); } - - manager.register_pass(); - manager.run_passes(model); + auto preprocessor = ov::preprocess::PrePostProcessor(model); + model = preprocessor.build(); + return model; } } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp index 9eea5fd11cb01..7072d4a9e8b1a 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.hpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -16,7 +16,7 @@ class TranslateSession { std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); private: - void apply_transformations(const std::shared_ptr& model); + std::shared_ptr apply_transformations(std::shared_ptr model); const frontend::InputModel::Ptr m_input_model; const std::unordered_map& m_translator_map; std::shared_ptr m_ov_model; From 881ae9b648e6044c2719b854014c8009249f561a Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Tue, 29 Jul 2025 17:55:15 -0700 Subject: [PATCH 101/156] mulmat input conversion fix --- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 57fd476f0abaa..6905777a09a0f 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "../node_context.hpp" @@ -29,8 +30,10 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output res; ov::Output B = context.get_input(0); ov::Output A = context.get_input(1); - if (context.get_input_type(0) != context.get_input_type(1)) { + if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) { B = std::make_shared(context.get_input(0), context.get_input_type(1)); + } else if (context.get_input_type(0) != context.get_input_type(1)) { + A = std::make_shared(context.get_input(1), context.get_input_type(0)); } auto B_shape = context.get_input_shape(0).to_shape(); From bd93697e24a6369625b782a64fad9f4cbd4c9ab4 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Tue, 29 Jul 2025 18:17:14 -0700 Subject: [PATCH 102/156] mulmat type conversion update --- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 6905777a09a0f..9148a27517b92 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -30,10 +30,13 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output res; ov::Output B = context.get_input(0); ov::Output A = context.get_input(1); + + bool convert_out_type = false; if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) { B = std::make_shared(context.get_input(0), context.get_input_type(1)); } else if (context.get_input_type(0) != context.get_input_type(1)) { A = std::make_shared(context.get_input(1), context.get_input_type(0)); + convert_out_type = true; } auto B_shape = context.get_input_shape(0).to_shape(); @@ -68,7 +71,12 @@ OutputVector translate_mulmat(const NodeContext& context) { A = Z; } - res = std::make_shared(A, B, false, true); + if (convert_out_type) { + auto result_lp = std::make_shared(A, B, false, true); + res = std::make_shared(result_lp, context.get_output_type(0)); + } else { + res = std::make_shared(A, B, false, true); + } return rename_outputs_with_suffix({res}, context.get_name()); } From aa7621dcb955043411bc76ab6ab7dd46b79a73b4 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Tue, 29 Jul 2025 21:37:57 -0700 Subject: [PATCH 103/156] add mark decomp pass --- ...decompression_convert_constant_folding.hpp | 29 +++++++++++++++++++ .../openvino/translate_session.cpp | 5 +++- 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp diff --git a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp new file mode 100644 index 0000000000000..163422bf339f7 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "mark_decompression_convert_constant_folding.hpp" +#include "openvino/pass/matcher_pass.hpp" +#include "openvino/core/visibility.hpp" + +#ifdef OPENVINO_STATIC_LIBRARY +# define TRANSFORMATIONS_API +#else +# ifdef IMPLEMENT_OPENVINO_API +# define TRANSFORMATIONS_API OPENVINO_CORE_EXPORTS +# else +# define TRANSFORMATIONS_API OPENVINO_CORE_IMPORTS +# endif // IMPLEMENT_OPENVINO_API +#endif // OPENVINO_STATIC_LIBRARY + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API MarkCompressedFloatConstants; + +} // namespace pass +} // namespace ov + +class ov::pass::MarkCompressedFloatConstants : public MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants"); + MarkCompressedFloatConstants(); +}; diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index c4fe8c88ee22a..ed7db614148fa 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -28,6 +28,7 @@ #include "ggml-openvino/openvino/utils.hpp" #include "input_model.hpp" #include "pass/fuse_to_sdpa.hpp" +#include "pass/mark_decompression_convert_constant_folding.hpp" namespace ov { namespace frontend { @@ -259,6 +260,8 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); + manager.register_pass(); if (!ggml_model_decoder->is_static()) { const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); @@ -267,7 +270,7 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); + manager.register_pass(); manager.run_passes(model); } auto preprocessor = ov::preprocess::PrePostProcessor(model); From 15a91a08453421838fbb7bc295c35be9af4dfe33 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 30 Jul 2025 22:55:41 +0800 Subject: [PATCH 104/156] Revert changes in fuse_to_sdpa --- ggml/src/ggml-openvino/openvino/op/soft_max.cpp | 8 +------- ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp | 11 +++++++---- ggml/src/ggml-openvino/openvino/translate_session.cpp | 4 ---- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 401acaf86530d..046cb93c8bac5 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -53,13 +53,7 @@ OutputVector translate_soft_max(const NodeContext& context) { auto mask_node = context.get_input(1); - std::shared_ptr token_len = get_dimensions(input_node, {1}); - // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX - // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul - // can be fused into SDPA. - if (input_node->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { - token_len = get_dimensions(input_node->get_input_node_shared_ptr(0), {1}); - } + auto token_len = context.get_input("token_len"); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); std::shared_ptr mask_node_sliced = diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index aa6e28b627fff..1b7ac602716ad 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -23,13 +22,15 @@ FuseToSDPA::FuseToSDPA() { const auto m_k = ov::pass::pattern::any_input(); const auto m_q = ov::pass::pattern::any_input(); const auto m_qk = ov::pass::pattern::wrap_type({m_q, m_k}); + const auto m_qk_f32 = ov::pass::pattern::wrap_type({m_qk}); const auto m_scale = ov::pass::pattern::any_input(); - const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk, m_scale}); + const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk_f32, m_scale}); const auto m_mask = ov::pass::pattern::any_input(); const auto m_masked_qk = ov::pass::pattern::wrap_type({m_scaled_qk, m_mask}); const auto m_softmax_qk = ov::pass::pattern::wrap_type({m_masked_qk}); + const auto m_softmax_qk_f16 = ov::pass::pattern::wrap_type({m_softmax_qk}); const auto m_v = ov::pass::pattern::any_input(); - const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk, m_v}); + const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk_f16, m_v}); const auto callback = [=](ov::pass::pattern::Matcher& m) { auto& pattern_to_output = m.get_pattern_value_map(); @@ -41,7 +42,9 @@ FuseToSDPA::FuseToSDPA() { auto v_trans = register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - auto sdpa = std::make_shared(q, k, v_trans, mask, scale, false); + auto mask_f16 = register_new_node(mask, ov::element::f16); + auto scale_f16 = register_new_node(scale, ov::element::f16); + auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); ov::replace_node(m.get_match_root(), sdpa); ov::copy_runtime_info(m.get_matched_nodes(), sdpa); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index ed7db614148fa..daef12fb90535 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include "ggml-openvino/openvino/node_context.hpp" #include "ggml-openvino/openvino/utils.hpp" @@ -269,12 +268,9 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(kv_param_res_pairs); } - // SDPA is even worse on performance manager.register_pass(); manager.run_passes(model); } - auto preprocessor = ov::preprocess::PrePostProcessor(model); - model = preprocessor.build(); return model; } From e15133b5cd94b711a679dd10094c176b5bf4bd07 Mon Sep 17 00:00:00 2001 From: Ravi Panchumarthy Date: Wed, 30 Jul 2025 19:34:10 -0700 Subject: [PATCH 105/156] Update build.md --- docs/build.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/build.md b/docs/build.md index 2112ae0f9e8c2..b5ef7c27ae2e4 100644 --- a/docs/build.md +++ b/docs/build.md @@ -601,7 +601,7 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi - Linux or Windows system with Intel hardware (CPU, GPU, or NPU) - **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html). -- Git, CMake, and Ninja software tools are needed for building +- Git, CMake, and Ninja software tools are needed for building. ```bash sudo apt-get update sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar @@ -609,10 +609,10 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi ### 1. Install OpenVINO Runtime -- Follow the guide to install OpenVINO Runtime from an archive file: **[Install OpenVINO™ Runtime on Linux from an Archive File.](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html)** +- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)
-📦 Click to expand OpenVINO 2025.2 installation commands +📦 Click to expand OpenVINO 2025.2 installation commands on Linux
```bash @@ -686,7 +686,6 @@ export GGML_OPENVINO_DEVICE=GPU To run in chat mode: ```bash export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache - ./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` @@ -712,6 +711,7 @@ export GGML_OPENVINO_PROFILING=1 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` +> **Note:** To apply your code changes, clear the `GGML_OPENVINO_CACHE_DIR` directory and rebuild the project. ### Using Llama.cpp's Built-in CPU Backend (for Comparison) From 6fca726da79d0c3b79b0d0b880befa82b0591d47 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 31 Jul 2025 16:22:21 +0800 Subject: [PATCH 106/156] Fix test-backend-ops --- ggml/src/ggml-openvino/ggml-decoder.cpp | 3 +++ ggml/src/ggml-openvino/ggml-openvino.cpp | 13 +++++++++++++ ggml/src/ggml-openvino/openvino/op/soft_max.cpp | 2 +- .../mark_decompression_convert_constant_folding.hpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 11 +++++++---- ggml/src/ggml-openvino/utils.h | 2 ++ 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index f7846382b9734..2f7ae333e7f56 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -76,6 +76,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { m_cgraph = cgraph; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto* cur_node = cgraph->nodes[node_n]; + if (cur_node->op == GGML_OP_NONE) { + continue; + } m_nodes.push_back(cur_node); set_input_output(cur_node, true); } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 7edd4667d9b56..8c700445b2548 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -258,12 +258,25 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { } } + if (op->op == GGML_OP_PERMUTE) { + if (op->type == GGML_TYPE_BF16) { + // err msg: [GPU] Could not find a suitable kernel for transpose + GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n"); + return true; + } + } + if (op->op == GGML_OP_MUL_MAT) { if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) || (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) { GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n"); return true; } + if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { + // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` + GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); + return true; + } } if (op->op == GGML_OP_ROPE) { diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 046cb93c8bac5..e072658ecb156 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -53,7 +53,7 @@ OutputVector translate_soft_max(const NodeContext& context) { auto mask_node = context.get_input(1); - auto token_len = context.get_input("token_len"); + auto token_len = context.has_input("token_len") ? context.get_input("token_len") : get_dimensions(input_node, {1}); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); std::shared_ptr mask_node_sliced = diff --git a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp index 163422bf339f7..b40eaf4205703 100644 --- a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp +++ b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp @@ -24,6 +24,6 @@ class TRANSFORMATIONS_API MarkCompressedFloatConstants; class ov::pass::MarkCompressedFloatConstants : public MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants"); + OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants") MarkCompressedFloatConstants(); }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 45ed73499f451..a64637f9501a9 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -81,7 +81,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c config = get_npu_config(); } - if (cgraph->n_nodes == 1) { + if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); } @@ -250,11 +250,16 @@ ov::AnyMap get_npu_config() { return config; } +bool is_naive(struct ggml_cgraph* cgraph) { + constexpr int naive_graph_size_threshold = 20; + return cgraph->n_nodes < naive_graph_size_threshold; +} + enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, const ov::AnyMap& config) { - if (cgraph->nodes[0]->op == GGML_OP_NONE) { + if (cgraph->n_nodes == 1 && cgraph->nodes[0]->op == GGML_OP_NONE) { return GGML_STATUS_SUCCESS; } @@ -264,8 +269,6 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph, auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); auto infer_request = core.compile_model(model, device, config).create_infer_request(); - ov::serialize(model, "IR.xml"); - auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 367b2829bec3b..0d71963f53aca 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -44,5 +44,7 @@ ov::AnyMap get_npu_config(); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); +bool is_naive(struct ggml_cgraph* cgraph); + enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, const ov::AnyMap& config); From 39d2a38a6dcdadaef9a7674d23fb2a86503a2976 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 31 Jul 2025 16:50:58 +0800 Subject: [PATCH 107/156] Skip test-thread-safety; Run ctest only in ci/run.sh --- ci/run.sh | 2 +- tests/CMakeLists.txt | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index a00cbfdbdcc04..a06cf22fffc51 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -26,7 +26,7 @@ # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # # # with OPENVINO support -# GG_BUILD_OPENVINO=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt # if [ -z "$2" ]; then diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d9cc5e933f4ce..3174a5bbc3ded 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -190,6 +190,9 @@ if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") else() llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) endif() +if (NOT GGML_OPENVINO) + llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) +endif() # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135) if (NOT WIN32) From bf165eab95f225a9d201dfc0839cf9d6c69a5124 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 1 Aug 2025 11:46:52 +0800 Subject: [PATCH 108/156] Use CiD for NPU --- ggml/src/ggml-openvino/utils.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a64637f9501a9..cf0fc4dfd39cf 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -235,17 +235,15 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::AnyMap get_npu_config() { ov::AnyMap config = { - { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, - { "NPU_USE_NPUW", "YES" }, - { "NPUW_DEVICES", "NPU" }, - { "NPUW_FOLD", "YES" }, - { "NPUW_HOST_GATHER", "YES" }, - { "NPUW_DQ", "YES" }, - { "NPUW_FUNCALL_ASYNC", "YES" }, - { "NPUW_WEIGHTS_BANK", "shared" }, - // Option 'CACHE_DIR' is not supported with MLIR compiler type - // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, - { "NPU_COMPILER_TYPE", "MLIR" }, + {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, + {"NPU_USE_NPUW", "YES" }, + {"NPUW_DEVICES", "NPU" }, + {"NPUW_FOLD", "YES" }, + {"NPUW_HOST_GATHER", "YES" }, + {"NPUW_DQ", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, + {"NPUW_WEIGHTS_BANK", "shared" }, + {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, }; return config; } From 45d6719ab4933da28cdb9cf743e064f755a66ce7 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 4 Aug 2025 17:20:06 +0800 Subject: [PATCH 109/156] Optimize tensor conversion, improve TTFT --- ggml/src/ggml-openvino/ggml-decoder.cpp | 75 ++++++------------------- 1 file changed, 17 insertions(+), 58 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2f7ae333e7f56..eb0cdcb28d717 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "ggml-backend-impl.h" @@ -391,53 +392,12 @@ std::map> GgmlOvDecoder::create_weight_no } std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { - std::shared_ptr weight_node; auto node_type = get_ov_type(tensor); auto node_shape = get_shape(tensor); auto ne_total = ggml_nelements(tensor); - switch (tensor->type) { - case GGML_TYPE_I32: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data(ptr, ptr + ne_total); - weight_node = std::make_shared(node_type, node_shape, data); - break; - } - case GGML_TYPE_I64: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data(ptr, ptr + ne_total); - weight_node = std::make_shared(node_type, node_shape, data); - break; - } - case GGML_TYPE_F32: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data(ptr, ptr + ne_total); - weight_node = std::make_shared(node_type, node_shape, data); - break; - } - case GGML_TYPE_F16: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data_f16; - data_f16.reserve(ne_total); - for (int i = 0; i < ne_total; ++i) { - data_f16.push_back(ov::float16::from_bits(ptr[i])); - } - weight_node = std::make_shared(node_type, node_shape, data_f16); - break; - } - case GGML_TYPE_BF16: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data_bf16; - data_bf16.reserve(ne_total); - for (int i = 0; i < ne_total; ++i) { - data_bf16.push_back(ov::bfloat16::from_bits(ptr[i])); - } - weight_node = std::make_shared(node_type, node_shape, data_bf16); - break; - } - default: - throw std::invalid_argument("Unsupported tensor type"); - } - return weight_node; + ov::Tensor weights(node_type, node_shape); + memcpy(weights.data(), tensor->data, ne_total * node_type.size()); + return std::make_shared(weights); } void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) { @@ -549,27 +509,26 @@ std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { } ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { - ov::element::Type type = ov::element::dynamic; switch (tensor->type) { + case GGML_TYPE_F64: + return ov::element::f64; case GGML_TYPE_F32: - type = ov::element::f32; - break; + return ov::element::f32; case GGML_TYPE_F16: - type = ov::element::f16; - break; + return ov::element::f16; case GGML_TYPE_BF16: - type = ov::element::bf16; - break; - case GGML_TYPE_I64: - type = ov::element::i64; - break; + return ov::element::bf16; + case GGML_TYPE_I8: + return ov::element::i8; + case GGML_TYPE_I16: + return ov::element::i16; case GGML_TYPE_I32: - type = ov::element::i32; - break; + return ov::element::i32; + case GGML_TYPE_I64: + return ov::element::i64; default: - break; + throw std::runtime_error("Unsupported tensor type"); } - return type; } ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { From a663c2158603598cb56bc690ac6e69f60bb0ecb7 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 13 Aug 2025 10:57:22 +0800 Subject: [PATCH 110/156] Support op SET_ROWS --- ggml/src/ggml-openvino/ggml-decoder.cpp | 33 ++++++++++-- ggml/src/ggml-openvino/ggml-decoder.h | 3 ++ ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +- .../ggml-openvino/openvino/node_context.hpp | 2 + .../src/ggml-openvino/openvino/op/reshape.cpp | 7 ++- .../ggml-openvino/openvino/op/set_rows.cpp | 51 +++++++++++++++++++ ggml/src/ggml-openvino/openvino/op_table.cpp | 1 + ggml/src/ggml-openvino/openvino/op_table.hpp | 1 + 8 files changed, 93 insertions(+), 7 deletions(-) create mode 100644 ggml/src/ggml-openvino/openvino/op/set_rows.cpp diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index eb0cdcb28d717..c952fb8eaf057 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -90,7 +90,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { // 3. constructing a decoder for the whole graph naively (op test case) void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { std::string node_name; - if (node->op == GGML_OP_CPY) { + if (node->op == GGML_OP_CPY || node->op == GGML_OP_SET_ROWS) { // CPY updates the input tensor in place. For later ov op that uses the // input tensor of CPY, we need to make sure they get the updated tensor // by putting the src tensor name in the tensor_map in @@ -151,9 +151,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { assert(name.find("cache_k") == 0 || name.find("cache_v") == 0); } - auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); - if (it == m_model_output_names.end()) { + if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); + it == m_model_output_names.end()) { m_model_output_names.push_back(name); + } + if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), name); it == m_kv_names.end()) { m_kv_names.push_back(name); } } @@ -166,6 +168,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { m_op_case = 1; } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) { m_op_case = 2; + } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[1]) { + m_op_case = 3; } break; } @@ -270,6 +274,8 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (name.find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; + } else if (get_tensor_used_op(src)->op == GGML_OP_SET_ROWS) { + input_shape = ov::PartialShape{1, 1, -1}; } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work input_shape = ov::PartialShape{get_shape(src->view_src)}; @@ -283,6 +289,8 @@ void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for // llama-perplexity. + // Update: SET_ROWS replaces CPY for updating kv cache. The indices creation is not needed anymore. See: + // https://github.com/ggml-org/llama.cpp/pull/14285 // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. // Not used for NPU @@ -305,6 +313,10 @@ void GgmlOvDecoder::add_extra_inputs() { (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / m_head_size / m_num_heads_kv); break; } + if (node->op == GGML_OP_SET_ROWS && std::string(node->name).find("cache_k") == 0) { + assert(node->src[1]->type == GGML_TYPE_I64); + past_token_len = *(int64_t*) (node->src[1]->data); + } } if (past_token_len == -1) { @@ -342,6 +354,18 @@ void GgmlOvDecoder::add_extra_inputs() { } } +const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + const auto* node = m_cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] == tensor) { + return node; + } + } + } + throw std::runtime_error("Tensor not found in cgraph"); +} + std::map GgmlOvDecoder::get_kv_param_res_names() const { std::map kv_param_res_names; for (const auto& name : m_kv_names) { @@ -618,7 +642,8 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, {GGML_OP_SUB, "GGML_OP_SUB" }, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, - {GGML_OP_VIEW, "GGML_OP_VIEW" } + {GGML_OP_VIEW, "GGML_OP_VIEW" }, + {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" }, }; static const std::map unary_ops = { {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" }, diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index c1970af53aee6..f6a4f7416397f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -117,6 +117,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static std::shared_ptr create_weight_node(ggml_tensor* tensor); static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); + + const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; + void clear_model_weights() { m_model_weights.clear(); } private: diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 8c700445b2548..14999ba66b734 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -331,7 +331,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, - GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX}; + GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS}; static const std::set supported_unary_ops{ GGML_UNARY_OP_SILU, }; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index ceba64227523b..cc1b5c03329c9 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -46,6 +46,8 @@ class NodeContext : public frontend::NodeContext { return m_decoder->get_input_stride(m_input_names[index]); } + std::string get_output_name() const { return m_output_names[0]; } + PartialShape get_output_shape(size_t index) const { return m_decoder->get_output_shape(m_output_names[index]); } diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 3a695683bfafb..4ef3833c90252 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -23,7 +23,7 @@ OutputVector translate_reshape(const NodeContext& context) { } int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported RESHAPE case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported RESHAPE case"); auto output_shape = context.get_output_shape(0).to_shape(); std::shared_ptr new_shape_node; @@ -32,11 +32,14 @@ OutputVector translate_reshape(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); - } else { + } else if (op_case == 2) { new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); + } else { + new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, 1}); } auto res = std::make_shared(context.get_input(0), new_shape_node, false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp new file mode 100644 index 0000000000000..b6caa372b8e98 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_set_rows(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto data = context.get_input(0); + auto indices = context.get_input(1); + auto dst = context.get_input(context.get_output_name()); + auto dst_shape = context.get_output_shape(0).to_shape(); + FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); + + auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); + + auto dst_reshaped = std::make_shared( + dst, + ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), + false); + auto indices_reshaped = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + auto data_converted = std::make_shared(data, context.get_output_type(0)); + auto data_reshaped = std::make_shared(data_converted, zero); + auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); + auto res = std::make_shared(updated, std::make_shared(dst), false); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index a99450ea95643..744f355a5446f 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -35,6 +35,7 @@ std::unordered_map get_supported_ops() { {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, {"GGML_OP_VIEW", op::translate_view }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, + {"GGML_OP_SET_ROWS", op::translate_set_rows }, }; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index 9b141d6d20149..631812aaa3c99 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -26,6 +26,7 @@ GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_glu_swiglu); +GGML_OP_CONVERTER(translate_set_rows); } // namespace op From d1f23976c791f92a3fc091cad8654aa4a8a3cb80 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 14 Aug 2025 15:40:36 +0800 Subject: [PATCH 111/156] Fix NPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 37 ++++++++++++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 1 + .../ggml-openvino/openvino/op/set_rows.cpp | 30 ++++++++++++--- ggml/src/ggml-openvino/utils.cpp | 3 ++ 4 files changed, 65 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c952fb8eaf057..472dd157ef131 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -193,6 +193,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } break; } + case GGML_OP_SET_ROWS: { + if (std::string(node->name).find("cache_k") == 0) { + m_op_case = 1; + } else { + m_op_case = 2; + } + break; + } case GGML_OP_PERMUTE: { if (node->src[0]->view_src == nullptr) { // Permute Qcur @@ -274,8 +282,18 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (name.find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; - } else if (get_tensor_used_op(src)->op == GGML_OP_SET_ROWS) { + } else if (const auto* op = get_tensor_used_op(src); op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, -1}; + if (m_is_static) { + if (m_is_first_token) { + // Dummy static shape, since the indices are not used in this case + input_shape = ov::PartialShape{1}; + } else if (std::string(op->name).find("cache_k") == 0) { + input_shape = ov::PartialShape{1, 1, 1}; + } else { + input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size}; + } + } } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work input_shape = ov::PartialShape{get_shape(src->view_src)}; @@ -316,6 +334,7 @@ void GgmlOvDecoder::add_extra_inputs() { if (node->op == GGML_OP_SET_ROWS && std::string(node->name).find("cache_k") == 0) { assert(node->src[1]->type == GGML_TYPE_I64); past_token_len = *(int64_t*) (node->src[1]->data); + break; } } @@ -366,6 +385,22 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) throw std::runtime_error("Tensor not found in cgraph"); } +const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + const auto* node = m_cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + const auto* src = node->src[j]; + if (src == nullptr) { + break; + } + if (std::string(src->name) == name) { + return src; + } + } + } + return nullptr; +} + std::map GgmlOvDecoder::get_kv_param_res_names() const { std::map kv_param_res_names; for (const auto& name : m_kv_names) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f6a4f7416397f..ae378273d32e0 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -119,6 +119,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; + const ggml_tensor* get_tensor_from_name(const std::string& name) const; void clear_model_weights() { m_model_weights.clear(); } diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index b6caa372b8e98..758454cd9d72a 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -25,21 +26,40 @@ OutputVector translate_set_rows(const NodeContext& context) { num_inputs_check(context, 2, 2); auto data = context.get_input(0); - auto indices = context.get_input(1); - auto dst = context.get_input(context.get_output_name()); + data = std::make_shared(data, context.get_output_type(0)); + auto dst_shape = context.get_output_shape(0).to_shape(); FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); - auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); + if (context.is_static() && context.is_first_token()) { + Output res; + if (context.get_op_case() == 2) { + res = std::make_shared( + data, + ov::op::v0::Constant::create( + ov::element::i64, + {3}, + {context.get_context_size(), context.get_num_heads_kv(), context.get_head_size()}), + false); + res = std::make_shared( + res, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 2, 0})); + } else { + res = data; + } + return rename_outputs_with_suffix({res}, context.get_name()); + } + auto indices = context.get_input(1); + auto dst = context.get_input(context.get_output_name()); + + auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); auto dst_reshaped = std::make_shared( dst, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), false); auto indices_reshaped = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - auto data_converted = std::make_shared(data, context.get_output_type(0)); - auto data_reshaped = std::make_shared(data_converted, zero); + auto data_reshaped = std::make_shared(data, zero); auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); auto res = std::make_shared(updated, std::make_shared(dst), false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index cf0fc4dfd39cf..83ab7353a9f8e 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -328,6 +328,9 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons std::copy(padded_data.begin(), padded_data.end(), data_ptr); } + } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); + op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { + input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1}); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } From 53e33789d411b587d550553674d220c2ab355178 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 14 Aug 2025 16:00:38 +0800 Subject: [PATCH 112/156] Remove CPY --- ggml/src/ggml-openvino/ggml-decoder.cpp | 71 +++--------------- ggml/src/ggml-openvino/ggml-openvino.cpp | 19 ++++- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 73 ------------------- ggml/src/ggml-openvino/openvino/op_table.cpp | 1 - ggml/src/ggml-openvino/openvino/op_table.hpp | 1 - .../openvino/translate_session.cpp | 60 --------------- 6 files changed, 25 insertions(+), 200 deletions(-) delete mode 100644 ggml/src/ggml-openvino/openvino/op/cpy.cpp diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 472dd157ef131..38c7122f4c4d7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -90,10 +90,10 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { // 3. constructing a decoder for the whole graph naively (op test case) void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { std::string node_name; - if (node->op == GGML_OP_CPY || node->op == GGML_OP_SET_ROWS) { - // CPY updates the input tensor in place. For later ov op that uses the - // input tensor of CPY, we need to make sure they get the updated tensor - // by putting the src tensor name in the tensor_map in + if (node->op == GGML_OP_SET_ROWS) { + // SET_ROWS updates the tensor in place. For later ov op that uses the + // the view_src of SET_ROWS, we need to make sure they get the updated tensor + // by putting the view_src name in the tensor_map in // /src/frontends/ggml/src/translate_session.cpp node_name = std::string(node->view_src->name); } else { @@ -183,16 +183,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } break; } - case GGML_OP_CPY: { - if (std::string(node->src[1]->name).find("cache_k") == 0) { - // Write K to cache_k - m_op_case = 1; - } else { - // Write V to cache_v - m_op_case = 2; - } - break; - } case GGML_OP_SET_ROWS: { if (std::string(node->name).find("cache_k") == 0) { m_op_case = 1; @@ -305,62 +295,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: - // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for - // llama-perplexity. - // Update: SET_ROWS replaces CPY for updating kv cache. The indices creation is not needed anymore. See: - // https://github.com/ggml-org/llama.cpp/pull/14285 - // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, + // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. // Not used for NPU - int64_t past_token_len = -1; int64_t attention_size = -1; - - int64_t token_len = -1; - int64_t past_token_len_from_inp_pos = -1; for (const auto& node : m_nodes) { - if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") { - if (node->src[1]->type != GGML_TYPE_I32) { - throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32"); + if (node->op == GGML_OP_SOFT_MAX) { + auto* mask = node->src[1]; + if (std::string(mask->name).find("KQ_mask") != 0) { + throw std::runtime_error("Unexpected softmax node: " + std::string(mask->name)); } - token_len = node->src[1]->ne[0]; - past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0]; - } - if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { - assert(std::string(node->view_src->name).find("cache_k") == 0); - past_token_len = - (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / m_head_size / m_num_heads_kv); + attention_size = mask->ne[0]; break; } - if (node->op == GGML_OP_SET_ROWS && std::string(node->name).find("cache_k") == 0) { - assert(node->src[1]->type == GGML_TYPE_I64); - past_token_len = *(int64_t*) (node->src[1]->data); - break; - } - } - - if (past_token_len == -1) { - throw std::runtime_error("Failed to find input \"cache_k\" in the graph"); - } - if (past_token_len != past_token_len_from_inp_pos) { - GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n", - past_token_len, - past_token_len_from_inp_pos); } { - std::string name = "past_token_len"; - auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); - param_node->set_friendly_name(name); - param_node->output(0).get_tensor().set_names({name}); - m_model_extra_inputs[name] = param_node; - - auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); - *tensor->data() = past_token_len; - m_model_extra_input_values[name] = tensor; - } - { - int64_t total_token_len = token_len + past_token_len; - attention_size = GGML_PAD(total_token_len, 32); std::string name = "attention_size"; auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); @@ -663,7 +613,6 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_ADD, "GGML_OP_ADD" }, {GGML_OP_ADD1, "GGML_OP_ADD1" }, {GGML_OP_CONT, "GGML_OP_CONT" }, - {GGML_OP_CPY, "GGML_OP_CPY" }, {GGML_OP_DIV, "GGML_OP_DIV" }, {GGML_OP_DUP, "GGML_OP_DUP" }, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 14999ba66b734..fb5451be32d62 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -328,10 +328,21 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_types{ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32}; - static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, - GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, - GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, - GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS}; + static const std::set supported_ops{GGML_OP_NONE, + GGML_OP_ADD, + GGML_OP_MUL, + GGML_OP_MUL_MAT, + GGML_OP_VIEW, + GGML_OP_CONT, + GGML_OP_RESHAPE, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_ROPE, + GGML_OP_RMS_NORM, + GGML_OP_SCALE, + GGML_OP_SOFT_MAX, + GGML_OP_SET_ROWS}; static const std::set supported_unary_ops{ GGML_UNARY_OP_SILU, }; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp deleted file mode 100644 index 553f3c79666ca..0000000000000 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - -namespace ov { -namespace frontend { -namespace ggml { -namespace op { - -OutputVector translate_cpy(const NodeContext& context) { - num_inputs_check(context, 2, 2); - - int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CPY case"); - - auto src0 = context.get_input(0); - auto src1 = context.get_input(1); - - src0 = std::make_shared(src0, context.get_input_type(1)); - ov::Output res; - - if (context.is_static() && context.is_first_token()) { - res = src0; - return rename_outputs_with_suffix({res}, context.get_name()); - } - - if (op_case == 1) { - // Write K to cache_k - int64_t head_size = context.get_head_size(); - int64_t num_heads_kv = context.get_num_heads_kv(); - auto src0_reshape_shape = - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, num_heads_kv, head_size}); - src0 = std::make_shared(src0, src0_reshape_shape, false); - auto indices = context.get_input("update_indices_k"); - auto updated = std::make_shared(src1, indices, src0); - res = std::make_shared(updated, std::make_shared(src1), false); - } else { - // Write V to cache_v - auto flattend_src0 = - std::make_shared(src0, - ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}), - false); - auto src0_shape = context.get_input_shape(0).to_shape(); - int64_t total_head_size = src0_shape[1]; - auto reshaped_src1 = std::make_shared( - src1, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), - false); - auto indices = context.get_input("update_indices_v"); - auto updated = std::make_shared(reshaped_src1, indices, flattend_src0); - res = std::make_shared(updated, std::make_shared(src1), false); - } - - return rename_outputs_with_suffix({res}, context.get_name()); -} - -} // namespace op -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index 744f355a5446f..ce4b01c3b5163 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -19,7 +19,6 @@ std::unordered_map get_supported_ops() { {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, {"GGML_OP_CONT", op::translate_cont }, - {"GGML_OP_CPY", op::translate_cpy }, {"GGML_OP_DIV", op::translate_1to1_match_2_inputs }, {"GGML_OP_GET_ROWS", op::translate_get_rows }, {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index 631812aaa3c99..332930c3ac115 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -12,7 +12,6 @@ namespace op { GGML_OP_CONVERTER(translate_add); GGML_OP_CONVERTER(translate_cont); -GGML_OP_CONVERTER(translate_cpy); GGML_OP_CONVERTER(translate_get_rows); GGML_OP_CONVERTER(translate_mul); GGML_OP_CONVERTER(translate_mulmat); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index daef12fb90535..a09247347f3f1 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -76,65 +76,6 @@ void add_token_len(TensorMap& tensor_map) { tensor_map.insert({"token_len", token_len->output(0)}); } -void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { - // cache_k layout: [S, N, H] (seq, num_heads, head_size) - // cache_v layout: [N, H, S] (num_heads, head_size, seq) - // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened - auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr(); - auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - - Output update_indices_k; - Output update_indices_v; - - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); - auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - - auto past_token_len_scalar = std::make_shared(past_token_len, zero); - auto token_len_scalar = std::make_shared(token_len, zero); - auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); - - Output update_indices = std::make_shared( - past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64); - if (ggml_model_decoder.is_static()) { - update_indices = past_token_len; - } - - update_indices_k = std::make_shared(update_indices, one); - update_indices_k.get_node_shared_ptr()->set_friendly_name("update_indices_k"); - tensor_map.insert({"update_indices_k", update_indices_k}); - - auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size(); - auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); - auto total_head_size_scalar = std::make_shared(total_head_size_node, zero); - - // 1D tensor of shape [total_head_size], values starting from 0 - auto range_row = - std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64); - auto range_row_reshaped = - std::make_shared(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2})); - auto row_indices = std::make_shared( - range_row_reshaped, - std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); - - // 1D tensor of shape [token_len], values starting from past_token_len - auto range_col = update_indices; - auto range_col_reshaped = - std::make_shared(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); - auto col_indices = std::make_shared( - range_col_reshaped, - std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); - - // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2] - update_indices_v = std::make_shared(OutputVector{row_indices, col_indices}, 2); - update_indices_v = std::make_shared( - update_indices_v, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), false); - update_indices_v.get_node_shared_ptr()->set_friendly_name("update_indices_v"); - tensor_map.insert({"update_indices_v", update_indices_v}); -} - void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { int32_t* rope_params = ggml_model_decoder.get_rope_params(); auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); @@ -156,7 +97,6 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); - add_kv_update_indices(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); } From f1e81791d1b7eda7ecd566b8513263644c30cb72 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 14 Aug 2025 16:27:24 +0800 Subject: [PATCH 113/156] Fix test-backend-ops --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +++++-- ggml/src/ggml-openvino/ggml-openvino.cpp | 4 ++++ ggml/src/ggml-openvino/utils.cpp | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 38c7122f4c4d7..6bc2c253e8ac8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -272,7 +272,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (name.find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; - } else if (const auto* op = get_tensor_used_op(src); op->op == GGML_OP_SET_ROWS) { + } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, -1}; if (m_is_static) { if (m_is_first_token) { @@ -324,6 +324,9 @@ void GgmlOvDecoder::add_extra_inputs() { } const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { + if (tensor == nullptr) { + return nullptr; + } for (int i = 0; i < m_cgraph->n_nodes; i++) { const auto* node = m_cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -332,7 +335,7 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) } } } - throw std::runtime_error("Tensor not found in cgraph"); + return nullptr; } const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const { diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index fb5451be32d62..13c2ef74628c7 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -238,6 +238,10 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g static bool is_op_unsupported_case(const ggml_tensor* op) { if (op->op == GGML_OP_SOFT_MAX) { + if (op->src[2] != nullptr) { + GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); + return true; + } float scale = 1.0f; float max_bias = 0.0f; const auto* op_params = op->op_params; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 83ab7353a9f8e..522e922db8dee 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -329,7 +329,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons } } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); - op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { + op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1}); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); From 7fa993d6de12d4f59cec15f5f6ff6e73dc701fbd Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 14 Aug 2025 16:52:29 +0800 Subject: [PATCH 114/156] Minor updates for raising PR --- CMakePresets.json | 20 -------------------- docs/build.md | 21 +++------------------ ggml/src/ggml-openvino/ggml-decoder.cpp | 3 +-- 3 files changed, 4 insertions(+), 40 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 392c357f37c0b..b5afeb3c0f2f9 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,26 +1,6 @@ { "version": 4, "configurePresets": [ - { - "name": "ReleaseOV", - "generator": "Ninja", - "binaryDir": "${sourceDir}/build/${presetName}", - "installDir": "${sourceDir}/build/install/${presetName}", - "cacheVariables": { - "CMAKE_BUILD_TYPE": "Release", - "GGML_OPENVINO": true, - "OpenVINO_DIR": "$env{OPENVINO_LLAMA_PATH}/build/Release" - } - }, - { - "name": "ReleaseCPU", - "generator": "Ninja", - "binaryDir": "${sourceDir}/build/${presetName}", - "installDir": "${sourceDir}/build/install/${presetName}", - "cacheVariables": { - "CMAKE_BUILD_TYPE": "Release" - } - }, { "name": "base", "hidden": true, diff --git a/docs/build.md b/docs/build.md index b5ef7c27ae2e4..c7e15a4e78482 100644 --- a/docs/build.md +++ b/docs/build.md @@ -592,7 +592,7 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build ## OpenVINO -[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. +[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp. Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support. @@ -694,9 +694,8 @@ export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache Control OpenVINO behavior using these environment variables: -- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance. -- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: Not supported when using NPU devices yet. -- **`GGML_OPENVINO_WEIGHT_AS_INPUT`**: Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them. +- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance. +- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: Not supported when using NPU devices yet. - **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling. - **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`. - **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps. @@ -711,20 +710,6 @@ export GGML_OPENVINO_PROFILING=1 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` -> **Note:** To apply your code changes, clear the `GGML_OPENVINO_CACHE_DIR` directory and rebuild the project. - -### Using Llama.cpp's Built-in CPU Backend (for Comparison) - -To compare performance with the default CPU backend: - -```bash -# Build CPU-only version -cmake --preset ReleaseCPU -cmake --build build/ReleaseCPU --parallel - -# Run with the default CPU backend -./build/ReleaseCPU/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " -``` ## Notes about GPU-accelerated backends diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 6bc2c253e8ac8..09919c85052ca 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -57,8 +57,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - auto timestamp = (long long) ggml_time_us(); - std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; + std::string filename = "cgraph.txt"; dump_cgraph(cgraph, filename); } From 7eec19e86b2cb1fef25799dd73c9861abd95726b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 27 Aug 2025 17:06:35 +0800 Subject: [PATCH 115/156] Perf: RMS fused to OV internal RMS op --- ggml/src/ggml-openvino/openvino/op/rms_norm.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 211692a3c706c..c9df4c42f3e0d 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -19,18 +20,17 @@ OutputVector translate_rms_norm(const NodeContext& context) { num_inputs_check(context, 1, 1); auto input_node = context.get_input(0); - auto square = std::make_shared(input_node, input_node); + auto square = std::make_shared( + input_node, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f})); - auto mean = - std::make_shared(square, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), - true); + auto mean = std::make_shared( + square, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); float eps; memcpy(&eps, context.get_output_op_params(0), sizeof(float)); auto rms = std::make_shared( - std::make_shared(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}))); + std::make_shared(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {eps}))); auto reciprocal = std::make_shared(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {1.0f}), rms); From adabc5dccf9bad28c6aad713ec3417b9e20107ca Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 4 Sep 2025 17:42:39 +0800 Subject: [PATCH 116/156] Fix after rebasing - Layout of cache k and cache v are unified: [seq, n_head, head_size] - Add CPY and FLASH_ATTN_EXT, flash attn is not used yet - Skip test-backend-ops due to flash attn test crash - Add mutex around graph conversion to avoid test-thread-safety fali in the future - Update NPU config - Update GPU config to disable SDPA opt to make phi-3 run --- ggml/src/ggml-openvino/ggml-decoder.cpp | 96 ++++----- ggml/src/ggml-openvino/ggml-openvino.cpp | 14 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 5 +- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 20 ++ .../openvino/op/flash_attn_ext.cpp | 35 ++++ .../ggml-openvino/openvino/op/get_rows.cpp | 1 - ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 19 +- .../src/ggml-openvino/openvino/op/permute.cpp | 5 +- ggml/src/ggml-openvino/openvino/op/rope.cpp | 1 - .../ggml-openvino/openvino/op/set_rows.cpp | 16 +- .../openvino/op/{soft_max.cpp => softmax.cpp} | 0 .../ggml-openvino/openvino/op/transpose.cpp | 3 +- ggml/src/ggml-openvino/openvino/op_table.cpp | 40 ++-- ggml/src/ggml-openvino/openvino/op_table.hpp | 2 + .../openvino/pass/fuse_to_sdpa.cpp | 4 +- ggml/src/ggml-openvino/openvino/utils.cpp | 1 + ggml/src/ggml-openvino/utils.cpp | 194 ++++++++++-------- ggml/src/ggml-openvino/utils.h | 3 +- tests/CMakeLists.txt | 4 +- 19 files changed, 269 insertions(+), 194 deletions(-) create mode 100644 ggml/src/ggml-openvino/openvino/op/cpy.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp rename ggml/src/ggml-openvino/openvino/op/{soft_max.cpp => softmax.cpp} (100%) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 09919c85052ca..0ee2338199aa1 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -73,6 +73,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, } GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + std::string filename = "cgraph.txt"; + dump_cgraph(cgraph, filename); + } + m_cgraph = cgraph; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto* cur_node = cgraph->nodes[node_n]; @@ -173,32 +178,33 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { break; } case GGML_OP_CONT: { - if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) { - // The input comes from a PERMUTE - m_op_case = 1; - } else { - // The input comes from a VIEW which is subtensor - m_op_case = 2; - } - break; - } - case GGML_OP_SET_ROWS: { - if (std::string(node->name).find("cache_k") == 0) { + if (node->src[0]->op == GGML_OP_PERMUTE) { m_op_case = 1; - } else { + } else if (node->src[0]->op == GGML_OP_TRANSPOSE) { m_op_case = 2; + } else if (node->src[0]->op == GGML_OP_VIEW) { + // The input comes from a VIEW which is subtensor + m_op_case = 3; } break; } case GGML_OP_PERMUTE: { - if (node->src[0]->view_src == nullptr) { - // Permute Qcur + if (node->src[0]->op != GGML_OP_VIEW) { m_op_case = 1; } else if (ggml_is_contiguous(node->src[0])) { // Permute cache_k (view) m_op_case = 2; } else { - // Permute cache_v (view) + // Permute cache_v (view), deprecated, cache_v will also fall to case 2 + m_op_case = 3; + } + break; + } + case GGML_OP_MUL_MAT: { + if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) { + m_op_case = 2; + } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { + // test-backend-ops case m_op_case = 3; } break; @@ -206,16 +212,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { case GGML_OP_GET_ROWS: { if (node->src[1]->op == GGML_OP_VIEW) { m_op_case = 2; - } else { - m_op_case = 1; } break; } case GGML_OP_ROPE: { if (node->src[0]->op == GGML_OP_VIEW) { m_op_case = 2; - } else { - m_op_case = 1; } break; } @@ -270,19 +272,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co } else if (name.find("cache_k") == 0) { input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (name.find("cache_v") == 0) { - input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; + input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { - input_shape = ov::PartialShape{1, 1, -1}; - if (m_is_static) { - if (m_is_first_token) { - // Dummy static shape, since the indices are not used in this case - input_shape = ov::PartialShape{1}; - } else if (std::string(op->name).find("cache_k") == 0) { - input_shape = ov::PartialShape{1, 1, 1}; - } else { - input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size}; - } - } + input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1}; } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work input_shape = ov::PartialShape{get_shape(src->view_src)}; @@ -610,26 +602,28 @@ void GgmlOvDecoder::visit_subgraph(std::function ops = { - {GGML_OP_NONE, "GGML_OP_NONE" }, - {GGML_OP_ACC, "GGML_OP_ACC" }, - {GGML_OP_ADD, "GGML_OP_ADD" }, - {GGML_OP_ADD1, "GGML_OP_ADD1" }, - {GGML_OP_CONT, "GGML_OP_CONT" }, - {GGML_OP_DIV, "GGML_OP_DIV" }, - {GGML_OP_DUP, "GGML_OP_DUP" }, - {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, - {GGML_OP_MUL, "GGML_OP_MUL" }, - {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" }, - {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" }, - {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" }, - {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" }, - {GGML_OP_ROPE, "GGML_OP_ROPE" }, - {GGML_OP_SCALE, "GGML_OP_SCALE" }, - {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, - {GGML_OP_SUB, "GGML_OP_SUB" }, - {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, - {GGML_OP_VIEW, "GGML_OP_VIEW" }, - {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" }, + {GGML_OP_NONE, "GGML_OP_NONE" }, + {GGML_OP_ACC, "GGML_OP_ACC" }, + {GGML_OP_ADD, "GGML_OP_ADD" }, + {GGML_OP_ADD1, "GGML_OP_ADD1" }, + {GGML_OP_CONT, "GGML_OP_CONT" }, + {GGML_OP_DIV, "GGML_OP_DIV" }, + {GGML_OP_DUP, "GGML_OP_DUP" }, + {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, + {GGML_OP_MUL, "GGML_OP_MUL" }, + {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" }, + {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" }, + {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" }, + {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" }, + {GGML_OP_ROPE, "GGML_OP_ROPE" }, + {GGML_OP_SCALE, "GGML_OP_SCALE" }, + {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, + {GGML_OP_SUB, "GGML_OP_SUB" }, + {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" }, + {GGML_OP_VIEW, "GGML_OP_VIEW" }, + {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" }, + {GGML_OP_CPY, "GGML_OP_CPY" }, + {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"}, }; static const std::map unary_ops = { {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" }, diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 13c2ef74628c7..e3eaf40254043 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -270,12 +270,14 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { } } - if (op->op == GGML_OP_MUL_MAT) { - if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) || - (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) { - GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n"); + if (op->op == GGML_OP_CPY) { + if (op->src[1] != op) { + GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n"); return true; } + } + + if (op->op == GGML_OP_MUL_MAT) { if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); @@ -346,7 +348,9 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX, - GGML_OP_SET_ROWS}; + GGML_OP_SET_ROWS, + GGML_OP_FLASH_ATTN_EXT, + GGML_OP_CPY}; static const std::set supported_unary_ops{ GGML_UNARY_OP_SILU, }; diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index f83c0e62df77b..9ae0f420ccb2f 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -19,7 +19,7 @@ OutputVector translate_cont(const NodeContext& context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); auto src_shape = context.get_input_shape(0).to_shape(); auto dst_shape = context.get_output_shape(0).to_shape(); @@ -32,6 +32,9 @@ OutputVector translate_cont(const NodeContext& context) { context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); + } else if (op_case == 2) { + // The input comes from a TRANSPOSE + return {context.get_input(0)}; } else { // The input comes from a VIEW res = process_view_input(context, 0); diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp new file mode 100644 index 0000000000000..54b49018a9699 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -0,0 +1,20 @@ +#include +#include +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_cpy(const NodeContext& context) { + auto res = std::make_shared(context.get_input(0), context.get_output_type(0)); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp new file mode 100644 index 0000000000000..5c0ad4c20e4c8 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -0,0 +1,35 @@ +#include +#include +#include +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_flash_attn_ext(const NodeContext& context) { + num_inputs_check(context, 4, 4); + auto q_f32 = context.get_input(0); + auto k = context.get_input(1); + auto v = context.get_input(2); + auto mask = context.get_input(3); + + float* params = reinterpret_cast(context.get_output_op_params(0)); + float scale = params[0]; + // float max_bias = params[1]; + // float logit_softcap = params[2]; + + auto q = std::make_shared(q_f32, ov::element::f16); + auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); + auto res = std::make_shared(q, k, v , mask, scale_node, false); + auto res_f32 = std::make_shared(res, ov::element::f32); + return rename_outputs_with_suffix({res_f32}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index c97bbbf5a3657..36795fd43eabd 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -21,7 +21,6 @@ OutputVector translate_get_rows(const NodeContext& context) { num_inputs_check(context, 2, 2); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); Output res; auto data = context.get_input(0); diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 9148a27517b92..150fbcbb880a4 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -27,15 +27,26 @@ namespace op { OutputVector translate_mulmat(const NodeContext& context) { num_inputs_check(context, 2, 2); + int op_case = context.get_op_case(); + ov::Output res; ov::Output B = context.get_input(0); ov::Output A = context.get_input(1); + bool transpose_b = true; + if (op_case == 2) { + B = B.get_node_shared_ptr()->input_value(0); + transpose_b = false; + } else if (op_case == 3) { + B = process_view_input(context, 0); + A = process_view_input(context, 1); + } + bool convert_out_type = false; if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) { - B = std::make_shared(context.get_input(0), context.get_input_type(1)); + B = std::make_shared(B, context.get_input_type(1)); } else if (context.get_input_type(0) != context.get_input_type(1)) { - A = std::make_shared(context.get_input(1), context.get_input_type(0)); + A = std::make_shared(A, context.get_input_type(0)); convert_out_type = true; } @@ -72,10 +83,10 @@ OutputVector translate_mulmat(const NodeContext& context) { } if (convert_out_type) { - auto result_lp = std::make_shared(A, B, false, true); + auto result_lp = std::make_shared(A, B, false, transpose_b); res = std::make_shared(result_lp, context.get_output_type(0)); } else { - res = std::make_shared(A, B, false, true); + res = std::make_shared(A, B, false, transpose_b); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 978b5377fb514..fcb091016a4f1 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -21,13 +21,12 @@ OutputVector translate_permute(const NodeContext& context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported PERMUTE case"); ov::Output res; if (op_case == 1) { - auto perm = argsort_descend(context.get_output_stride(0)); res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { auto src = context.get_input(0); auto attention_size = context.get_input("attention_size"); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 7951a1e012c54..4b1e3b500cf3e 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -27,7 +27,6 @@ OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); ov::Output res; diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 758454cd9d72a..0d94a95e44276 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -32,21 +32,7 @@ OutputVector translate_set_rows(const NodeContext& context) { FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); if (context.is_static() && context.is_first_token()) { - Output res; - if (context.get_op_case() == 2) { - res = std::make_shared( - data, - ov::op::v0::Constant::create( - ov::element::i64, - {3}, - {context.get_context_size(), context.get_num_heads_kv(), context.get_head_size()}), - false); - res = std::make_shared( - res, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 2, 0})); - } else { - res = data; - } - return rename_outputs_with_suffix({res}, context.get_name()); + return rename_outputs_with_suffix({data}, context.get_name()); } auto indices = context.get_input(1); diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp similarity index 100% rename from ggml/src/ggml-openvino/openvino/op/soft_max.cpp rename to ggml/src/ggml-openvino/openvino/op/softmax.cpp diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index b35f1fb8610ea..c585dffa6e1b9 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -12,9 +12,8 @@ namespace op { OutputVector translate_transpose(const NodeContext& context) { num_inputs_check(context, 1, 1); - auto perm = argsort_descend(context.get_output_stride(0)); auto res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); + ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index ce4b01c3b5163..ee55f84b96f80 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -16,25 +16,27 @@ namespace ggml { std::unordered_map get_supported_ops() { using namespace ov::op; return { - {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, - {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, - {"GGML_OP_CONT", op::translate_cont }, - {"GGML_OP_DIV", op::translate_1to1_match_2_inputs }, - {"GGML_OP_GET_ROWS", op::translate_get_rows }, - {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, - {"GGML_OP_MUL_MAT", op::translate_mulmat }, - {"GGML_OP_PERMUTE", op::translate_permute }, - {"GGML_OP_RESHAPE", op::translate_reshape }, - {"GGML_OP_RMS_NORM", op::translate_rms_norm }, - {"GGML_OP_ROPE", op::translate_rope }, - {"GGML_OP_SCALE", op::translate_scale }, - {"GGML_OP_SOFT_MAX", op::translate_soft_max }, - {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, - {"GGML_OP_TRANSPOSE", op::translate_transpose }, - {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, - {"GGML_OP_VIEW", op::translate_view }, - {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, - {"GGML_OP_SET_ROWS", op::translate_set_rows }, + {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, + {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, + {"GGML_OP_CONT", op::translate_cont }, + {"GGML_OP_DIV", op::translate_1to1_match_2_inputs }, + {"GGML_OP_GET_ROWS", op::translate_get_rows }, + {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, + {"GGML_OP_MUL_MAT", op::translate_mulmat }, + {"GGML_OP_PERMUTE", op::translate_permute }, + {"GGML_OP_RESHAPE", op::translate_reshape }, + {"GGML_OP_RMS_NORM", op::translate_rms_norm }, + {"GGML_OP_ROPE", op::translate_rope }, + {"GGML_OP_SCALE", op::translate_scale }, + {"GGML_OP_SOFT_MAX", op::translate_soft_max }, + {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, + {"GGML_OP_TRANSPOSE", op::translate_transpose }, + {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, + {"GGML_OP_VIEW", op::translate_view }, + {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, + {"GGML_OP_SET_ROWS", op::translate_set_rows }, + {"GGML_OP_CPY", op::translate_cpy }, + {"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext }, }; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index 332930c3ac115..faa61f5f6c8d2 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -26,6 +26,8 @@ GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_glu_swiglu); GGML_OP_CONVERTER(translate_set_rows); +GGML_OP_CONVERTER(translate_cpy); +GGML_OP_CONVERTER(translate_flash_attn_ext); } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index 1b7ac602716ad..c36579910d48c 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -40,11 +40,9 @@ FuseToSDPA::FuseToSDPA() { auto mask = pattern_to_output[m_mask]; auto scale = pattern_to_output[m_scale]; - auto v_trans = - register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); auto mask_f16 = register_new_node(mask, ov::element::f16); auto scale_f16 = register_new_node(scale, ov::element::f16); - auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); + auto sdpa = std::make_shared(q, k, v, mask_f16, scale_f16, false); ov::replace_node(m.get_match_root(), sdpa); ov::copy_runtime_info(m.get_matched_nodes(), sdpa); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 9634900753224..c4197ccc3abdc 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -65,6 +65,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std:: name += "_"; name += suffix; node->set_friendly_name(name); + // std::cout << name << " " << output.get_partial_shape() << std::endl; } return outputs; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 522e922db8dee..473fa72f99fd5 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -77,8 +78,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c bool is_static = device == "NPU" ? true : false; ov::AnyMap config; - if (device == "NPU") { - config = get_npu_config(); + if (device == "GPU") { + config = { + {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"} + }; } if (is_naive(cgraph)) { @@ -92,6 +95,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c core.set_property(ov::cache_dir(cache_dir)); } + static std::mutex cache_mutex; static std::unordered_map> infer_request_cache; static std::unordered_map> ov_input_names_cache; static std::unordered_map> ov_output_names_cache; @@ -105,89 +109,93 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c int64_t conversion_end_time; int64_t compile_end_time; - auto it = infer_request_cache.find(cgraph); - if (it != infer_request_cache.end()) { - std::map> model_weights; - ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false); - decoder_end_time = ggml_time_us(); - - // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache - if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) { - infer_request_cache[cgraph] = - std::make_shared(compiled_model_cache[cgraph].create_infer_request()); - compiled_model_cache.erase(cgraph); - } - infer_request = *infer_request_cache[cgraph]; - - conversion_end_time = ggml_time_us(); - compile_end_time = conversion_end_time; - } else { - std::shared_ptr model; - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + { + std::lock_guard lock(cache_mutex); - if (is_static) { - ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); - auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false); + auto it = infer_request_cache.find(cgraph); + if (it != infer_request_cache.end()) { + std::map> model_weights; + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false); decoder_end_time = ggml_time_us(); - auto input_model = std::make_shared(ggml_decoder); - auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache); - - model = ov::frontend::ggml::FrontEnd::convert(input_model); - ggml_decoder->clear_model_weights(); - auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); - ggml_decoder_kvcache->clear_model_weights(); - conversion_end_time = ggml_time_us(); - - auto compiled_model = core.compile_model(model, device, config); - auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config); - compiled_model_cache[cgraph] = compiled_model_kvcache; - compile_end_time = ggml_time_us(); - - infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); - infer_request = *infer_request_cache[cgraph]; - compiled_model_cache[cgraph] = compiled_model_kvcache; - - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp); - ov::serialize(model_kvcache, timestamped_filename); + // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache + if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) { + infer_request_cache[cgraph] = + std::make_shared(compiled_model_cache[cgraph].create_infer_request()); + compiled_model_cache.erase(cgraph); } - } else { - ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); - decoder_end_time = ggml_time_us(); - - auto input_model = std::make_shared(ggml_decoder); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - ggml_decoder->clear_model_weights(); - conversion_end_time = ggml_time_us(); - - auto compiled_model = core.compile_model(model, device, config); - compile_end_time = ggml_time_us(); - infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); infer_request = *infer_request_cache[cgraph]; - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); + conversion_end_time = ggml_time_us(); + compile_end_time = conversion_end_time; + } else { + std::shared_ptr model; + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + + if (is_static) { + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); + auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false); + decoder_end_time = ggml_time_us(); + + auto input_model = std::make_shared(ggml_decoder); + auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache); + + model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); + auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); + ggml_decoder_kvcache->clear_model_weights(); + conversion_end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp); + ov::serialize(model_kvcache, timestamped_filename); + } + + auto compiled_model = core.compile_model(model, device, get_npu_prefill_config()); + auto compiled_model_kvcache = core.compile_model(model_kvcache, device, get_npu_generate_config()); + compiled_model_cache[cgraph] = compiled_model_kvcache; + compile_end_time = ggml_time_us(); + + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = *infer_request_cache[cgraph]; + compiled_model_cache[cgraph] = compiled_model_kvcache; + } else { + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); + decoder_end_time = ggml_time_us(); + + auto input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); + conversion_end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } + + auto compiled_model = core.compile_model(model, device, config); + compile_end_time = ggml_time_us(); + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = *infer_request_cache[cgraph]; } - } - std::vector ov_input_names; - std::vector ov_output_names; - for (const auto& ov_param : model->get_parameters()) { - ov_input_names.push_back(ov_param->get_friendly_name()); - } - for (const auto& ov_output : model->get_results()) { - ov_output_names.push_back(ov_output->get_friendly_name()); + std::vector ov_input_names; + std::vector ov_output_names; + for (const auto& ov_param : model->get_parameters()) { + ov_input_names.push_back(ov_param->get_friendly_name()); + } + for (const auto& ov_output : model->get_results()) { + ov_output_names.push_back(ov_output->get_friendly_name()); + } + ov_input_names_cache[cgraph] = ov_input_names; + ov_output_names_cache[cgraph] = ov_output_names; } - ov_input_names_cache[cgraph] = ov_input_names; - ov_output_names_cache[cgraph] = ov_output_names; } auto ov_input_names = ov_input_names_cache[cgraph]; @@ -233,21 +241,30 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_UNUSED(backend); } -ov::AnyMap get_npu_config() { +ov::AnyMap get_npu_prefill_config() { ov::AnyMap config = { - {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, - {"NPU_USE_NPUW", "YES" }, - {"NPUW_DEVICES", "NPU" }, - {"NPUW_FOLD", "YES" }, - {"NPUW_HOST_GATHER", "YES" }, - {"NPUW_DQ", "YES" }, - {"NPUW_FUNCALL_ASYNC", "YES" }, - {"NPUW_WEIGHTS_BANK", "shared" }, - {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, + {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, + {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, + {"NPU_USE_NPUW", "YES" }, + {"NPUW_DEVICES", "NPU" }, + {"NPUW_FOLD", "YES" }, + {"NPUW_WEIGHTS_BANK", "shared" }, + {"NPUW_SLICE_OUT", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, + {"NPUW_FUNCALL_FOR_ALL", "YES" }, + {"NPUW_DQ", "YES" }, + {"NPUW_DQ_FULL", "NO" }, + {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, }; return config; } +ov::AnyMap get_npu_generate_config() { + ov::AnyMap config = get_npu_prefill_config(); + config.emplace("NPUW_UNFOLD_IREQS", "YES"); + return config; +} + bool is_naive(struct ggml_cgraph* cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; @@ -257,9 +274,12 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, const ov::AnyMap& config) { - if (cgraph->n_nodes == 1 && cgraph->nodes[0]->op == GGML_OP_NONE) { + if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) { return GGML_STATUS_SUCCESS; } + if (cgraph->nodes[0]->op == GGML_OP_FLASH_ATTN_EXT) { + return GGML_STATUS_FAILED; + } auto decoder = std::make_shared(cgraph); auto input_model = std::make_shared(decoder); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 0d71963f53aca..f377fe9d2735d 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -40,7 +40,8 @@ void set_zero_diagonal(std::vector& matrix, size_t dim); bool is_prefill(struct ggml_cgraph * cgraph); -ov::AnyMap get_npu_config(); +ov::AnyMap get_npu_prefill_config(); +ov::AnyMap get_npu_generate_config(); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3174a5bbc3ded..1b77876f7ed3a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -204,7 +204,9 @@ if (NOT LLAMA_SANITIZE_ADDRESS) llama_build_and_test(test-opt.cpp) endif() llama_build_and_test(test-gguf.cpp) -llama_build_and_test(test-backend-ops.cpp) +if (NOT GGML_OPENVINO) + llama_build_and_test(test-backend-ops.cpp) +endif() llama_build_and_test(test-model-load-cancel.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") From 970e8e2690c8b73f4176fc1752fab5709be44a0e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 5 Sep 2025 16:41:15 +0800 Subject: [PATCH 117/156] Change openvino device_type to GPU; Enable flash_attn --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +++ ggml/src/ggml-openvino/ggml-openvino.cpp | 9 +-- .../openvino/op/flash_attn_ext.cpp | 56 ++++++++++++++++++- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 32 +++++------ .../src/ggml-openvino/openvino/op/softmax.cpp | 18 +++--- .../openvino/translate_session.cpp | 12 ++++ 6 files changed, 104 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0ee2338199aa1..0fd64c685f71c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -299,6 +299,13 @@ void GgmlOvDecoder::add_extra_inputs() { attention_size = mask->ne[0]; break; } + if (node->op == GGML_OP_FLASH_ATTN_EXT) { + auto* mask = node->src[3]; + if (std::string(mask->name).find("KQ_mask") != 0) { + throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name)); + } + attention_size = mask->ne[0]; + } } { diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e3eaf40254043..ed612a24660c4 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -173,14 +173,15 @@ static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size GGML_ASSERT(free != nullptr); GGML_ASSERT(total != nullptr); ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; - // Placeholder GGML_ASSERT(ctx->device >= 0); // ggml_openvino_set_device(ctx->device); + *total = 1; + *free = 1; } static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_backend_dev_t dev) { GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_ACCEL; + return GGML_BACKEND_DEVICE_TYPE_GPU; } static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { @@ -293,7 +294,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); return true; } - if (n_dims != op->src[0]->ne[0]) { + if (n_dims != 0.0f && n_dims != op->src[0]->ne[0]) { GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", n_dims, op->src[0]->ne[0]); @@ -305,7 +306,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { } float freq_scale; memcpy(&freq_scale, op_params + 6, sizeof(float)); - if (freq_scale != 1.0f) { + if (freq_scale != 0.0f && freq_scale != 1.0f) { GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale); return true; } diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 5c0ad4c20e4c8..d97603d98a941 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -1,6 +1,12 @@ #include +#include +#include #include +#include #include +#include +#include + #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" @@ -24,9 +30,53 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto q = std::make_shared(q_f32, ov::element::f16); auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); - auto res = std::make_shared(q, k, v , mask, scale_node, false); - auto res_f32 = std::make_shared(res, ov::element::f32); - return rename_outputs_with_suffix({res_f32}, context.get_name()); + + ov::Output mask_sliced; + if (context.has_input("KQ_mask_sliced")) { + mask_sliced = context.get_input("KQ_mask_sliced"); + } else { + auto token_len = get_dimensions(q, {1}); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + mask_sliced = std::make_shared(mask, zero, token_len, one, one); + } + + if (mask_sliced.get_element_type() != ov::element::f16) { + mask_sliced = std::make_shared(mask_sliced, ov::element::f16); + } + + auto tile_kv = [](int64_t q_batch, int64_t kv_batch, ov::Output kv) { + int64_t factor = q_batch / kv_batch; + if (factor > 1) { + auto q_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{q_batch}); + auto kv_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_batch}); + auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); + + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); + + auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2}); + auto kv_broadcast_shape = + std::make_shared(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); + kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape); + + auto new_kv_shape = + std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0); + kv = std::make_shared(kv, new_kv_shape, false); + } + return kv; + }; + + auto q_shape = context.get_input_shape(0).to_shape(); + auto k_shape = context.get_input_shape(1).to_shape(); + k = tile_kv(q_shape[0], k_shape[0], k); + v = tile_kv(q_shape[0], k_shape[0], v); + + auto sdpa = std::make_shared(q, k, v, mask_sliced, scale_node, false); + auto sdpa_f32 = std::make_shared(sdpa, ov::element::f32); + auto res = std::make_shared(sdpa_f32, + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 150fbcbb880a4..bfccc28163522 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -62,7 +62,7 @@ OutputVector translate_mulmat(const NodeContext& context) { auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{B_batch}); auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); - auto Z_last_two_dim = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); + auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); @@ -70,26 +70,26 @@ OutputVector translate_mulmat(const NodeContext& context) { Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; Output batch_large = A_batch_larger ? A_batch_node : B_batch_node; auto broadcast_shape = - std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dim}, 0); + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); - auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dim}, 0); + auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dims}, 0); Z = std::make_shared(Z_broadcasted, new_Z_shape, false); - } - if (A_batch_larger) { - B = Z; - } else { - A = Z; - } + } + if (A_batch_larger) { + B = Z; + } else { + A = Z; + } - if (convert_out_type) { - auto result_lp = std::make_shared(A, B, false, transpose_b); - res = std::make_shared(result_lp, context.get_output_type(0)); - } else { - res = std::make_shared(A, B, false, transpose_b); - } + if (convert_out_type) { + auto result_lp = std::make_shared(A, B, false, transpose_b); + res = std::make_shared(result_lp, context.get_output_type(0)); + } else { + res = std::make_shared(A, B, false, transpose_b); + } - return rename_outputs_with_suffix({res}, context.get_name()); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index e072658ecb156..1aa3bf76a06bb 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -51,14 +51,18 @@ OutputVector translate_soft_max(const NodeContext& context) { return rename_outputs_with_suffix({res}, context.get_name()); } - auto mask_node = context.get_input(1); + ov::Output mask_node_sliced; + if (context.has_input("KQ_mask_sliced")) { + mask_node_sliced = context.get_input("KQ_mask_sliced"); + } else { + auto token_len = get_dimensions(input_node, {1}); + auto mask_node = context.get_input(1); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); + } - auto token_len = context.has_input("token_len") ? context.get_input("token_len") : get_dimensions(input_node, {1}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - std::shared_ptr mask_node_sliced = - std::make_shared(mask_node, zero, token_len, one, one); - if (mask_node_sliced->get_element_type() != context.get_output_type(0)) { + if (mask_node_sliced.get_element_type() != context.get_output_type(0)) { mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type(0)); } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index a09247347f3f1..3e27a689d52ff 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -36,6 +36,7 @@ namespace ggml { using namespace ov::op; namespace { + ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( const std::shared_ptr& model, const std::map& kv_param_res_names) { ov::pass::MakeStateful::ParamResPairs pairs; @@ -76,6 +77,16 @@ void add_token_len(TensorMap& tensor_map) { tensor_map.insert({"token_len", token_len->output(0)}); } +void add_sliced_mask(TensorMap& tensor_map) { + auto mask = tensor_map.at("KQ_mask").get_node_shared_ptr(); + auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + std::shared_ptr mask_sliced = std::make_shared(mask, zero, token_len, one, one); + mask_sliced->set_friendly_name("KQ_mask_sliced"); + tensor_map.insert({"KQ_mask_sliced", mask_sliced->output(0)}); +} + void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { int32_t* rope_params = ggml_model_decoder.get_rope_params(); auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); @@ -97,6 +108,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); + add_sliced_mask(tensor_map); add_rope_sin_cos(tensor_map, ggml_model_decoder); } From 2af9b7a0514b8505bdd36210733c9ab1ee55dd57 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 5 Aug 2025 19:51:01 +0800 Subject: [PATCH 118/156] Update supports_buft and supports_op for quantized models --- ggml/src/ggml-openvino/ggml-openvino.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index ed612a24660c4..f81b1ee4834de 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -8,6 +8,7 @@ #include #include "ggml-backend-impl.h" +#include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-openvino/utils.h" #include "ggml.h" @@ -332,8 +333,16 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) { GGML_ASSERT(dev->reg != nullptr); - static const std::set supported_types{ - GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32}; + static const std::set supported_types{GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_I64, + GGML_TYPE_I32, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q8_0, + GGML_TYPE_Q6_K}; static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, @@ -411,7 +420,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); + // TODO quantized weigts are cpu_repack_buffer_type which does not implement ggml_backend_buft_is_host + return ggml_backend_buft_is_host(buft) || strcmp(buft->device->iface.get_name(buft->device), "CPU") == 0; GGML_UNUSED(dev); } From bbfb8427cdddb92a4ba859b3368c31a4bb5dfa6a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 5 Aug 2025 20:56:50 +0800 Subject: [PATCH 119/156] Add quant weight conversion functions from genai gguf reader --- ggml/src/ggml-openvino/ggml-decoder.cpp | 76 +++++- ggml/src/ggml-openvino/ggml-quant.cpp | 313 ++++++++++++++++++++++++ ggml/src/ggml-openvino/ggml-quant.hpp | 44 ++++ 3 files changed, 429 insertions(+), 4 deletions(-) create mode 100644 ggml/src/ggml-openvino/ggml-quant.cpp create mode 100644 ggml/src/ggml-openvino/ggml-quant.hpp diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0fd64c685f71c..c2e164b808baa 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "ggml-quant.hpp" GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size) : @@ -402,12 +404,78 @@ std::map> GgmlOvDecoder::create_weight_no } std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { + std::set weight_types = { + GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; + if (weight_types.find(tensor->type) == weight_types.end()) { + throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + + ggml_type_name(tensor->type)); + } + auto node_type = get_ov_type(tensor); auto node_shape = get_shape(tensor); auto ne_total = ggml_nelements(tensor); - ov::Tensor weights(node_type, node_shape); - memcpy(weights.data(), tensor->data, ne_total * node_type.size()); - return std::make_shared(weights); + + if (node_type != ov::element::dynamic) { + ov::Tensor weights(node_type, node_shape); + memcpy(weights.data(), tensor->data, ne_total * node_type.size()); + std::shared_ptr weight_node = std::make_shared(weights); + if (node_type == ov::element::f16) { + weight_node = std::make_shared(weight_node, ov::element::f32); + } + weight_node->set_friendly_name(tensor->name); + return weight_node; + } + + uint64_t weights_per_byte; + if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { + weights_per_byte = 2; + } else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K + weights_per_byte = 1; + } + + uint64_t weights_per_block; + // here we only consider sub block, q6k:16 q4k:32 + if (tensor->type == GGML_TYPE_Q6_K) { + weights_per_block = 16; + } else { + weights_per_block = 32; + } + + OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, + "[load_gguf] tensor ", + tensor->name, + " has incompatible last dim shape: ", + node_shape.back()); + + auto weights_shape = node_shape; + weights_shape.back() /= (weights_per_byte * 4); // means u32 type can store 8 q4 or 4 q8 + + ov::Tensor weights(ov::element::u32, weights_shape); + // For scales and bias + node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block; + + ov::Tensor scales(ov::element::f16, node_shape); + ov::Tensor biases(ov::element::f16, node_shape); + ov::Output weight_node; + if (tensor->type == GGML_TYPE_Q4_0) { + extract_q4_0_data(tensor, weights, scales, biases); + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q4_1) { + extract_q4_1_data(tensor, weights, scales, biases); + weight_node = make_int4_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q8_0) { + extract_q8_0_data(tensor, weights, scales, biases); + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q6_K) { + // due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled. + extract_q6_k_data(tensor, weights, scales, biases); + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q4_K) { + extract_q4_k_data(tensor, weights, scales, biases); + weight_node = make_int4_weights(weights, scales, biases, weights_per_block); + } + weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); + return weight_node.get_node_shared_ptr(); } void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) { @@ -537,7 +605,7 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { case GGML_TYPE_I64: return ov::element::i64; default: - throw std::runtime_error("Unsupported tensor type"); + return ov::element::dynamic; } } diff --git a/ggml/src/ggml-openvino/ggml-quant.cpp b/ggml/src/ggml-openvino/ggml-quant.cpp new file mode 100644 index 0000000000000..4311ab138ea0d --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-quant.cpp @@ -0,0 +1,313 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" + +void unpack_32_4(const uint8_t* data, uint8_t* dst) { + std::fill_n(dst, 16, 0); + for (int j = 0; j < 16; ++j) { + uint8_t x = (data[j + 2] & 0x0F); // j+2 to skip scale bytes. + uint8_t y = (data[j + 2] >> 4); + if (j % 2 != 0) { + x <<= 4; + y <<= 4; + } + dst[j / 2] |= x; + dst[8 + j / 2] |= y; // Last 16 weights are in the higher bits + } +} + +// Extracts (weight, scales, biases) from Q4_0 tensors. +// Data layout is: |16 bit scale|32 x 4bit weights|. +void extract_q4_0_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); + biases[i] = ov::float16(-8.f * static_cast(scales[i])); + unpack_32_4(data + i * bytes_per_block, weights + i * 16); + }); +} + +// Extracts (weight, scales, biases) from Q4_1 tensors. +// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|. +void extract_q4_1_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); + biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 1))); + unpack_32_4(data + i * bytes_per_block, weights + i * 16); + }); +} + +// Extracts (weight, scales, biases) from Q8_0 tensors. +// Data layout is: |16 bit scale|32 x 8bit weights|. +void extract_q8_0_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t weights_per_block = 32; + const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + for (int64_t i = 0; i < scales_arr.get_size(); i++) { + uint8_t* block_data = data + i * bytes_per_block; + scales[i] = ov::float16::from_bits(*(uint16_t*)block_data); + biases[i] = ov::float16(-128.f * static_cast(scales[i])); + for (int64_t j = 0; j < weights_per_block; ++j) { + uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. + // Original data is in int8_t, so we add a bias of -128 and invert the + // first bit. + x ^= 1 << 7; + weights[i * weights_per_block + j] = x; + } + } +} + +void unpack_256_4(const uint8_t* data, uint8_t* dst) { + // Initialize the output array with zeros + std::fill_n(dst, 128, 0); + + for (size_t i = 0; i < 4; ++i) { + for (int j = 0; j < 32; ++j) { + uint8_t x = (data[i * 32 + j] & 0x0F); + uint8_t y = (data[i * 32 + j] >> 4); + if (j % 2 != 0) { + x <<= 4; + y <<= 4; + } + dst[i * 32 + j / 2] |= x; + dst[i * 32 + 16 + j / 2] |= y; // Last 16 weights are in the higher bits + } + } +} + +void extract_q4_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 2 + 2 + 12 + 128; + // TODO tensor->nb[3] + const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + + ov::parallel_for(n_super_block, [&](size_t i) { + uint8_t* block_data = data + i * bytes_per_block; + + // Extract scale factors and offsets + float scale_scales = static_cast(ov::float16::from_bits(*((uint16_t*)block_data))); + float scale_biases = static_cast(ov::float16::from_bits(*((uint16_t*)block_data + 1))); + + // Extract qs1 and qs2 + uint8_t* qs1 = block_data + 4; + uint8_t* qs2 = block_data + 16; + + scales[i * 8] = ov::float16(scale_scales * static_cast((*(qs1) & 0b111111))); + scales[i * 8 + 1] = ov::float16(scale_scales * static_cast((*(qs1 + 1) & 0b111111))); + scales[i * 8 + 2] = ov::float16(scale_scales * static_cast((*(qs1 + 2) & 0b111111))); + scales[i * 8 + 3] = ov::float16(scale_scales * static_cast((*(qs1 + 3) & 0b111111))); + scales[i * 8 + 4] = + ov::float16(scale_scales * static_cast((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4))); + scales[i * 8 + 5] = + ov::float16(scale_scales * static_cast((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4))); + scales[i * 8 + 6] = + ov::float16(scale_scales * static_cast((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4))); + scales[i * 8 + 7] = + ov::float16(scale_scales * static_cast((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4))); + + biases[i * 8] = ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 4) & 0b111111))); + biases[i * 8 + 1] = ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 5) & 0b111111))); + biases[i * 8 + 2] = ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 6) & 0b111111))); + biases[i * 8 + 3] = ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 7) & 0b111111))); + biases[i * 8 + 4] = + ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4))); + biases[i * 8 + 5] = + ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4))); + biases[i * 8 + 6] = + ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4))); + biases[i * 8 + 7] = + ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4))); + unpack_256_4(block_data + 16, weights + i * 128); + }); +} + +void extract_q6_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 128 + 64 + 16 + 2; + const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + // std::string name(tensor.name, tensor.namelen); + for (int64_t i = 0; i < n_super_block; i++) { + uint8_t* block_data = data + i * bytes_per_block; + + float scale_factor = + static_cast(ov::float16::from_bits(*((uint16_t*)block_data + 104))); // (128+64+16)/2 + + for (size_t j = 0; j < 16; j++) { + scales[j + i * 16] = + ov::float16(scale_factor * static_cast(*((int8_t*)(block_data + 128 + 64 + j)))); + biases[j + i * 16] = ov::float16(-32.f * static_cast(scales[j + i * 16])); + } + + // Extract ql and qh + uint8_t* ql = block_data; + uint8_t* qh = block_data + 128; + + // Extract weights + for (int64_t j = 0; j < 32; ++j) { + weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); + weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4); + weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4); + weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4); + weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4); + weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4); + weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4); + weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4); + } + } +} + +// TODO Reorder for make_intX_weights + +ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { + + // Reshape weight to (num_heads, -1, group_size) + ov::Shape orig_shape = weight.get_shape(); + orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t); + size_t num_groups = orig_shape[1] / group_size; + + // Expand dimensions for scales and biases + auto scale_shape = scales.get_shape(); + scale_shape.push_back(1); + scales.set_shape(scale_shape); + biases.set_shape(scale_shape); + + // Create graph nodes + auto weights_node = std::make_shared(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; + auto scales_f16 = std::make_shared(scales); + ov::Tensor biases_u8(ov::element::u8, scale_shape); + + // Calculate zero point + const ov::float16* bias_data = biases.data::value_type>(); + const ov::float16* scale_data = scales.data::value_type>(); + uint8_t* bias_u8_data = biases_u8.data(); + for (size_t i = 0; i < biases_u8.get_size(); ++i) { + bias_u8_data[i] = (uint8_t)std::round(-1.f * static_cast(bias_data[i]) / static_cast(scale_data[i])); + } + + auto zero_point = std::make_shared(biases_u8); + + // Quantization operations + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); + + auto w_zp = std::make_shared( + weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY + ); + auto w_zp_s = std::make_shared( + w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY + ); + + // Reshape back to original dimensions + auto final_shape = std::make_shared( + ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape + ); + auto w_zp_s_r = std::make_shared( + w_zp_s, final_shape, false + ); + + return std::make_shared(w_zp_s_r, ov::element::f32); +} + +ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { + + // Convert weight to uint8 view and adjust shape + ov::Shape orig_weight_shape = weight.get_shape(); + orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2; // Double number of columns for 4-bit representation + + // Expand dimensions for scales and biases + ov::Shape scale_bias_shape = scales.get_shape(); + scale_bias_shape.push_back(1); // Add new axis at the end + scales.set_shape(scale_bias_shape); + biases.set_shape(scale_bias_shape); + + // Create INT4 weight tensor + ov::Shape packed_shape = { + orig_weight_shape[0], + orig_weight_shape[1] / group_size, + group_size + }; + + auto weights_node = std::make_shared(ov::element::u4, packed_shape, static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holde"] = weight; + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + + // Pack zero points: two subsequent values into one + const ov::float16* bias_data = biases.data::value_type>(); + const ov::float16* scale_data = scales.data::value_type>(); + ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape); + uint8_t* zero_point_data = static_cast(zero_point_tensor.data()); + for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) { + uint8_t bias1 = (uint8_t)std::round(-1.f * static_cast(bias_data[i * 2]) / static_cast(scale_data[i * 2])); + uint8_t bias2 = (uint8_t)std::round(-1.f * static_cast(bias_data[i * 2 + 1]) / static_cast(scale_data[i * 2 + 1])); + zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F); + } + + // CVS-166438: GGUF Q4_0 zp array (U4) with all same value (8) will be converted to single U4 scalar via ConvertU4WeightsZeroPointToScalar transformation. + // This corner case can be handled by CPU plugin properly, but will trigger compilation error on GPU plugin. + // Temporal WA by adding one small bias to keep zp array shape for GPU plugin, confirm no accuracy impact for final LLM generation results. + zero_point_data[0] += 1; + + auto zero_points_node = std::make_shared(zero_point_tensor); + auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); + + auto scales_f16 = std::make_shared(scales); + + // Perform dequantization + auto w_zp = std::make_shared( + weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); + + auto w_zp_s = std::make_shared( + w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + + // Reshape back to original shape + auto final_shape = std::make_shared( + ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape); + + auto w_zp_s_r = std::make_shared( + w_zp_s, final_shape, false); + + return std::make_shared(w_zp_s_r, ov::element::f32); +} diff --git a/ggml/src/ggml-openvino/ggml-quant.hpp b/ggml/src/ggml-openvino/ggml-quant.hpp new file mode 100644 index 0000000000000..9c0dd89a95aee --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-quant.hpp @@ -0,0 +1,44 @@ +#include +#include +#include "ggml.h" + +void unpack_32_4(const uint8_t* data, uint8_t* dst); + +void extract_q4_0_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +void extract_q4_1_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +void extract_q8_0_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +void unpack_256_4(const uint8_t* data, uint8_t* dst); + +void extract_q4_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +void extract_q6_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32; + +ov::Output make_int8_weights(ov::Tensor& weight, + ov::Tensor& scales, + ov::Tensor& biases, + size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); + +ov::Output make_int4_weights(ov::Tensor& weight, + ov::Tensor& scales, + ov::Tensor& biases, + size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); From 720a1e044cbbcc6c163fcc60084a8f8c57f3700d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 6 Aug 2025 15:54:40 +0800 Subject: [PATCH 120/156] Quant models run with accuracy issue --- ggml/src/ggml-openvino/ggml-decoder.cpp | 20 ++++++++++++++++++- ggml/src/ggml-openvino/ggml-quant.cpp | 4 +++- .../ggml-openvino/openvino/op/get_rows.cpp | 11 ++++++++-- .../openvino/translate_session.cpp | 1 - ggml/src/ggml-openvino/openvino/utils.cpp | 2 ++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c2e164b808baa..a3e7059fa2147 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -415,6 +417,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) auto node_shape = get_shape(tensor); auto ne_total = ggml_nelements(tensor); + OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); + + // F16 and F32 case if (node_type != ov::element::dynamic) { ov::Tensor weights(node_type, node_shape); memcpy(weights.data(), tensor->data, ne_total * node_type.size()); @@ -426,6 +431,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) return weight_node; } + // Quantized case + node_shape.erase(node_shape.begin()); + uint64_t weights_per_byte; if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { weights_per_byte = 2; @@ -459,7 +467,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) ov::Output weight_node; if (tensor->type == GGML_TYPE_Q4_0) { extract_q4_0_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + weight_node = make_int4_weights(weights, scales, biases, weights_per_block); } else if (tensor->type == GGML_TYPE_Q4_1) { extract_q4_1_data(tensor, weights, scales, biases); weight_node = make_int4_weights(weights, scales, biases, weights_per_block); @@ -474,7 +482,17 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) extract_q4_k_data(tensor, weights, scales, biases); weight_node = make_int4_weights(weights, scales, biases, weights_per_block); } + + OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); + // weight_node = std::make_shared( + // weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0})); + weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); + // GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n", + // tensor->name, + // ggml_type_name(tensor->type), + // weight_node.get_element_type().get_type_name().c_str(), + // weight_node.get_partial_shape().to_string().c_str()); return weight_node.get_node_shared_ptr(); } diff --git a/ggml/src/ggml-openvino/ggml-quant.cpp b/ggml/src/ggml-openvino/ggml-quant.cpp index 4311ab138ea0d..14ef58a3f777a 100644 --- a/ggml/src/ggml-openvino/ggml-quant.cpp +++ b/ggml/src/ggml-openvino/ggml-quant.cpp @@ -1,4 +1,7 @@ +#include "ggml-quant.hpp" + #include +#include #include #include #include @@ -6,7 +9,6 @@ #include #include #include -#include #include "ggml.h" diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 36795fd43eabd..0de77da59ffc5 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -7,6 +6,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -31,11 +31,18 @@ OutputVector translate_get_rows(const NodeContext& context) { indices = process_view_input(context, 1); } - auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + Output axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); if (indices.get_partial_shape()[1].get_length() == 1) { indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + if (data.get_partial_shape().rank() == 2) { + axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + } res = std::make_shared(data, indices, axis); + if (data.get_partial_shape().rank() == 2) { + res = + std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + } } else { indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 3e27a689d52ff..62804670414ef 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -212,7 +212,6 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); - manager.register_pass(); if (!ggml_model_decoder->is_static()) { const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index c4197ccc3abdc..ef5f51ebbc4a7 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -17,6 +17,8 @@ #include #include +#include "ggml-impl.h" + namespace ov { namespace frontend { namespace ggml { From 7fc46fab01d666e6612b4fc3f910f5e7610da84d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 7 Aug 2025 14:25:20 +0800 Subject: [PATCH 121/156] Fix accuracy: disable cpu_repack --- docs/build.md | 2 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ++++ ggml/src/ggml-openvino/ggml-openvino.cpp | 3 +-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/build.md b/docs/build.md index c7e15a4e78482..e2ef8b4e08b5b 100644 --- a/docs/build.md +++ b/docs/build.md @@ -648,7 +648,7 @@ git switch dev_backend_openvino # Build with OpenVINO support source /opt/intel/openvino/setupvars.sh -cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON +cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF cmake --build build/ReleaseOV --config Release -j $(nproc) ``` diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a3e7059fa2147..cd897e5f688bb 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -432,6 +432,10 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) } // Quantized case + OPENVINO_ASSERT( + tensor->extra == nullptr, + "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights"); + node_shape.erase(node_shape.begin()); uint64_t weights_per_byte; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index f81b1ee4834de..23a92c58ac8b8 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -420,8 +420,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - // TODO quantized weigts are cpu_repack_buffer_type which does not implement ggml_backend_buft_is_host - return ggml_backend_buft_is_host(buft) || strcmp(buft->device->iface.get_name(buft->device), "CPU") == 0; + return ggml_backend_buft_is_host(buft); GGML_UNUSED(dev); } From 5ae2d39a270a9988bd1ca291441d8251d5b5d797 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 7 Aug 2025 15:22:58 +0800 Subject: [PATCH 122/156] Fix CI; Disable test-backend-ops --- ci/run.sh | 2 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 2 +- .../ggml-openvino/{ggml-quant.cpp => ggml-quants.cpp} | 10 +++++----- .../ggml-openvino/{ggml-quant.hpp => ggml-quants.hpp} | 0 4 files changed, 7 insertions(+), 7 deletions(-) rename ggml/src/ggml-openvino/{ggml-quant.cpp => ggml-quants.cpp} (98%) rename ggml/src/ggml-openvino/{ggml-quant.hpp => ggml-quants.hpp} (100%) diff --git a/ci/run.sh b/ci/run.sh index a06cf22fffc51..3456005ffbe13 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -155,7 +155,7 @@ if [ ! -z ${GG_BUILD_OPENVINO} ]; then echo "source /opt/intel/openvino/setupvars.sh" exit 1 fi - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF" fi ## helpers diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index cd897e5f688bb..cde99f32883bf 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -32,7 +32,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-quant.hpp" +#include "ggml-quants.hpp" GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size) : diff --git a/ggml/src/ggml-openvino/ggml-quant.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp similarity index 98% rename from ggml/src/ggml-openvino/ggml-quant.cpp rename to ggml/src/ggml-openvino/ggml-quants.cpp index 14ef58a3f777a..8d4fb141896f4 100644 --- a/ggml/src/ggml-openvino/ggml-quant.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -1,4 +1,4 @@ -#include "ggml-quant.hpp" +#include "ggml-quants.hpp" #include #include @@ -75,11 +75,11 @@ void extract_q8_0_data(const ggml_tensor* tensor, auto weights = static_cast(weights_arr.data()); auto scales = scales_arr.data::value_type>(); auto biases = biases_arr.data::value_type>(); - for (int64_t i = 0; i < scales_arr.get_size(); i++) { + for (size_t i = 0; i < scales_arr.get_size(); i++) { uint8_t* block_data = data + i * bytes_per_block; scales[i] = ov::float16::from_bits(*(uint16_t*)block_data); biases[i] = ov::float16(-128.f * static_cast(scales[i])); - for (int64_t j = 0; j < weights_per_block; ++j) { + for (size_t j = 0; j < weights_per_block; ++j) { uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. // Original data is in int8_t, so we add a bias of -128 and invert the // first bit. @@ -128,7 +128,7 @@ void extract_q4_k_data(const ggml_tensor* tensor, // Extract qs1 and qs2 uint8_t* qs1 = block_data + 4; - uint8_t* qs2 = block_data + 16; + // uint8_t* qs2 = block_data + 16; scales[i * 8] = ov::float16(scale_scales * static_cast((*(qs1) & 0b111111))); scales[i * 8 + 1] = ov::float16(scale_scales * static_cast((*(qs1 + 1) & 0b111111))); @@ -170,7 +170,7 @@ void extract_q6_k_data(const ggml_tensor* tensor, auto scales = scales_arr.data::value_type>(); auto biases = biases_arr.data::value_type>(); // std::string name(tensor.name, tensor.namelen); - for (int64_t i = 0; i < n_super_block; i++) { + for (size_t i = 0; i < n_super_block; i++) { uint8_t* block_data = data + i * bytes_per_block; float scale_factor = diff --git a/ggml/src/ggml-openvino/ggml-quant.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp similarity index 100% rename from ggml/src/ggml-openvino/ggml-quant.hpp rename to ggml/src/ggml-openvino/ggml-quants.hpp From 545630219c9dc3e35534349f1cb192efd95b9af2 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 8 Aug 2025 11:07:10 +0800 Subject: [PATCH 123/156] Fix Q4_1 --- ggml/src/ggml-openvino/ggml-quants.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 8d4fb141896f4..e969b0b54adfc 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -15,8 +15,8 @@ void unpack_32_4(const uint8_t* data, uint8_t* dst) { std::fill_n(dst, 16, 0); for (int j = 0; j < 16; ++j) { - uint8_t x = (data[j + 2] & 0x0F); // j+2 to skip scale bytes. - uint8_t y = (data[j + 2] >> 4); + uint8_t x = (data[j] & 0x0F); + uint8_t y = (data[j] >> 4); if (j % 2 != 0) { x <<= 4; y <<= 4; @@ -41,7 +41,7 @@ void extract_q4_0_data(const ggml_tensor* tensor, ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); biases[i] = ov::float16(-8.f * static_cast(scales[i])); - unpack_32_4(data + i * bytes_per_block, weights + i * 16); + unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); }); } @@ -58,8 +58,8 @@ void extract_q4_1_data(const ggml_tensor* tensor, auto biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); - biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 1))); - unpack_32_4(data + i * bytes_per_block, weights + i * 16); + biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 2))); + unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); }); } From 51833fb2df38e37b71821ed794c98dea023971a4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 8 Aug 2025 15:15:12 +0800 Subject: [PATCH 124/156] Fix test-thread-safety --- tests/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1b77876f7ed3a..677d4e01d8c60 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -190,9 +190,6 @@ if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") else() llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) endif() -if (NOT GGML_OPENVINO) - llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) -endif() # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135) if (NOT WIN32) From 1b459c74cbb83ebf204233c1a29f5975ab23504d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 12 Aug 2025 09:44:21 +0800 Subject: [PATCH 125/156] Fix test-backend-ops: Treat quantized tensors as weights --- ggml/src/ggml-openvino/ggml-decoder.cpp | 16 ++++++++++------ ggml/src/ggml-openvino/ggml-decoder.h | 5 +++-- ggml/src/ggml-openvino/ggml-openvino.cpp | 14 +++++++++++--- ggml/src/ggml-openvino/utils.cpp | 6 +++++- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index cde99f32883bf..b20bfd0c76f52 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -76,13 +76,15 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, add_extra_inputs(); } -GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { +GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, + std::map>& model_weights) { if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph.txt"; dump_cgraph(cgraph, filename); } m_cgraph = cgraph; + m_model_weights = model_weights; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto* cur_node = cgraph->nodes[node_n]; if (cur_node->op == GGML_OP_NONE) { @@ -123,10 +125,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { // Add model inputs and weights constants, if called for the whole graph if (naive) { - auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); - param_node->set_friendly_name(src_name); - param_node->output(0).get_tensor().set_names({src_name}); - m_model_inputs[src_name] = param_node; + if (m_model_weights.find(src_name) == m_model_weights.end()) { + auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + } } else if (!m_node && !src->view_src) { ggml_backend_buffer* buffer = src->buffer; @@ -381,7 +385,7 @@ std::map> GgmlOvDecoder::create_weight_no std::string src_name(src->name); if (!src->view_src) { ggml_backend_buffer* buffer = src->buffer; - if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) { bool should_create = false; { std::lock_guard lock(weights_mutex); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index ae378273d32e0..df23c649f4f47 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -20,7 +20,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { int context_size, int num_heads, int num_heads_kv, int head_size); // Naive graph decoder - GgmlOvDecoder(struct ggml_cgraph* cgraph); + GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; @@ -115,6 +115,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; + static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); + static std::shared_ptr create_weight_node(ggml_tensor* tensor); static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); @@ -126,7 +128,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { private: void set_input_output(ggml_tensor* node, bool naive = false); void add_extra_inputs(); - static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); static ov::element::Type get_ov_type(const ggml_tensor* tensor); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 23a92c58ac8b8..4b743be6884b0 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -403,14 +403,22 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con return false; } for (int i = 0; i < GGML_MAX_SRC; i++) { - if (supported_types.find(op->type) == supported_types.end()) { - GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); + auto* src = op->src[i]; + if (src == nullptr) { + break; + } + if (supported_types.find(src->type) == supported_types.end()) { + GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(src->type)); return false; } - if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) { + if (src->ne[3] != 1) { GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); return false; } + if (ggml_is_quantized(src->type) && src->ne[2] != 1) { + GGML_LOG_WARN("OpenVINO backend does not support 3D quantized tensors\n"); + return false; + } } if (is_op_unsupported_case(op)) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 473fa72f99fd5..43fa0c469d60a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -281,10 +281,14 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph, return GGML_STATUS_FAILED; } - auto decoder = std::make_shared(cgraph); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + auto decoder = std::make_shared(cgraph, model_weights); auto input_model = std::make_shared(decoder); auto naive = true; auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); + if (getenv("GGML_OPENVINO_DUMP_IR")) { + ov::serialize(model, "IR_naive.xml"); + } auto infer_request = core.compile_model(model, device, config).create_infer_request(); auto ov_params = model->get_parameters(); From 2c93c011d5f9c76cf94f2ab77638b39929ca6772 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 19 Aug 2025 14:56:28 +0800 Subject: [PATCH 126/156] Add NPU Q4_0 support --- ggml/src/ggml-openvino/ggml-openvino.cpp | 28 +++++++++++++++--------- ggml/src/ggml-openvino/ggml-quants.cpp | 13 ++++++----- ggml/src/ggml-openvino/ggml-quants.hpp | 13 +++++++++++ 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 4b743be6884b0..a6ec1c64c2904 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -333,16 +333,24 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) { GGML_ASSERT(dev->reg != nullptr); - static const std::set supported_types{GGML_TYPE_F32, - GGML_TYPE_F16, - GGML_TYPE_BF16, - GGML_TYPE_I64, - GGML_TYPE_I32, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, - GGML_TYPE_Q4_K, - GGML_TYPE_Q8_0, - GGML_TYPE_Q6_K}; + static std::set supported_types{GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_I64, + GGML_TYPE_I32, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q8_0, + GGML_TYPE_Q6_K}; + + std::string device = std::string(getenv("GGML_OPENVINO_DEVICE")); + bool is_npu = device == "NPU"; + if (is_npu) { + // NPU has poor support for asymmetric quantization + supported_types.erase(GGML_TYPE_Q4_1); + supported_types.erase(GGML_TYPE_Q4_K); + } static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index e969b0b54adfc..97aa494ed85aa 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -230,6 +230,10 @@ ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o } auto zero_point = std::make_shared(biases_u8); + float zp_value; + if (ov::op::util::get_single_value(zero_point, zp_value)) { + zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); + } // Quantization operations auto weights_f16 = std::make_shared(weights_node, ov::element::f16); @@ -287,12 +291,11 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F); } - // CVS-166438: GGUF Q4_0 zp array (U4) with all same value (8) will be converted to single U4 scalar via ConvertU4WeightsZeroPointToScalar transformation. - // This corner case can be handled by CPU plugin properly, but will trigger compilation error on GPU plugin. - // Temporal WA by adding one small bias to keep zp array shape for GPU plugin, confirm no accuracy impact for final LLM generation results. - zero_point_data[0] += 1; - auto zero_points_node = std::make_shared(zero_point_tensor); + float zp_value; + if (ov::op::util::get_single_value(zero_points_node, zp_value)) { + zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); + } auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); auto scales_f16 = std::make_shared(scales); diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 9c0dd89a95aee..ae37b1618ed14 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -1,5 +1,7 @@ #include +#include #include + #include "ggml.h" void unpack_32_4(const uint8_t* data, uint8_t* dst); @@ -42,3 +44,14 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); + +namespace ov { +namespace op { +namespace util { +// From /src/common/transformations/include/transformations/utils/utils.hpp +bool get_single_value(const std::shared_ptr& const_node, + float& value, + bool check_value_range = true); +} // namespace util +} // namespace op +} // namespace ov From ac4a25a6f5523f75b4c7052b9d3160b6dff7bf50 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 22 Aug 2025 15:00:38 +0800 Subject: [PATCH 127/156] NPU perf: eliminate zp --- .../openvino/pass/eliminate_zp.cpp | 116 ++++++++++++++++++ .../openvino/pass/eliminate_zp.hpp | 17 +++ .../openvino/translate_session.cpp | 2 + 3 files changed, 135 insertions(+) create mode 100644 ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp create mode 100644 ggml/src/ggml-openvino/openvino/pass/eliminate_zp.hpp diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp new file mode 100644 index 0000000000000..d2e5a040dd28f --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp @@ -0,0 +1,116 @@ +#include "eliminate_zp.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace pass { + +EliminateZeroPoints::EliminateZeroPoints() { + // Find pattern: + // (Multiply Any(scale) + // (Subtract (Convert Constant(data))) + // (Convert Constant(zero_point))) + // where zero_point is a scalar + // If data is u4 and zp value is 8 (q4_0), Replace the Subtract with an i4 Constant whose value is data - zp_val + // If data is u8 and zp value is 128 (q8_0) or 32 (q6_k), Replace the Subtract with an i8 Constant + + auto m_data_constant = ov::pass::pattern::wrap_type(); + auto m_data_convert = ov::pass::pattern::wrap_type({m_data_constant}); + + auto m_zp_constant = ov::pass::pattern::wrap_type(); + auto m_zp_convert = ov::pass::pattern::wrap_type({m_zp_constant}); + + auto m_subtract = ov::pass::pattern::wrap_type({m_data_convert, m_zp_convert}); + auto m_scale = ov::pass::pattern::any_input(); + auto m_multiply = ov::pass::pattern::wrap_type({m_scale, m_subtract}); + + const auto callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + auto multiply_node = std::dynamic_pointer_cast(pattern_map.at(m_multiply).get_node_shared_ptr()); + auto subtract_node = std::dynamic_pointer_cast(pattern_map.at(m_subtract).get_node_shared_ptr()); + auto data_constant = std::dynamic_pointer_cast(pattern_map.at(m_data_constant).get_node_shared_ptr()); + auto zp_constant = std::dynamic_pointer_cast(pattern_map.at(m_zp_constant).get_node_shared_ptr()); + + if (!multiply_node || !subtract_node || !data_constant || !zp_constant) { + return false; + } + + if (ov::shape_size(zp_constant->get_shape()) != 1) { + return false; + } + + auto data_type = data_constant->get_element_type(); + auto zp_data = zp_constant->cast_vector(); + + if (zp_data.empty()) { + return false; + } + + int zp_value = zp_data[0]; + + bool should_eliminate = false; + ov::element::Type target_type; + + if (data_type == ov::element::u4 && zp_value == 8) { + should_eliminate = true; + target_type = ov::element::i4; + } else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) { + should_eliminate = true; + target_type = ov::element::i8; + } + + if (!should_eliminate) { + return false; + } + + auto data_shape = data_constant->get_shape(); + size_t total_elements = ov::shape_size(data_shape); + + std::shared_ptr new_constant; + + if (data_type == ov::element::u4) { + auto data_values = data_constant->cast_vector(); + std::vector adjusted_values(total_elements); + + ov::parallel_for(total_elements, [&](size_t i) { + adjusted_values[i] = static_cast(static_cast(data_values[i]) - 8); + }); + + new_constant = std::make_shared(target_type, data_shape, adjusted_values); + } else if (data_type == ov::element::u8) { + auto data_values = data_constant->cast_vector(); + std::vector adjusted_values(total_elements); + + ov::parallel_for(total_elements, [&, zp_value](size_t i) { + adjusted_values[i] = static_cast(static_cast(data_values[i]) - zp_value); + }); + + new_constant = std::make_shared(target_type, data_shape, adjusted_values); + } + + auto new_convert = std::make_shared(new_constant, subtract_node->get_output_element_type(0)); + ov::replace_node(subtract_node, new_convert); + + return true; + }; + + register_matcher(std::make_shared(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"), + callback); +} + +} // namespace pass +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.hpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.hpp new file mode 100644 index 0000000000000..edd3cd718d9b0 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.hpp @@ -0,0 +1,17 @@ +#include "openvino/pass/matcher_pass.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace pass { + +class EliminateZeroPoints : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints") + EliminateZeroPoints(); +}; + +} // namespace pass +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 62804670414ef..634fea40e923f 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -26,6 +26,7 @@ #include "ggml-openvino/openvino/node_context.hpp" #include "ggml-openvino/openvino/utils.hpp" #include "input_model.hpp" +#include "pass/eliminate_zp.hpp" #include "pass/fuse_to_sdpa.hpp" #include "pass/mark_decompression_convert_constant_folding.hpp" @@ -219,6 +220,7 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(kv_param_res_pairs); } + manager.register_pass(); manager.register_pass(); manager.run_passes(model); } From 41121bf6436b8f15d059e77968179510e70cef7d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 26 Aug 2025 15:55:06 +0800 Subject: [PATCH 128/156] NPU perf: Faster compilation --- IR.xml | 462 +++++++++++++++++++++++++++ ggml/src/ggml-openvino/utils.cpp.bak | 72 +++++ 2 files changed, 534 insertions(+) create mode 100644 IR.xml create mode 100644 ggml/src/ggml-openvino/utils.cpp.bak diff --git a/IR.xml b/IR.xml new file mode 100644 index 0000000000000..f5b1df8740a66 --- /dev/null +++ b/IR.xml @@ -0,0 +1,462 @@ + + + + + + + + 2 + 128 + 64 + + + + + + + + 1 + 1 + 32 + + + + + + + + 1 + 1 + 2 + + + + + + + + + + + + + + 2 + 128 + 64 + + + + + + 2 + 128 + 32 + + + 2 + 128 + 32 + + + + + + + + 1 + 1 + 32 + + + + + + + + 1 + 1 + 32 + + + 1 + 1 + 32 + + + + + 1 + 1 + 32 + + + + + + + + 1 + 1 + 2 + + + + + 1 + 1 + 2 + + + + + + + + 3 + + + + + + + 1 + 1 + 2 + + + 3 + + + + + 2 + 1 + 1 + + + + + + + + 1 + 1 + 32 + + + 2 + 1 + 1 + + + + + 2 + 1 + 32 + + + + + + + + 1 + + + + + + + + 2 + 1 + 32 + + + 1 + + + + + 2 + 1 + 32 + + + + + + + 2 + 1 + 32 + + + + + 2 + 1 + 32 + + + + + + + + + + + + + + 2 + 1 + 32 + + + + + + 2 + 1 + 32 + + + + + + + + 2 + 128 + 32 + + + 2 + 1 + 32 + + + + + 2 + 128 + 32 + + + + + + + 2 + 1 + 32 + + + + + 2 + 1 + 32 + + + + + + + + 2 + 1 + 32 + + + + + + 2 + 1 + 32 + + + + + + + + 2 + 128 + 32 + + + 2 + 1 + 32 + + + + + 2 + 128 + 32 + + + + + + + + 2 + 128 + 32 + + + 2 + 128 + 32 + + + + + 2 + 128 + 32 + + + + + + + + 2 + 128 + 32 + + + 2 + 1 + 32 + + + + + 2 + 128 + 32 + + + + + + + + 2 + 128 + 32 + + + 2 + 1 + 32 + + + + + 2 + 128 + 32 + + + + + + + + 2 + 128 + 32 + + + 2 + 128 + 32 + + + + + 2 + 128 + 32 + + + + + + + + 2 + 128 + 32 + + + 2 + 128 + 32 + + + + + 2 + 128 + 64 + + + + + + + 2 + 128 + 64 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ggml/src/ggml-openvino/utils.cpp.bak b/ggml/src/ggml-openvino/utils.cpp.bak new file mode 100644 index 0000000000000..8fef1985f91ae --- /dev/null +++ b/ggml/src/ggml-openvino/utils.cpp.bak @@ -0,0 +1,72 @@ +void model_cut() { + ov::Core core; + std::shared_ptr model = + core.read_model("/home/zijun/dev/llama.cpp-ov/tmp/fold_graph/Model1_01_0x5555601c5ac0.xml"); + + ov::ParameterVector new_params; + + auto ops = model->get_ops(); + std::shared_ptr node_a; + std::shared_ptr node_b; + for (const auto& op : ops) { + if (op->get_friendly_name() == "Multiply_4636_ffn_norm-0") { + node_a = op; + } else if (op->get_friendly_name() == "Multiply_4645_ffn_gate_par-0") { + node_b = op; + } else if (op->get_friendly_name() == "Parameter_39914") { + auto param = std::dynamic_pointer_cast(op); + new_params.push_back(param); + } else if (op->get_friendly_name() == "Parameter_39915") { + auto param = std::dynamic_pointer_cast(op); + new_params.push_back(param); + } + } + + auto subgraph_input_tensor = node_a->output(0); + auto subgraph_output_tensor = node_b->output(0); + + auto new_input = std::make_shared(subgraph_input_tensor.get_element_type(), + subgraph_input_tensor.get_shape()); + new_input->set_friendly_name("subgraph_input"); + new_params.push_back(new_input); + + // Rewire: replace all consumers of original tensor with new input + subgraph_input_tensor.replace(new_input); + + auto result = std::make_shared(subgraph_output_tensor); + result->set_friendly_name("subgraph_output"); + + auto subgraph = std::make_shared(ov::ResultVector{result}, new_params, "trimmed_subgraph"); + + ov::serialize(subgraph, "/home/zijun/dev/llama.cpp-ov/tmp/subgraph.xml"); + + assert(false); +} + +void create_graph() { + // Input shapes: [256, 1, 1] + ov::Shape input_shape{256, 1, 1}; + + // Define input parameters + auto input0 = std::make_shared(ov::element::f32, input_shape); + auto input1 = std::make_shared(ov::element::f32, input_shape); + + // Concat on axis 2 -> shape becomes [256, 1, 2] + auto concat = std::make_shared(ov::OutputVector{input0, input1}, 2); + + // Target shape constant for reshape: [256, 2] + auto reshape_shape = ov::op::v0::Constant::create(ov::element::i64, {2}, {256, 2}); + + // special_zero = false + auto reshape = std::make_shared(concat, reshape_shape, false); + + // Define result node + auto result = std::make_shared(reshape); + + // Create model + auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}, "ReshapeConcatModel"); + + ov::serialize(subgraph, "/home/zijun/dev/llama.cpp-ov/tmp/subgraph3.xml"); + + exit(0); +} From 7bb86bb4a820bc77de04df1af4cb4072b9f85fc4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 29 Aug 2025 11:39:27 +0800 Subject: [PATCH 129/156] Dequantize q4_1 q4_k q6_k for NPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 25 +++++++++++++++++------- ggml/src/ggml-openvino/ggml-decoder.h | 5 +++-- ggml/src/ggml-openvino/ggml-openvino.cpp | 8 -------- ggml/src/ggml-openvino/utils.cpp | 6 +++++- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b20bfd0c76f52..fef8648ebdac4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -370,7 +370,8 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const return kv_param_res_names; } -std::map> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) { +std::map> GgmlOvDecoder::create_weight_nodes( + struct ggml_cgraph* cgraph, std::set types_to_dequantize) { std::map> model_weights; static std::mutex weights_mutex; auto* nodes = cgraph->nodes; @@ -395,7 +396,7 @@ std::map> GgmlOvDecoder::create_weight_no } } if (should_create) { - auto weight_node = create_weight_node(src); + auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0); weight_node->set_friendly_name(src_name); { std::lock_guard lock(weights_mutex); @@ -409,7 +410,7 @@ std::map> GgmlOvDecoder::create_weight_no return model_weights; } -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) { std::set weight_types = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { @@ -422,15 +423,17 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) auto ne_total = ggml_nelements(tensor); OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); + node_shape.erase(node_shape.begin()); // F16 and F32 case if (node_type != ov::element::dynamic) { ov::Tensor weights(node_type, node_shape); memcpy(weights.data(), tensor->data, ne_total * node_type.size()); std::shared_ptr weight_node = std::make_shared(weights); - if (node_type == ov::element::f16) { - weight_node = std::make_shared(weight_node, ov::element::f32); - } + // Disabled because it triggers a bug in NPUW, no performance impact on CPU GPU + // if (node_type == ov::element::f16) { + // weight_node = std::make_shared(weight_node, ov::element::f32); + // } weight_node->set_friendly_name(tensor->name); return weight_node; } @@ -440,7 +443,15 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights"); - node_shape.erase(node_shape.begin()); + if (to_dequantize) { + std::vector weights_f32(ne_total); + ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor)); + ov::Tensor weights(ov::element::f16, node_shape); + ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor)); + std::shared_ptr weight_node = std::make_shared(weights); + weight_node->set_friendly_name(tensor->name); + return weight_node; + } uint64_t weights_per_byte; if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index df23c649f4f47..b446841514794 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -117,8 +117,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); - static std::shared_ptr create_weight_node(ggml_tensor* tensor); - static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); + static std::shared_ptr create_weight_node(ggml_tensor* tensor, bool to_dequantize); + static std::map> create_weight_nodes( + struct ggml_cgraph* cgraph, std::set types_to_dequantize = {}); const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; const ggml_tensor* get_tensor_from_name(const std::string& name) const; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index a6ec1c64c2904..60a2eb388ea1e 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -344,14 +344,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; - std::string device = std::string(getenv("GGML_OPENVINO_DEVICE")); - bool is_npu = device == "NPU"; - if (is_npu) { - // NPU has poor support for asymmetric quantization - supported_types.erase(GGML_TYPE_Q4_1); - supported_types.erase(GGML_TYPE_Q4_K); - } - static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 43fa0c469d60a..e49d941da4ab1 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -130,7 +130,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compile_end_time = conversion_end_time; } else { std::shared_ptr model; - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + std::set types_to_dequantize; + if (is_static) { + types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; + } + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize); if (is_static) { ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); From 692778c4d6968c066f57404272ec13ea217a8961 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 2 Sep 2025 13:52:45 +0800 Subject: [PATCH 130/156] Add custom quant type: q8_1_c, q4_0_128 --- ggml/src/ggml-openvino/ggml-decoder.cpp | 44 ++---- ggml/src/ggml-openvino/ggml-decoder.h | 7 +- ggml/src/ggml-openvino/ggml-quants.cpp | 194 +++++++++++++++++++----- ggml/src/ggml-openvino/ggml-quants.hpp | 10 ++ ggml/src/ggml-openvino/utils.cpp | 16 +- 5 files changed, 203 insertions(+), 68 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index fef8648ebdac4..d00b78e891ee0 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -371,7 +372,7 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const } std::map> GgmlOvDecoder::create_weight_nodes( - struct ggml_cgraph* cgraph, std::set types_to_dequantize) { + struct ggml_cgraph* cgraph, std::map types_to_requantize) { std::map> model_weights; static std::mutex weights_mutex; auto* nodes = cgraph->nodes; @@ -396,7 +397,10 @@ std::map> GgmlOvDecoder::create_weight_no } } if (should_create) { - auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0); + auto requant_type = types_to_requantize.count(src->type) ? + std::optional(types_to_requantize.at(src->type)) : + std::nullopt; + auto weight_node = create_weight_node(src, requant_type); weight_node->set_friendly_name(src_name); { std::lock_guard lock(weights_mutex); @@ -410,7 +414,8 @@ std::map> GgmlOvDecoder::create_weight_no return model_weights; } -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) { +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, + std::optional requant_type) { std::set weight_types = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { @@ -443,21 +448,15 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights"); - if (to_dequantize) { - std::vector weights_f32(ne_total); - ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor)); - ov::Tensor weights(ov::element::f16, node_shape); - ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor)); - std::shared_ptr weight_node = std::make_shared(weights); - weight_node->set_friendly_name(tensor->name); - return weight_node; + if (requant_type.has_value()) { + return requantize(tensor, requant_type.value()); } - uint64_t weights_per_byte; + ov::element::Type weight_type; if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { - weights_per_byte = 2; + weight_type = ov::element::u4; } else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K - weights_per_byte = 1; + weight_type = ov::element::u8; } uint64_t weights_per_block; @@ -474,15 +473,12 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, " has incompatible last dim shape: ", node_shape.back()); - auto weights_shape = node_shape; - weights_shape.back() /= (weights_per_byte * 4); // means u32 type can store 8 q4 or 4 q8 - - ov::Tensor weights(ov::element::u32, weights_shape); - // For scales and bias + ov::Tensor weights(weight_type, node_shape); + // For scales and biases node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block; - ov::Tensor scales(ov::element::f16, node_shape); ov::Tensor biases(ov::element::f16, node_shape); + ov::Output weight_node; if (tensor->type == GGML_TYPE_Q4_0) { extract_q4_0_data(tensor, weights, scales, biases); @@ -494,7 +490,6 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, extract_q8_0_data(tensor, weights, scales, biases); weight_node = make_int8_weights(weights, scales, biases, weights_per_block); } else if (tensor->type == GGML_TYPE_Q6_K) { - // due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled. extract_q6_k_data(tensor, weights, scales, biases); weight_node = make_int8_weights(weights, scales, biases, weights_per_block); } else if (tensor->type == GGML_TYPE_Q4_K) { @@ -503,15 +498,8 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, } OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); - // weight_node = std::make_shared( - // weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0})); weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); - // GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n", - // tensor->name, - // ggml_type_name(tensor->type), - // weight_node.get_element_type().get_type_name().c_str(), - // weight_node.get_partial_shape().to_string().c_str()); return weight_node.get_node_shared_ptr(); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index b446841514794..24e1d92dcfd68 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -4,8 +4,10 @@ #include #include #include +#include #include +#include "ggml-quants.hpp" #include "ggml.h" #include "openvino/decoder.hpp" @@ -117,9 +119,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); - static std::shared_ptr create_weight_node(ggml_tensor* tensor, bool to_dequantize); + static std::shared_ptr create_weight_node(ggml_tensor* tensor, + std::optional requant_type = std::nullopt); static std::map> create_weight_nodes( - struct ggml_cgraph* cgraph, std::set types_to_dequantize = {}); + struct ggml_cgraph* cgraph, std::map types_to_requantize = {}); const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; const ggml_tensor* get_tensor_from_name(const std::string& name) const; diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 97aa494ed85aa..1603e65355274 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -1,15 +1,20 @@ #include "ggml-quants.hpp" #include +#include +#include #include #include +#include #include #include #include #include #include #include +#include +#include "ggml-impl.h" #include "ggml.h" void unpack_32_4(const uint8_t* data, uint8_t* dst) { @@ -203,20 +208,24 @@ void extract_q6_k_data(const ggml_tensor* tensor, // TODO Reorder for make_intX_weights ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { - - // Reshape weight to (num_heads, -1, group_size) ov::Shape orig_shape = weight.get_shape(); - orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t); - size_t num_groups = orig_shape[1] / group_size; // Expand dimensions for scales and biases auto scale_shape = scales.get_shape(); - scale_shape.push_back(1); - scales.set_shape(scale_shape); - biases.set_shape(scale_shape); + + ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size}; + + if (packed_shape[1] == 1) { + packed_shape.erase(packed_shape.begin() + 1); + } else { + scale_shape.push_back(1); + scales.set_shape(scale_shape); + biases.set_shape(scale_shape); + } // Create graph nodes - auto weights_node = std::make_shared(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast(weight.data()), nullptr); + auto weights_node = std::make_shared( + ov::element::u8, packed_shape, static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); ov::Tensor biases_u8(ov::element::u8, scale_shape); @@ -242,32 +251,24 @@ ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o auto w_zp = std::make_shared( weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY ); - auto w_zp_s = std::make_shared( - w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY - ); - - // Reshape back to original dimensions - auto final_shape = std::make_shared( - ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape - ); - auto w_zp_s_r = std::make_shared( - w_zp_s, final_shape, false - ); + ov::Output w_zp_s = + std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + + if (packed_shape.size() != 2) { + // If not requantized channel-wise case, reshape back to original shape + auto final_shape = + std::make_shared(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape); + w_zp_s = std::make_shared(w_zp_s, final_shape, false); + } - return std::make_shared(w_zp_s_r, ov::element::f32); + return std::make_shared(w_zp_s, ov::element::f32); } ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { - - // Convert weight to uint8 view and adjust shape ov::Shape orig_weight_shape = weight.get_shape(); - orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2; // Double number of columns for 4-bit representation // Expand dimensions for scales and biases ov::Shape scale_bias_shape = scales.get_shape(); - scale_bias_shape.push_back(1); // Add new axis at the end - scales.set_shape(scale_bias_shape); - biases.set_shape(scale_bias_shape); // Create INT4 weight tensor ov::Shape packed_shape = { @@ -276,8 +277,17 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o group_size }; + // Requantized channel-wise case + if (packed_shape[1] == 1) { + packed_shape.erase(packed_shape.begin() + 1); + } else { + scale_bias_shape.push_back(1); + scales.set_shape(scale_bias_shape); + biases.set_shape(scale_bias_shape); + } + auto weights_node = std::make_shared(ov::element::u4, packed_shape, static_cast(weight.data()), nullptr); - weights_node->get_rt_info()["__gguf_tensor_holde"] = weight; + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto weights_f16 = std::make_shared(weights_node, ov::element::f16); // Pack zero points: two subsequent values into one @@ -304,15 +314,129 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o auto w_zp = std::make_shared( weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); - auto w_zp_s = std::make_shared( - w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + ov::Output w_zp_s = + std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + + if (packed_shape.size() != 2) { + // If not requantized channel-wise case, reshape back to original shape + auto final_shape = std::make_shared( + ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape); + + w_zp_s = std::make_shared(w_zp_s, final_shape, false); + } + + return std::make_shared(w_zp_s, ov::element::f32); +} - // Reshape back to original shape - auto final_shape = std::make_shared( - ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape); +std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) { + std::vector weights_f32(tensor->ne[0] * tensor->ne[1]); + ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor)); - auto w_zp_s_r = std::make_shared( - w_zp_s, final_shape, false); + std::shared_ptr weight_node; + ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])}; + + if (requant_type == ExtraQuantType::F16) { + ov::Tensor weights(ov::element::f16, node_shape); + ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor)); + std::shared_ptr weight_node = std::make_shared(weights); + weight_node->set_friendly_name(tensor->name); + return weight_node; + } - return std::make_shared(w_zp_s_r, ov::element::f32); + int64_t block_size = node_shape[1]; + if (requant_type == ExtraQuantType::Q4_0_128) { + block_size = 128; + } + auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size}; + + ov::Tensor weights; + ov::Tensor scales(ov::element::f16, scales_shape); + ov::Tensor bias(ov::element::f16, scales_shape); + + if (requant_type == ExtraQuantType::Q4_0_C) { + weights = ov::Tensor(ov::element::u4, node_shape); + quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); + weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr(); + } else if (requant_type == ExtraQuantType::Q8_1_C) { + weights = ov::Tensor(ov::element::u8, node_shape); + quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); + weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr(); + } else if (requant_type == ExtraQuantType::Q4_0_128) { + weights = ov::Tensor(ov::element::u4, node_shape); + quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); + weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr(); + } + + weight_node->set_friendly_name(tensor->name); + return weight_node; +} + +void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk) { + assert(k % qk == 0); + const int nb = k / qk; + + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f / d : 0.0f; + scales[i] = ov::float16(d); + biases[i] = ov::float16(-8.f * d); + + for (int j = 0; j < qk / 2; ++j) { + const float x0 = x[i * qk + 2 * j] * id; + const float x1 = x[i * qk + 2 * j + 1] * id; + const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f)); + weights[i * qk / 2 + j] = xi0 | (xi1 << 4); + } + } +} + +void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk) { + assert(k % qk == 0); + const int nb = k / qk; + + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + for (int i = 0; i < nb; i++) { + float min = std::numeric_limits::max(); + float max = std::numeric_limits::lowest(); + + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + if (v < min) { + min = v; + } + if (v > max) { + max = v; + } + } + + const float d = (max - min) / ((1 << 8) - 1); + const float id = d ? 1.0f / d : 0.0f; + scales[i] = ov::float16(d); + biases[i] = ov::float16(min); + + for (int j = 0; j < qk; ++j) { + const float x0 = (x[i * qk + j] - min) * id; + const uint8_t xi0 = roundf(x0); + weights[i * qk + j] = xi0; + } + } } diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index ae37b1618ed14..fbae2aa1f43ef 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -1,3 +1,4 @@ +#pragma once #include #include #include @@ -45,6 +46,15 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); +enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 }; + +std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type); + +void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk); +void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk); + namespace ov { namespace op { namespace util { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index e49d941da4ab1..3f728c242dd41 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -130,11 +130,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compile_end_time = conversion_end_time; } else { std::shared_ptr model; - std::set types_to_dequantize; + std::map types_to_requantize; if (is_static) { - types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; + types_to_requantize = { + {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C }, + }; + } else if (device == "GPU") { + types_to_requantize = { + // CVS-166739 + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, + }; } - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize); if (is_static) { ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); From 9f9dd7fbd7771cd4c3adbeff8647156f946b8283 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 2 Sep 2025 14:52:04 +0800 Subject: [PATCH 131/156] Set m_is_static=false as default in decoder --- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 24e1d92dcfd68..4ba147da20d62 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -161,7 +161,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { int m_head_size; int32_t* m_rope_params; std::vector m_kv_names; - bool m_is_static; + bool m_is_static = false; bool m_is_first_token; }; From fc70742b846234884ff9e4b2bc1ba54d1f6750a4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 2 Sep 2025 14:53:09 +0800 Subject: [PATCH 132/156] Simpilfy translation of get_rows --- .../ggml-openvino/openvino/op/get_rows.cpp | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 0de77da59ffc5..5e4c7d901ac32 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -3,10 +3,7 @@ #include #include #include -#include -#include #include -#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -31,22 +28,15 @@ OutputVector translate_get_rows(const NodeContext& context) { indices = process_view_input(context, 1); } - Output axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); - if (indices.get_partial_shape()[1].get_length() == 1) { - indices = - std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - if (data.get_partial_shape().rank() == 2) { - axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); - } - res = std::make_shared(data, indices, axis); - if (data.get_partial_shape().rank() == 2) { - res = - std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); - } - } else { - indices = - std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + // data[b,x,y] ind[1,b,x'] test-backend-ops case + // data[x,y] ind[1,1,x'] normal case + indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + if (data.get_partial_shape().rank() == 3) { + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); res = std::make_shared(data, indices, axis, 1); + } else { + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + res = std::make_shared(data, indices, axis); } if (res.get_element_type() != context.get_output_type(0)) { From 327f4684a8cf8438cf65cda8c7532bd221dc490c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 8 Sep 2025 16:52:58 +0800 Subject: [PATCH 133/156] Fix after rebasing --- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index bfccc28163522..b4103378ebb1b 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -41,13 +41,8 @@ OutputVector translate_mulmat(const NodeContext& context) { B = process_view_input(context, 0); A = process_view_input(context, 1); } - - bool convert_out_type = false; - if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) { - B = std::make_shared(B, context.get_input_type(1)); - } else if (context.get_input_type(0) != context.get_input_type(1)) { - A = std::make_shared(A, context.get_input_type(0)); - convert_out_type = true; + if (A.get_element_type() != B.get_element_type()) { + B = std::make_shared(context.get_input(0), context.get_input_type(1)); } auto B_shape = context.get_input_shape(0).to_shape(); @@ -82,12 +77,7 @@ OutputVector translate_mulmat(const NodeContext& context) { A = Z; } - if (convert_out_type) { - auto result_lp = std::make_shared(A, B, false, transpose_b); - res = std::make_shared(result_lp, context.get_output_type(0)); - } else { - res = std::make_shared(A, B, false, transpose_b); - } + res = std::make_shared(A, B, false, transpose_b); return rename_outputs_with_suffix({res}, context.get_name()); } From 24e5f77937977a4e97dab1d5db8c01d4e26d383a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 10 Sep 2025 15:38:15 +0800 Subject: [PATCH 134/156] Improve debug util; Eliminate nop ReshapeReshape --- ggml/src/ggml-openvino/ggml-decoder.cpp | 27 +++++---- .../src/ggml-openvino/openvino/op/reshape.cpp | 7 ++- ggml/src/ggml-openvino/utils.cpp | 55 +++++++++++++++---- 3 files changed, 65 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d00b78e891ee0..0dfc11e4905f6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -154,22 +154,22 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { // Add model outputs, if called for the whole graph if (naive) { - m_model_output_names.push_back(node->name); + m_model_output_names.push_back(node_name); } else if (!m_node) { + // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph - if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || - std::string(node->name).find("result") == 0 || debug_output_names.count(node->name)) { - auto name = node->view_src ? std::string(node->view_src->name) : std::string(node->name); - if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { - assert(name.find("cache_k") == 0 || name.find("cache_v") == 0); + if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name.find("result") == 0 || + debug_output_names.count(node_name)) { + if (node->op == GGML_OP_SET_ROWS) { + assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0); + if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) { + m_kv_names.push_back(node_name); + } } - if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); + if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), node_name); it == m_model_output_names.end()) { - m_model_output_names.push_back(name); - } - if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), name); it == m_kv_names.end()) { - m_kv_names.push_back(name); + m_model_output_names.push_back(node_name); } } } @@ -177,7 +177,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { if (m_node) { switch (node->op) { case GGML_OP_RESHAPE: { - if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) { + if (node->src[0]->op == GGML_OP_RESHAPE && node->src[0]->src[0]->ne[0] == node->ne[0] && + node->src[0]->src[0]->ne[1] == node->ne[1]) { + m_op_case = 4; + } else if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) { m_op_case = 1; } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) { m_op_case = 2; diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 4ef3833c90252..1ed6f4b880b0a 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -23,7 +23,8 @@ OutputVector translate_reshape(const NodeContext& context) { } int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported RESHAPE case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4, + "Unsupported RESHAPE case"); auto output_shape = context.get_output_shape(0).to_shape(); std::shared_ptr new_shape_node; @@ -37,9 +38,11 @@ OutputVector translate_reshape(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); - } else { + } else if (op_case == 3) { new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, 1}); + } else if (op_case == 4) { + return {context.get_input(0).get_node_shared_ptr()->input_value(0)}; } auto res = std::make_shared(context.get_input(0), new_shape_node, false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 3f728c242dd41..588404df19d07 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include #include @@ -418,17 +420,50 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, std::map& output_dst) { std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst[name] << std::endl; + + auto print_float_stats = [](const std::string& type_name, size_t size, auto get_value) { + if (size == 0) { + return; + } + + float first = get_value(0); + float min = first; + float max = first; + double sum = first; + + for (size_t i = 1; i < size; ++i) { + float v = get_value(i); + if (v < min) { + min = v; + } + if (v > max) { + max = v; + } + sum += v; + } + double mean = sum / size; + + std::cout << std::right << std::setw(6) << type_name << std::right << std::setw(12) << "First" << std::setw(12) + << "Min" << std::setw(12) << "Max" << std::setw(12) << "Mean" << std::endl; + std::cout << std::right << std::setw(6) << "" << std::right << std::setw(12) << first << std::setw(12) << min + << std::setw(12) << max << std::setw(12) << mean << std::endl; + }; + switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data()) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; - case ov::element::f16: - std::cout << *(tensor.data()) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; - default: - break; + case ov::element::f32: { + const float* data = tensor.data(); + size_t size = tensor.get_size(); + print_float_stats("[f32]", size, [data](size_t i) { return data[i]; }); + break; + } + case ov::element::f16: { + const ov::float16* data = tensor.data(); + size_t size = tensor.get_size(); + print_float_stats("[f16]", size, [data](size_t i) { return static_cast(data[i]); }); + break; + } + default: + break; } } From 421a2351454716d5b0845122f4f6c1c7b763d329 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 10 Sep 2025 16:54:57 +0800 Subject: [PATCH 135/156] STYLE: make get_types_to_requant a function --- ggml/src/ggml-openvino/utils.cpp | 33 +++++++++++++++++--------------- ggml/src/ggml-openvino/utils.h | 2 ++ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 588404df19d07..2438f2dd1191b 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -132,21 +132,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compile_end_time = conversion_end_time; } else { std::shared_ptr model; - std::map types_to_requantize; - if (is_static) { - types_to_requantize = { - {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C }, - }; - } else if (device == "GPU") { - types_to_requantize = { - // CVS-166739 - {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, - }; - } - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); if (is_static) { ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); @@ -275,6 +261,23 @@ ov::AnyMap get_npu_prefill_config() { return config; } +std::map get_types_to_requant(const std::string& device) { + if (device == "NPU") { + return { + {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C }, + }; + } + if (device == "GPU") { + return { + // CVS-166739 + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, + }; + } +} + ov::AnyMap get_npu_generate_config() { ov::AnyMap config = get_npu_prefill_config(); config.emplace("NPUW_UNFOLD_IREQS", "YES"); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index f377fe9d2735d..42686c593b3ce 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -43,6 +43,8 @@ bool is_prefill(struct ggml_cgraph * cgraph); ov::AnyMap get_npu_prefill_config(); ov::AnyMap get_npu_generate_config(); +std::map get_types_to_requant(const std::string& device); + ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); bool is_naive(struct ggml_cgraph* cgraph); From 50a66dcf54568bbdc80f1d801845cb13c75ddd21 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 11 Sep 2025 14:34:17 +0800 Subject: [PATCH 136/156] Support BF16 model --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 ++++++++-- ggml/src/ggml-openvino/utils.cpp | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0dfc11e4905f6..0bdb9aa8971f6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -419,8 +419,14 @@ std::map> GgmlOvDecoder::create_weight_no std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, std::optional requant_type) { - std::set weight_types = { - GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; + std::set weight_types = {GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + ggml_type_name(tensor->type)); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2438f2dd1191b..cf0a02c3ad671 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -276,6 +276,7 @@ std::map get_types_to_requant(const std::string& devi {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, }; } + return {}; } ov::AnyMap get_npu_generate_config() { From ee95ea7cc8c0453ceee9fcb0cf1f60609374e3b9 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 12 Sep 2025 11:42:02 +0800 Subject: [PATCH 137/156] Fix NPU compile --- ggml/src/ggml-openvino/utils.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index cf0a02c3ad671..c03ec1acbcf53 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -251,7 +251,6 @@ ov::AnyMap get_npu_prefill_config() { {"NPUW_DEVICES", "NPU" }, {"NPUW_FOLD", "YES" }, {"NPUW_WEIGHTS_BANK", "shared" }, - {"NPUW_SLICE_OUT", "YES" }, {"NPUW_FUNCALL_ASYNC", "YES" }, {"NPUW_FUNCALL_FOR_ALL", "YES" }, {"NPUW_DQ", "YES" }, From 9318f730bddc9cfd5b6636cab69575a6c3b147e5 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 12 Sep 2025 16:32:41 +0800 Subject: [PATCH 138/156] WA for npu 1st token acc issue --- ggml/src/ggml-openvino/utils.cpp | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index c03ec1acbcf53..7b696769fba84 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -218,7 +218,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < ov_output_names.size(); i++) { - auto result_name = ov_output_names[i]; + auto& result_name = ov_output_names[i]; const auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); @@ -243,20 +243,34 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_UNUSED(backend); } -ov::AnyMap get_npu_prefill_config() { - ov::AnyMap config = { +namespace { +ov::AnyMap get_npu_base_config() { + return { {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, {"NPU_USE_NPUW", "YES" }, {"NPUW_DEVICES", "NPU" }, {"NPUW_FOLD", "YES" }, {"NPUW_WEIGHTS_BANK", "shared" }, - {"NPUW_FUNCALL_ASYNC", "YES" }, {"NPUW_FUNCALL_FOR_ALL", "YES" }, {"NPUW_DQ", "YES" }, {"NPUW_DQ_FULL", "NO" }, {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, }; +} +} // namespace + +ov::AnyMap get_npu_prefill_config() { + auto config = get_npu_base_config(); + config.emplace("NPUW_FUNCALL_ASYNC", "NO"); + config.emplace("NPUW_ACC_CHECK", "YES"); + config.emplace("NPUW_ACC_DEVICE", "CPU"); + return config; +} + +ov::AnyMap get_npu_generate_config() { + auto config = get_npu_base_config(); + config.emplace("NPUW_FUNCALL_ASYNC", "YES"); return config; } @@ -266,7 +280,7 @@ std::map get_types_to_requant(const std::string& devi {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C }, + {GGML_TYPE_Q6_K, ExtraQuantType::F16 }, }; } if (device == "GPU") { @@ -278,12 +292,6 @@ std::map get_types_to_requant(const std::string& devi return {}; } -ov::AnyMap get_npu_generate_config() { - ov::AnyMap config = get_npu_prefill_config(); - config.emplace("NPUW_UNFOLD_IREQS", "YES"); - return config; -} - bool is_naive(struct ggml_cgraph* cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; @@ -373,7 +381,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { - input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1}); + input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, 1, 1}); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } From 07349a9cd102d6e3db8e792571f8683cba7aa078 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 12 Sep 2025 16:51:46 +0800 Subject: [PATCH 139/156] Apply EliminateZP only for npu --- ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp | 1 + ggml/src/ggml-openvino/openvino/translate_session.cpp | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index c36579910d48c..f38c0837d1374 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -19,6 +19,7 @@ namespace ggml { namespace pass { FuseToSDPA::FuseToSDPA() { + // Not maintained since FLASH_ATTN_EXT has replaced this pattern const auto m_k = ov::pass::pattern::any_input(); const auto m_q = ov::pass::pattern::any_input(); const auto m_qk = ov::pass::pattern::wrap_type({m_q, m_k}); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 634fea40e923f..3b8c30361a5e8 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -27,7 +27,6 @@ #include "ggml-openvino/openvino/utils.hpp" #include "input_model.hpp" #include "pass/eliminate_zp.hpp" -#include "pass/fuse_to_sdpa.hpp" #include "pass/mark_decompression_convert_constant_folding.hpp" namespace ov { @@ -220,8 +219,9 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(kv_param_res_pairs); } - manager.register_pass(); - manager.register_pass(); + if (ggml_model_decoder->is_static()) { + manager.register_pass(); + } manager.run_passes(model); } return model; From a4c23cc54eef369cae7f8bfedcb44ecf4880cb80 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 15 Sep 2025 11:13:59 +0800 Subject: [PATCH 140/156] Add GeGLU --- ggml/src/ggml-openvino/ggml-openvino.cpp | 37 ++++++++++---- .../ggml-openvino/openvino/op/glu_geglu.cpp | 50 +++++++++++++++++++ .../ggml-openvino/openvino/op/glu_swiglu.cpp | 7 +++ ggml/src/ggml-openvino/openvino/op_table.cpp | 1 + ggml/src/ggml-openvino/openvino/op_table.hpp | 1 + 5 files changed, 85 insertions(+), 11 deletions(-) create mode 100644 ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 60a2eb388ea1e..6da653716f7ed 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -249,17 +249,30 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { const auto* op_params = op->op_params; memcpy(&scale, (const float*) op_params + 0, sizeof(float)); memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); - const uint32_t h = op->src[0]->ne[2]; - const uint32_t n_head = op->src[0]->ne[0]; - const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); - - const float m0 = powf(2.0f, -(max_bias) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const float slope = - (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; + if (max_bias > 0) { + GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n"); + return true; + } + } - if (slope != 1.0f) { - GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n"); + if (op->op == GGML_OP_FLASH_ATTN_EXT) { + if (op->src[4] != nullptr) { + GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n"); + return true; + } + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; + const auto* op_params = op->op_params; + memcpy(&scale, (const float*) op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (const float*) op_params + 2, sizeof(float)); + if (max_bias > 0) { + GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n"); + return true; + } + if (logit_softcap != 0) { + GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with logit_softcap != 0\n"); return true; } } @@ -357,7 +370,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, - GGML_OP_SOFT_MAX, + // softmax is not updated due to replaced by flash_attn_ext + // GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; @@ -366,6 +380,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con }; static const std::set supported_glu_ops{ GGML_GLU_OP_SWIGLU, + GGML_GLU_OP_GEGLU, }; switch (op->op) { diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp new file mode 100644 index 0000000000000..4295bf7517c3c --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_glu_geglu(const NodeContext& context) { + num_inputs_check(context, 1, 2); + + ov::Output src0; + ov::Output src1; + if (context.get_input_size() == 2) { + src0 = context.get_input(0); + src1 = context.get_input(1); + } else { + auto combined = context.get_input(0); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {2}); + auto split = std::make_shared(combined, split_axis, 2); + src0 = split->output(0); + src1 = split->output(1); + } + + int32_t* params = context.get_output_op_params(0); + const int32_t swapped = params[1]; + if (swapped) { + std::swap(src0, src1); + } + + auto gelu = std::make_shared(src0); + auto res = std::make_shared(gelu, src1); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 138ef650901fd..bef42fe4b70c0 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -31,6 +31,13 @@ OutputVector translate_glu_swiglu(const NodeContext& context) { src0 = split->output(0); src1 = split->output(1); } + + int32_t* params = context.get_output_op_params(0); + const int32_t swapped = params[1]; + if (swapped) { + std::swap(src0, src1); + } + auto sigmoid = std::make_shared(src0); auto silu = std::make_shared(src0, sigmoid); auto res = std::make_shared(silu, src1); diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index ee55f84b96f80..e36e8f17cc94e 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -34,6 +34,7 @@ std::unordered_map get_supported_ops() { {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, {"GGML_OP_VIEW", op::translate_view }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, + {"GGML_GLU_OP_GEGLU", op::translate_glu_geglu }, {"GGML_OP_SET_ROWS", op::translate_set_rows }, {"GGML_OP_CPY", op::translate_cpy }, {"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext }, diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index faa61f5f6c8d2..5d4f0538604d1 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -25,6 +25,7 @@ GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_glu_swiglu); +GGML_OP_CONVERTER(translate_glu_geglu); GGML_OP_CONVERTER(translate_set_rows); GGML_OP_CONVERTER(translate_cpy); GGML_OP_CONVERTER(translate_flash_attn_ext); From 6e501c3425b139b44320a3af3626829f71cb28df Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 15 Sep 2025 15:56:03 +0800 Subject: [PATCH 141/156] Fix Hunyuan --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0bdb9aa8971f6..bc528e0cfb5a6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -242,14 +242,17 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { void GgmlOvDecoder::set_llm_params() { for (int i = 0; i < m_cgraph->n_nodes; i++) { auto* node = m_cgraph->nodes[i]; + std::string name = std::string(node->name); if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") { auto* cache_k = node->src[0]; m_context_size = cache_k->ne[1]; - } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") { + } else if (node->op == GGML_OP_ROPE && + (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0)) { m_head_size = node->ne[0]; m_num_heads = node->ne[1]; m_rope_params = node->op_params; - } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") { + } else if (node->op == GGML_OP_ROPE && + (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0)) { m_num_heads_kv = node->ne[1]; } } From 42d7541764c19bd0b8b0863649e45b740cce2296 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 16 Sep 2025 16:30:45 +0800 Subject: [PATCH 142/156] Support iSWA --- ggml/src/ggml-openvino/ggml-decoder.cpp | 103 ++++++++++++------ ggml/src/ggml-openvino/ggml-decoder.h | 13 ++- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 + .../ggml-openvino/openvino/node_context.hpp | 13 +-- .../openvino/op/flash_attn_ext.cpp | 9 +- .../src/ggml-openvino/openvino/op/permute.cpp | 38 ++----- .../openvino/translate_session.cpp | 21 +++- ggml/src/ggml-openvino/utils.cpp | 2 +- src/llama-graph.cpp | 2 + 9 files changed, 124 insertions(+), 79 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index bc528e0cfb5a6..e3dd5e0c1dd91 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -30,17 +30,21 @@ #include #include #include +#include #include "ggml-backend-impl.h" #include "ggml-backend.h" #include "ggml-quants.hpp" GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, - int context_size, int num_heads, int num_heads_kv, int head_size) : + int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size, + const std::vector& swa_layers) : m_cgraph(cgraph), m_node(node), m_op_name(std::string(node->name)), m_context_size(context_size), + m_context_size_swa(context_size_swa), + m_swa_layers(swa_layers), m_num_heads(num_heads), m_num_heads_kv(num_heads_kv), m_head_size(head_size), @@ -204,11 +208,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { if (node->src[0]->op != GGML_OP_VIEW) { m_op_case = 1; } else if (ggml_is_contiguous(node->src[0])) { - // Permute cache_k (view) - m_op_case = 2; - } else { - // Permute cache_v (view), deprecated, cache_v will also fall to case 2 - m_op_case = 3; + // Permute kv cache (view) + std::string src_name(node->view_src->name); + int layer = extract_layer_from_name(src_name); + if (!is_swa_layer(layer)) { + m_op_case = 2; + } else { + m_op_case = 3; + } } break; } @@ -239,13 +246,34 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } } +int extract_layer_from_name(const std::string& name) { + size_t pos1 = name.find("_l"); + assert(pos1 != std::string::npos); + pos1 += 2; + size_t pos2 = name.find(' ', pos1); + if (pos2 == std::string::npos) { + pos2 = name.length(); + } + std::string layer_str = name.substr(pos1, pos2 - pos1); + int layer = std::stoi(layer_str); + return layer; +} + void GgmlOvDecoder::set_llm_params() { for (int i = 0; i < m_cgraph->n_nodes; i++) { auto* node = m_cgraph->nodes[i]; std::string name = std::string(node->name); - if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") { - auto* cache_k = node->src[0]; - m_context_size = cache_k->ne[1]; + if (node->op == GGML_OP_FLASH_ATTN_EXT) { + auto* cache_k = node->src[1]; + cache_k = cache_k->view_src ? cache_k->view_src : cache_k; + int layer = extract_layer_from_name(cache_k->name); + + if (std::string(node->src[3]->name).find("swa") != std::string::npos) { + m_swa_layers.push_back(layer); + m_context_size_swa = cache_k->ne[1]; + } else { + m_context_size = cache_k->ne[1]; + } } else if (node->op == GGML_OP_ROPE && (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0)) { m_head_size = node->ne[0]; @@ -269,11 +297,11 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{1, 1, 1}; } } else { - input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)}; + input_shape = ov::PartialShape{1, 1, -1}; } } else if (name == "inp_out_ids" && !m_is_static) { - input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)}; - } else if (name == "KQ_mask") { + input_shape = ov::PartialShape{1, 1, -1}; + } else if (name.find("KQ_mask") == 0) { if (m_is_static) { if (m_is_first_token) { input_shape = ov::PartialShape{1, m_context_size, m_context_size}; @@ -281,13 +309,12 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{1, 1, m_context_size}; } } else { - auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD); - input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)}; + input_shape = ov::PartialShape{1, -1, -1}; } - } else if (name.find("cache_k") == 0) { - input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; - } else if (name.find("cache_v") == 0) { - input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; + } else if (name.find("cache_") == 0) { + int layer = extract_layer_from_name(name); + bool is_swa = is_swa_layer(layer); + input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size}; } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1}; } else if (src->op == GGML_OP_VIEW) { @@ -305,35 +332,35 @@ void GgmlOvDecoder::add_extra_inputs() { // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. // Not used for NPU int64_t attention_size = -1; + int64_t attention_size_swa = -1; for (const auto& node : m_nodes) { - if (node->op == GGML_OP_SOFT_MAX) { - auto* mask = node->src[1]; - if (std::string(mask->name).find("KQ_mask") != 0) { - throw std::runtime_error("Unexpected softmax node: " + std::string(mask->name)); - } - attention_size = mask->ne[0]; - break; - } if (node->op == GGML_OP_FLASH_ATTN_EXT) { auto* mask = node->src[3]; - if (std::string(mask->name).find("KQ_mask") != 0) { + std::string mask_name(mask->name); + if (mask_name.find("KQ_mask") != 0) { throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name)); } - attention_size = mask->ne[0]; + if (mask_name.find("swa") != std::string::npos) { + attention_size_swa = mask->ne[0]; + } else { + attention_size = mask->ne[0]; + } } } - { - std::string name = "attention_size"; + auto create_attention_size_input = [this](const std::string& name, int64_t size) { auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); param_node->output(0).get_tensor().set_names({name}); m_model_extra_inputs[name] = param_node; auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); - *tensor->data() = attention_size; + *tensor->data() = size; m_model_extra_input_values[name] = tensor; - } + }; + + create_attention_size_input("attention_size", attention_size); + create_attention_size_input("attention_size_swa", attention_size_swa); } const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { @@ -706,8 +733,16 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { - auto decoder = std::make_shared( - node, m_cgraph, m_is_static, m_is_first_token, m_context_size, m_num_heads, m_num_heads_kv, m_head_size); + auto decoder = std::make_shared(node, + m_cgraph, + m_is_static, + m_is_first_token, + m_context_size, + m_context_size_swa, + m_num_heads, + m_num_heads_kv, + m_head_size, + m_swa_layers); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 4ba147da20d62..35e79ecefc724 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -19,7 +19,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { // Node decoder, called in GgmlOvDecoder::visit_subgraph GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, - int context_size, int num_heads, int num_heads_kv, int head_size); + int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size, + const std::vector& swa_layers); // Naive graph decoder GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights); @@ -101,6 +102,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual int get_context_size() const override { return m_context_size; } + virtual int get_context_size_swa() const override { return m_context_size_swa; } + + virtual int is_swa_layer(int layer) const override { + return std::find(m_swa_layers.begin(), m_swa_layers.end(), layer) != m_swa_layers.end(); + } + virtual int get_num_heads() const override { return m_num_heads; } virtual int get_num_heads_kv() const override { return m_num_heads_kv; } @@ -156,6 +163,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> m_model_weights; std::vector m_model_output_names; int m_context_size; + int m_context_size_swa; + std::vector m_swa_layers; int m_num_heads; int m_num_heads_kv; int m_head_size; @@ -166,3 +175,5 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { }; void print_tensor_address_map(const struct ggml_cgraph* cgraph); + +int extract_layer_from_name(const std::string& name); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index a3387ba3947a2..6f11ff1283e37 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -67,6 +67,8 @@ class GgmlDecoder : public DecoderBase { virtual bool is_static() const = 0; virtual bool is_first_token() const = 0; virtual int get_context_size() const = 0; + virtual int get_context_size_swa() const = 0; + virtual int is_swa_layer(int layer) const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index cc1b5c03329c9..a64ae098ab3e9 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -2,6 +2,7 @@ #include #include +#include #include "decoder.hpp" @@ -30,6 +31,8 @@ class NodeContext : public frontend::NodeContext { return m_translate_session; } + const std::vector& get_input_names() const { return m_input_names; } + size_t get_input_size() const override { return m_decoder->get_input_size(); } @@ -101,15 +104,7 @@ class NodeContext : public frontend::NodeContext { return m_decoder->is_first_token(); } - int get_num_heads() const { return m_decoder->get_num_heads(); } - - int get_num_heads_kv() const { return m_decoder->get_num_heads_kv(); } - - int get_head_size() const { return m_decoder->get_head_size(); } - - int get_context_size() const { return m_decoder->get_context_size(); } - - private: +private: std::shared_ptr m_decoder; std::shared_ptr& m_tensor_map; TranslateSession* m_translate_session; diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index d97603d98a941..8b67778fb9373 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -32,8 +33,12 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); ov::Output mask_sliced; - if (context.has_input("KQ_mask_sliced")) { - mask_sliced = context.get_input("KQ_mask_sliced"); + std::string mask_name = "KQ_mask_sliced"; + if (context.get_input_names()[3].find("swa") != std::string::npos) { + mask_name = "KQ_mask_swa_sliced"; + } + if (context.has_input(mask_name)) { + mask_sliced = context.get_input(mask_name); } else { auto token_len = get_dimensions(q, {1}); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index fcb091016a4f1..086b1e4cdb172 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -29,43 +29,29 @@ OutputVector translate_permute(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { auto src = context.get_input(0); - auto attention_size = context.get_input("attention_size"); + Output attention_size; if (context.is_static()) { attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); + } else if (op_case == 2) { + attention_size = context.get_input("attention_size"); + } else { + attention_size = context.get_input("attention_size_swa"); } auto src_shape_ = context.get_input_shape(0).to_shape(); std::vector src_shape(src_shape_.begin(), src_shape_.end()); - std::shared_ptr src_reshaped; - if (op_case == 2) { - src_reshaped = std::make_shared( - src, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), - false); - } else { - src_reshaped = std::make_shared( - src, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{src_shape[1], src_shape[0], -1}), - false); - } + auto src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), + false); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - std::shared_ptr slice_axis; - if (op_case == 2) { - slice_axis = zero; - } else { - slice_axis = two; - } - auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, slice_axis); + auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, zero); - if (op_case == 2) { - res = std::make_shared(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); - } else { - res = src_slice; - } + res = std::make_shared(src_slice, + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 3b8c30361a5e8..9c82fe5f850a0 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -78,13 +78,22 @@ void add_token_len(TensorMap& tensor_map) { } void add_sliced_mask(TensorMap& tensor_map) { - auto mask = tensor_map.at("KQ_mask").get_node_shared_ptr(); auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - std::shared_ptr mask_sliced = std::make_shared(mask, zero, token_len, one, one); - mask_sliced->set_friendly_name("KQ_mask_sliced"); - tensor_map.insert({"KQ_mask_sliced", mask_sliced->output(0)}); + + auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name) { + if (tensor_map.find(mask_name) != tensor_map.end()) { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto mask = tensor_map.at(mask_name).get_node_shared_ptr(); + std::shared_ptr mask_sliced = + std::make_shared(mask, zero, token_len, one, one); + mask_sliced->set_friendly_name(sliced_name); + tensor_map.insert({sliced_name, mask_sliced->output(0)}); + } + }; + + create_sliced_mask("KQ_mask", "KQ_mask_sliced"); + create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced"); } void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 7b696769fba84..8724404098c05 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -362,7 +362,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } - } else if (param_name == "KQ_mask") { + } else if (param_name.find("KQ_mask") == 0) { size_t context_size = ggml_decoder->get_context_size(); const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); if (is_first_token) { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index d6a9f8a0c530f..96fef078f5fe0 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1708,6 +1708,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream); + ggml_set_name(inp->self_kq_mask, "KQ_mask"); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1722,6 +1723,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream); + ggml_set_name(inp->self_kq_mask_swa, "KQ_mask_swa"); ggml_set_input(inp->self_kq_mask_swa); inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; From 769ef74c435a54a4e92627904761e4be04fce092 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 17 Sep 2025 11:16:14 +0800 Subject: [PATCH 143/156] Fix NPU accuracy --- .../openvino/translate_session.cpp | 25 +++++++++++-------- ggml/src/ggml-openvino/utils.cpp | 5 +--- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 9c82fe5f850a0..c37aa21602ff0 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -77,23 +77,28 @@ void add_token_len(TensorMap& tensor_map) { tensor_map.insert({"token_len", token_len->output(0)}); } -void add_sliced_mask(TensorMap& tensor_map) { +void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name) { + auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) { if (tensor_map.find(mask_name) != tensor_map.end()) { - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto mask = tensor_map.at(mask_name).get_node_shared_ptr(); - std::shared_ptr mask_sliced = - std::make_shared(mask, zero, token_len, one, one); - mask_sliced->set_friendly_name(sliced_name); + std::shared_ptr mask_sliced; + if (is_static) { + mask_sliced = mask; + } else { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + mask_sliced = std::make_shared(mask, zero, token_len, one, one); + mask_sliced = std::make_shared(mask_sliced, ov::element::f16); + mask_sliced->set_friendly_name(sliced_name); + } tensor_map.insert({sliced_name, mask_sliced->output(0)}); } }; - create_sliced_mask("KQ_mask", "KQ_mask_sliced"); - create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced"); + create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static()); + create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); } void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { @@ -117,7 +122,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); - add_sliced_mask(tensor_map); + add_sliced_mask(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 8724404098c05..db471636452ad 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -253,6 +253,7 @@ ov::AnyMap get_npu_base_config() { {"NPUW_FOLD", "YES" }, {"NPUW_WEIGHTS_BANK", "shared" }, {"NPUW_FUNCALL_FOR_ALL", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, {"NPUW_DQ", "YES" }, {"NPUW_DQ_FULL", "NO" }, {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, @@ -262,15 +263,11 @@ ov::AnyMap get_npu_base_config() { ov::AnyMap get_npu_prefill_config() { auto config = get_npu_base_config(); - config.emplace("NPUW_FUNCALL_ASYNC", "NO"); - config.emplace("NPUW_ACC_CHECK", "YES"); - config.emplace("NPUW_ACC_DEVICE", "CPU"); return config; } ov::AnyMap get_npu_generate_config() { auto config = get_npu_base_config(); - config.emplace("NPUW_FUNCALL_ASYNC", "YES"); return config; } From 7cacb59473afb37b0ff87630f6afbbb04b1dcb5c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 17 Sep 2025 15:35:27 +0800 Subject: [PATCH 144/156] Fix ROPE accuracy when freq_scale != 1 --- ggml/src/ggml-openvino/ggml-openvino.cpp | 6 +----- ggml/src/ggml-openvino/openvino/utils.cpp | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 6da653716f7ed..683f768c5f170 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -319,12 +319,8 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { return true; } float freq_scale; - memcpy(&freq_scale, op_params + 6, sizeof(float)); - if (freq_scale != 0.0f && freq_scale != 1.0f) { - GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale); - return true; - } float ext_factor; + memcpy(&freq_scale, op_params + 6, sizeof(float)); memcpy(&ext_factor, op_params + 7, sizeof(float)); if (ext_factor != 0.0f) { GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index ef5f51ebbc4a7..f70cb91a17fe0 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -140,7 +140,7 @@ std::pair, ov::Output> make_sin_cos(int32_t* rope_params, ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); std::vector factor(n_dims / 2); - factor[0] = freq_scale; + factor[0] = 1.0f; for (size_t i = 1; i < factor.size(); i++) { factor[i] = theta_scale * factor[i - 1]; } From 14047c1ed7a5704f211a07bf948bf61b6bbdb2c3 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 17 Sep 2025 16:50:54 +0800 Subject: [PATCH 145/156] Minor: not add attention_size_swa for non-swa model --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index e3dd5e0c1dd91..8286052f8bf2e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -360,7 +360,9 @@ void GgmlOvDecoder::add_extra_inputs() { }; create_attention_size_input("attention_size", attention_size); - create_attention_size_input("attention_size_swa", attention_size_swa); + if (attention_size_swa != -1) { + create_attention_size_input("attention_size_swa", attention_size_swa); + } } const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { From a57f95f147614488d801d3139b9012bd7583dec4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 19 Sep 2025 16:50:27 +0800 Subject: [PATCH 146/156] Minor refactor --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 ---------- ggml/src/ggml-openvino/utils.cpp | 5 +++++ 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 8286052f8bf2e..a5d9d6967fd92 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -65,11 +65,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, print_tensor_address_map(cgraph); } - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; - dump_cgraph(cgraph, filename); - } - set_llm_params(); for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -83,11 +78,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights) { - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; - dump_cgraph(cgraph, filename); - } - m_cgraph = cgraph; m_model_weights = model_weights; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index db471636452ad..07cbb2e437f6a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -86,6 +86,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c }; } + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + std::string filename = "cgraph.txt"; + GgmlOvDecoder::dump_cgraph(cgraph, filename); + } + if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); } From 27bdafe8d838fbda81527eb4629c7d0c69259b57 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 23 Sep 2025 16:07:51 +0800 Subject: [PATCH 147/156] Add Q5_K to support phi-3-q4_k_m --- ggml/src/ggml-openvino/ggml-decoder.cpp | 8 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 1 + ggml/src/ggml-openvino/ggml-quants.cpp | 143 ++++++++++++++++++----- ggml/src/ggml-openvino/ggml-quants.hpp | 5 + ggml/src/ggml-openvino/utils.cpp | 1 + 5 files changed, 124 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a5d9d6967fd92..38b0fa3db4f1c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -448,6 +448,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, + GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + @@ -486,12 +487,12 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, ov::element::Type weight_type; if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { weight_type = ov::element::u4; - } else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K + } else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K || tensor.type == GGUF_TYPE_Q5_K weight_type = ov::element::u8; } uint64_t weights_per_block; - // here we only consider sub block, q6k:16 q4k:32 + // here we only consider sub block, q6k:16 q4k:32 q5k:32 if (tensor->type == GGML_TYPE_Q6_K) { weights_per_block = 16; } else { @@ -526,6 +527,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, } else if (tensor->type == GGML_TYPE_Q4_K) { extract_q4_k_data(tensor, weights, scales, biases); weight_node = make_int4_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q5_K) { + extract_q5_k_data(tensor, weights, scales, biases); + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); } OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 683f768c5f170..648acb4e35ede 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -350,6 +350,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, + GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 1603e65355274..9b8bfff072570 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -1,9 +1,17 @@ #include "ggml-quants.hpp" +#include +#include +#include +#include #include #include #include +#include +#include #include +#include +#include #include #include #include @@ -11,9 +19,12 @@ #include #include #include +#include #include #include +#include +#include "ggml-common.h" #include "ggml-impl.h" #include "ggml.h" @@ -38,10 +49,10 @@ void extract_q4_0_data(const ggml_tensor* tensor, ov::Tensor& scales_arr, ov::Tensor& biases_arr) { const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); @@ -57,10 +68,10 @@ void extract_q4_1_data(const ggml_tensor* tensor, ov::Tensor& scales_arr, ov::Tensor& biases_arr) { const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 2))); @@ -76,22 +87,22 @@ void extract_q8_0_data(const ggml_tensor* tensor, ov::Tensor& biases_arr) { const uint64_t weights_per_block = 32; const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); - for (size_t i = 0; i < scales_arr.get_size(); i++) { + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { uint8_t* block_data = data + i * bytes_per_block; - scales[i] = ov::float16::from_bits(*(uint16_t*)block_data); + scales[i] = ov::float16::from_bits(*(uint16_t*) block_data); biases[i] = ov::float16(-128.f * static_cast(scales[i])); for (size_t j = 0; j < weights_per_block; ++j) { uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. - // Original data is in int8_t, so we add a bias of -128 and invert the - // first bit. + // Original data is in int8_t, so we add a bias of -128 and invert the first bit. x ^= 1 << 7; weights[i * weights_per_block + j] = x; } - } + }); } void unpack_256_4(const uint8_t* data, uint8_t* dst) { @@ -117,12 +128,11 @@ void extract_q4_k_data(const ggml_tensor* tensor, ov::Tensor& scales_arr, ov::Tensor& biases_arr) { const uint64_t bytes_per_block = 2 + 2 + 12 + 128; - // TODO tensor->nb[3] const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); ov::parallel_for(n_super_block, [&](size_t i) { uint8_t* block_data = data + i * bytes_per_block; @@ -170,28 +180,26 @@ void extract_q6_k_data(const ggml_tensor* tensor, ov::Tensor& biases_arr) { const uint64_t bytes_per_block = 128 + 64 + 16 + 2; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); - // std::string name(tensor.name, tensor.namelen); - for (size_t i = 0; i < n_super_block; i++) { + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + + ov::parallel_for(n_super_block, [&](size_t i) { uint8_t* block_data = data + i * bytes_per_block; float scale_factor = - static_cast(ov::float16::from_bits(*((uint16_t*)block_data + 104))); // (128+64+16)/2 + static_cast(ov::float16::from_bits(*((uint16_t*) block_data + 104))); // (128+64+16)/2 for (size_t j = 0; j < 16; j++) { scales[j + i * 16] = - ov::float16(scale_factor * static_cast(*((int8_t*)(block_data + 128 + 64 + j)))); + ov::float16(scale_factor * static_cast(*((int8_t*) (block_data + 128 + 64 + j)))); biases[j + i * 16] = ov::float16(-32.f * static_cast(scales[j + i * 16])); } - // Extract ql and qh uint8_t* ql = block_data; uint8_t* qh = block_data + 128; - // Extract weights for (int64_t j = 0; j < 32; ++j) { weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4); @@ -202,9 +210,80 @@ void extract_q6_k_data(const ggml_tensor* tensor, weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4); weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4); } + }); +} + +static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t* m) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } else { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); } } +void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 4 + 12 + 32 + 128; + const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + + ov::parallel_for(n_super_block, [&](size_t i) { + uint8_t* block_data = data + i * bytes_per_block; + + const float d = static_cast(ov::float16::from_bits(*((uint16_t*) block_data))); + const float min = static_cast(ov::float16::from_bits(*((uint16_t*) block_data + 1))); + + const uint8_t* scales_data = block_data + 4; // 12 bytes of scales + const uint8_t* qh = block_data + 4 + 12; // 32 bytes of high bits + const uint8_t* ql = block_data + 4 + 12 + 32; // 128 bytes of low bits + + int is = 0; + uint8_t u1 = 1; + uint8_t u2 = 2; + + // Process 2 blocks in one iteration + for (int j = 0; j < 256; j += 64) { // 256 = QK_K, so 4 iterations of 64 + uint8_t sc; + uint8_t m; + + // Get scale and min for first 32 elements + get_scale_min_k4(is + 0, scales_data, &sc, &m); + const float d1 = d * sc; + const float m1 = min * m; + + // Get scale and min for second 32 elements + get_scale_min_k4(is + 1, scales_data, &sc, &m); + const float d2 = d * sc; + const float m2 = min * m; + + scales[i * 8 + is] = ov::float16(d1); + biases[i * 8 + is] = ov::float16(-m1); + scales[i * 8 + is + 1] = ov::float16(d2); + biases[i * 8 + is + 1] = ov::float16(-m2); + + // Extract weights for first 32 elements (matching deq formula exactly) + for (int l = 0; l < 32; ++l) { + weights[i * 256 + j + l] = (ql[l] & 0xF) + ((qh[l] & u1) ? 16 : 0); + } + + // Extract weights for second 32 elements + for (int l = 0; l < 32; ++l) { + weights[i * 256 + j + l + 32] = (ql[l] >> 4) + ((qh[l] & u2) ? 16 : 0); + } + + ql += 32; + is += 2; + u1 <<= 2; + u2 <<= 2; + } + }); +} + // TODO Reorder for make_intX_weights ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index fbae2aa1f43ef..5496785eb1fbd 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -29,6 +29,11 @@ void extract_q4_k_data(const ggml_tensor* tensor, ov::Tensor& scales_arr, ov::Tensor& biases_arr); +void extract_q5_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + void extract_q6_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 07cbb2e437f6a..e9084cf387f7a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -283,6 +283,7 @@ std::map get_types_to_requant(const std::string& devi {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q6_K, ExtraQuantType::F16 }, + {GGML_TYPE_Q5_K, ExtraQuantType::F16 }, }; } if (device == "GPU") { From 506615ab938c8d4d8264d14e826a7f4b4f0b916a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 26 Sep 2025 15:50:32 +0800 Subject: [PATCH 148/156] Requantize Q6_K (gs16) to gs32 on GPU --- ggml/src/ggml-openvino/ggml-quants.cpp | 43 +++++++++++++++++++++++--- ggml/src/ggml-openvino/ggml-quants.hpp | 4 ++- ggml/src/ggml-openvino/utils.cpp | 4 +-- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 9b8bfff072570..1538a8207ca9d 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -425,6 +425,8 @@ std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType r int64_t block_size = node_shape[1]; if (requant_type == ExtraQuantType::Q4_0_128) { block_size = 128; + } else if (requant_type == ExtraQuantType::Q8_0_32) { + block_size = 32; } auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size}; @@ -432,7 +434,7 @@ std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType r ov::Tensor scales(ov::element::f16, scales_shape); ov::Tensor bias(ov::element::f16, scales_shape); - if (requant_type == ExtraQuantType::Q4_0_C) { + if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) { weights = ov::Tensor(ov::element::u4, node_shape); quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr(); @@ -440,10 +442,10 @@ std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType r weights = ov::Tensor(ov::element::u8, node_shape); quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr(); - } else if (requant_type == ExtraQuantType::Q4_0_128) { - weights = ov::Tensor(ov::element::u4, node_shape); - quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); - weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr(); + } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) { + weights = ov::Tensor(ov::element::u8, node_shape); + quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); + weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr(); } weight_node->set_friendly_name(tensor->name); @@ -485,6 +487,37 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a } } +void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk) { + assert(k % qk == 0); + const int nb = k / qk; + + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + } + } + + const float d = amax / 127.0f; + const float id = d ? 1.0f / d : 0.0f; + scales[i] = ov::float16(d); + biases[i] = ov::float16(-128.0f * d); + + for (int j = 0; j < qk; ++j) { + const float x0 = x[i * qk + j] * id; + const int8_t xi0 = roundf(x0); + weights[i * qk + j] = (uint8_t) (xi0 + 128); + } + } +} + void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, int64_t qk) { assert(k % qk == 0); diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 5496785eb1fbd..71ae317a39e90 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -51,7 +51,7 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); -enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 }; +enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type); @@ -59,6 +59,8 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a int64_t qk); void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, int64_t qk); +void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk); namespace ov { namespace op { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index e9084cf387f7a..0ec815f07f4f9 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -288,8 +288,8 @@ std::map get_types_to_requant(const std::string& devi } if (device == "GPU") { return { - // CVS-166739 - {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, + // gs16 is WIP + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_0_32}, }; } return {}; From 9ce1692648ae44a1ee94a298de7ed60aa0c75b1d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Sun, 28 Sep 2025 11:24:13 +0800 Subject: [PATCH 149/156] Fix after rebasing --- ggml/src/ggml-openvino/ggml-decoder.cpp | 24 +++++++++++++++---- .../ggml-openvino/openvino/op/set_rows.cpp | 4 +++- ggml/src/ggml-openvino/openvino/op/view.cpp | 4 ++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 38b0fa3db4f1c..751fa192a4261 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -198,13 +198,17 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { if (node->src[0]->op != GGML_OP_VIEW) { m_op_case = 1; } else if (ggml_is_contiguous(node->src[0])) { - // Permute kv cache (view) std::string src_name(node->view_src->name); - int layer = extract_layer_from_name(src_name); - if (!is_swa_layer(layer)) { - m_op_case = 2; + if (src_name.find("cache") == std::string::npos) { + m_op_case = 1; } else { - m_op_case = 3; + // Permute kv cache (view) + int layer = extract_layer_from_name(src_name); + if (!is_swa_layer(layer)) { + m_op_case = 2; + } else { + m_op_case = 3; + } } } break; @@ -230,6 +234,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } break; } + case GGML_OP_VIEW: { + if (node->src[0]->op == GGML_OP_VIEW) { + auto* src = node->src[0]; + auto* view_src = src->view_src; + if (view_src->ne[1] != src->ne[2]) { + throw std::runtime_error("Unsupported VIEW case"); + } + m_op_case = 2; + } + } default: break; } diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 0d94a95e44276..50817c8323bef 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -45,7 +45,9 @@ OutputVector translate_set_rows(const NodeContext& context) { false); auto indices_reshaped = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - auto data_reshaped = std::make_shared(data, zero); + auto data_reshaped = std::make_shared( + data, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) -1, (int64_t) dst_shape[2]}), false); + auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); auto res = std::make_shared(updated, std::make_shared(dst), false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 58143e667cc6f..034b6df119510 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -9,6 +9,10 @@ namespace op { OutputVector translate_view(const NodeContext& context) { num_inputs_check(context, 1, 1); + if (context.get_op_case() == 2) { + auto dst_shape = context.get_output_shape(0).to_shape(); + return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])}, context.get_name()); + } return {context.get_input(0)}; } From 499f8ba85880fe4fc749f635aad016c6126a1e23 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Sun, 28 Sep 2025 22:21:23 +0800 Subject: [PATCH 150/156] Always apply Eliminate_ZP to fix GPU compile issue on some platforms --- ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp | 1 + ggml/src/ggml-openvino/openvino/translate_session.cpp | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp index d2e5a040dd28f..4759e86e1ea34 100644 --- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp @@ -80,6 +80,7 @@ EliminateZeroPoints::EliminateZeroPoints() { std::shared_ptr new_constant; + // TODO improve performance if (data_type == ov::element::u4) { auto data_values = data_constant->cast_vector(); std::vector adjusted_values(total_elements); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index c37aa21602ff0..944381968226d 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -233,9 +233,9 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(kv_param_res_pairs); } - if (ggml_model_decoder->is_static()) { - manager.register_pass(); - } + // if (ggml_model_decoder->is_static()) { + manager.register_pass(); + // } manager.run_passes(model); } return model; From a6c99a38bcfc795c91f02b5159902a76521e70b6 Mon Sep 17 00:00:00 2001 From: cavusmustafa Date: Wed, 1 Oct 2025 14:02:11 -0700 Subject: [PATCH 151/156] kvcachefusion support --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 ++- .../openvino/op/flash_attn_ext.cpp | 64 +++++++++++++------ ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 20 ++++-- .../src/ggml-openvino/openvino/op/permute.cpp | 34 ++++++---- ggml/src/ggml-openvino/openvino/op/rope.cpp | 3 + .../ggml-openvino/openvino/op/set_rows.cpp | 36 +++++++---- .../src/ggml-openvino/openvino/op/softmax.cpp | 19 +++++- .../openvino/translate_session.cpp | 16 ++++- 8 files changed, 146 insertions(+), 56 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 751fa192a4261..0000319f632aa 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -316,9 +316,13 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{1, -1, -1}; } } else if (name.find("cache_") == 0) { - int layer = extract_layer_from_name(name); - bool is_swa = is_swa_layer(layer); - input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size}; + if (m_is_static) { + int layer = extract_layer_from_name(name); + bool is_swa = is_swa_layer(layer); + input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size}; + } else { + input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size}; + } } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1}; } else if (src->op == GGML_OP_VIEW) { diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 8b67778fb9373..36d0f8844a0c0 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto q = std::make_shared(q_f32, ov::element::f16); auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); - ov::Output mask_sliced; + ov::Output mask_sliced, res; std::string mask_name = "KQ_mask_sliced"; if (context.get_input_names()[3].find("swa") != std::string::npos) { mask_name = "KQ_mask_swa_sliced"; @@ -40,33 +41,55 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { if (context.has_input(mask_name)) { mask_sliced = context.get_input(mask_name); } else { - auto token_len = get_dimensions(q, {1}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - mask_sliced = std::make_shared(mask, zero, token_len, one, one); + auto token_len = get_dimensions(q, {2}); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); + auto leaf_8 = context.get_input("leaf_8"); + auto shape_of_leaf_8 = std::make_shared(leaf_8); + auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + mask_sliced = + std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask_sliced, zero_1d); } if (mask_sliced.get_element_type() != ov::element::f16) { mask_sliced = std::make_shared(mask_sliced, ov::element::f16); } - auto tile_kv = [](int64_t q_batch, int64_t kv_batch, ov::Output kv) { + auto tile_kv = [](int64_t q_batch, int64_t kv_batch, ov::Output kv, bool is_static) { int64_t factor = q_batch / kv_batch; if (factor > 1) { auto q_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{q_batch}); auto kv_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_batch}); auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - auto kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); + ov::Output kv_broadcast_shape, kv_unsqueezed, new_kv_shape; + if (is_static) { + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); - auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2}); - auto kv_broadcast_shape = - std::make_shared(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); - kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape); + auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2}); + kv_broadcast_shape = + std::make_shared(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); + new_kv_shape = + std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0); + } else { + auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2}); + kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); + + auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {2, 3}); + kv_broadcast_shape = + std::make_shared(ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0); + new_kv_shape = + std::make_shared(ov::OutputVector{one_1d, q_batch_node, kv_last_two_dims}, 0); + } - auto new_kv_shape = - std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0); + kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape); kv = std::make_shared(kv, new_kv_shape, false); } return kv; @@ -74,13 +97,18 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto q_shape = context.get_input_shape(0).to_shape(); auto k_shape = context.get_input_shape(1).to_shape(); - k = tile_kv(q_shape[0], k_shape[0], k); - v = tile_kv(q_shape[0], k_shape[0], v); + k = tile_kv(q_shape[0], k_shape[0], k, context.is_static()); + v = tile_kv(q_shape[0], k_shape[0], v, context.is_static()); auto sdpa = std::make_shared(q, k, v, mask_sliced, scale_node, false); auto sdpa_f32 = std::make_shared(sdpa, ov::element::f32); - auto res = std::make_shared(sdpa_f32, - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + if (context.is_static()) { + res = std::make_shared(sdpa_f32, + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + } else { + res = std::make_shared(sdpa_f32, + ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index b4103378ebb1b..3a1ca341664c5 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -59,13 +59,23 @@ OutputVector translate_mulmat(const NodeContext& context) { auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); - Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; Output batch_large = A_batch_larger ? A_batch_node : B_batch_node; - auto broadcast_shape = - std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); + + ov::Output broadcast_shape; + ov::Output Z_unsqueezed; + if (context.is_static()) { + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); + broadcast_shape = + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); + } else { + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2}); + Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); + auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + broadcast_shape = + std::make_shared(ov::OutputVector{one_1d, batch_small, factor_node, Z_last_two_dims}, 0); + } auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dims}, 0); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 086b1e4cdb172..cd0d073ab3ac8 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -25,8 +25,13 @@ OutputVector translate_permute(const NodeContext& context) { ov::Output res; if (op_case == 1) { - res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + if (context.is_static()) { + res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + } else { + res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + } } else { auto src = context.get_input(0); Output attention_size; @@ -38,20 +43,23 @@ OutputVector translate_permute(const NodeContext& context) { attention_size = context.get_input("attention_size_swa"); } - auto src_shape_ = context.get_input_shape(0).to_shape(); - std::vector src_shape(src_shape_.begin(), src_shape_.end()); - - auto src_reshaped = std::make_shared( - src, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), - false); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, zero); - res = std::make_shared(src_slice, - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + if (context.is_static()) { + auto src_shape_ = context.get_input_shape(0).to_shape(); + std::vector src_shape(src_shape_.begin(), src_shape_.end()); + auto src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), + false); + auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, zero); + res = std::make_shared(src_slice, + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + } else { + res = std::make_shared(src, + ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + } } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 4b1e3b500cf3e..484730d2897f1 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -84,6 +84,9 @@ OutputVector translate_rope(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); res = std::make_shared(stack, std::make_shared(data_node), false); + if (!(context.is_static())) { + res = std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + } } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 50817c8323bef..a3285d41ce13f 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -3,10 +3,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -39,17 +41,29 @@ OutputVector translate_set_rows(const NodeContext& context) { auto dst = context.get_input(context.get_output_name()); auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); - auto dst_reshaped = std::make_shared( - dst, - ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), - false); - auto indices_reshaped = - std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - auto data_reshaped = std::make_shared( - data, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) -1, (int64_t) dst_shape[2]}), false); - - auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); - auto res = std::make_shared(updated, std::make_shared(dst), false); + Output res; + if (context.is_static()) { + auto dst_reshaped = std::make_shared( + dst, + ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), + false); + auto indices_reshaped = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + auto data_reshaped = std::make_shared( + data, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) -1, (int64_t) dst_shape[2]}), false); + + auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); + res = std::make_shared(updated, std::make_shared(dst), false); + } else { + // TODO: Better solution would be to reshape the data into 4D at first place (for stateful model) + if (data.get_partial_shape().rank() + 1 == dst.get_partial_shape().rank()) { + data = std::make_shared(data, zero); + } + int concat_axis = 1; + if (context.is_static()) + concat_axis = 0; + res = std::make_shared(OutputVector{dst, data}, concat_axis); + } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 1aa3bf76a06bb..8f134626c8a5d 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -7,8 +7,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -57,9 +59,20 @@ OutputVector translate_soft_max(const NodeContext& context) { } else { auto token_len = get_dimensions(input_node, {1}); auto mask_node = context.get_input(1); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); + auto leaf_8 = context.get_input("leaf_8"); + auto shape_of_leaf_8 = std::make_shared(leaf_8); + auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + mask_node_sliced = + std::make_shared(mask_node, zero_2d, stop, one_2d, axes); + if (!(context.is_static())) { + mask_node_sliced = std::make_shared(mask_node_sliced, zero_1d); + } } if (mask_node_sliced.get_element_type() != context.get_output_type(0)) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 944381968226d..58a94d6149e7b 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -87,9 +88,18 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { if (is_static) { mask_sliced = mask; } else { - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - mask_sliced = std::make_shared(mask, zero, token_len, one, one); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); + auto leaf_8 = tensor_map.at("leaf_8").get_node_shared_ptr(); + auto shape_of_leaf_8 = std::make_shared(leaf_8); + auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + mask_sliced = + std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask_sliced, zero_1d); mask_sliced = std::make_shared(mask_sliced, ov::element::f16); mask_sliced->set_friendly_name(sliced_name); } From b641bf6bdc9d5547e335b9d27118dd4705d6fcb2 Mon Sep 17 00:00:00 2001 From: cavusmustafa Date: Wed, 1 Oct 2025 14:33:48 -0700 Subject: [PATCH 152/156] env variable GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION added --- ggml/src/ggml-openvino/utils.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 0ec815f07f4f9..9b000f26d55ba 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -80,11 +80,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c bool is_static = device == "NPU" ? true : false; ov::AnyMap config; - if (device == "GPU") { - config = { - {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"} - }; - } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph.txt"; @@ -186,6 +181,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model, timestamped_filename); } + auto* disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION"); + if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") { + config = { + {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"} + }; + } + auto compiled_model = core.compile_model(model, device, config); compile_end_time = ggml_time_us(); infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); From bf91836faf52151d49acf5c463b8d6b84cb56d3d Mon Sep 17 00:00:00 2001 From: cavusmustafa Date: Thu, 2 Oct 2025 11:24:40 -0700 Subject: [PATCH 153/156] Fix for Phi3 --- .../ggml-openvino/openvino/op/flash_attn_ext.cpp | 8 ++++---- ggml/src/ggml-openvino/openvino/op/permute.cpp | 12 ++++++++++-- ggml/src/ggml-openvino/openvino/op/set_rows.cpp | 16 +++++++--------- ggml/src/ggml-openvino/openvino/op/softmax.cpp | 8 ++++---- .../ggml-openvino/openvino/translate_session.cpp | 8 ++++---- 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 36d0f8844a0c0..ec9bb0aac59d7 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -47,10 +47,10 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto leaf_8 = context.get_input("leaf_8"); - auto shape_of_leaf_8 = std::make_shared(leaf_8); - auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + auto inp_pos = context.get_input("inp_pos"); + auto shape_of_inp_pos = std::make_shared(inp_pos); + auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index cd0d073ab3ac8..ea5e417965eb3 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -23,13 +24,18 @@ OutputVector translate_permute(const NodeContext& context) { int op_case = context.get_op_case(); FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported PERMUTE case"); ov::Output res; + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); if (op_case == 1) { if (context.is_static()) { res = std::make_shared(context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { - res = std::make_shared(context.get_input(0), + auto src = context.get_input(0); + if (src.get_partial_shape().rank() == 3) { + src = std::make_shared(src, zero); + } + res = std::make_shared(src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } } else { @@ -43,7 +49,6 @@ OutputVector translate_permute(const NodeContext& context) { attention_size = context.get_input("attention_size_swa"); } - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); if (context.is_static()) { @@ -57,6 +62,9 @@ OutputVector translate_permute(const NodeContext& context) { res = std::make_shared(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { + if (src.get_partial_shape().rank() == 3) { + src = std::make_shared(src, zero); + } res = std::make_shared(src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index a3285d41ce13f..0b2f29441aec7 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -8,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -55,14 +55,12 @@ OutputVector translate_set_rows(const NodeContext& context) { auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); res = std::make_shared(updated, std::make_shared(dst), false); } else { - // TODO: Better solution would be to reshape the data into 4D at first place (for stateful model) - if (data.get_partial_shape().rank() + 1 == dst.get_partial_shape().rank()) { - data = std::make_shared(data, zero); - } - int concat_axis = 1; - if (context.is_static()) - concat_axis = 0; - res = std::make_shared(OutputVector{dst, data}, concat_axis); + assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() && dst.get_partial_shape()[3].is_static()); + int64_t dim2 = dst.get_partial_shape()[2].get_length(); + int64_t dim3 = dst.get_partial_shape()[3].get_length(); + data = std::make_shared( + data, ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 1, (int64_t) -1, dim2, dim3}), false); + res = std::make_shared(OutputVector{dst, data}, 1); } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 8f134626c8a5d..12db9e82a08cf 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -64,10 +64,10 @@ OutputVector translate_soft_max(const NodeContext& context) { auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto leaf_8 = context.get_input("leaf_8"); - auto shape_of_leaf_8 = std::make_shared(leaf_8); - auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + auto inp_pos = context.get_input("inp_pos"); + auto shape_of_inp_pos = std::make_shared(inp_pos); + auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); mask_node_sliced = std::make_shared(mask_node, zero_2d, stop, one_2d, axes); if (!(context.is_static())) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 58a94d6149e7b..830344020c40a 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -93,10 +93,10 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto leaf_8 = tensor_map.at("leaf_8").get_node_shared_ptr(); - auto shape_of_leaf_8 = std::make_shared(leaf_8); - auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + auto shape_of_inp_pos = std::make_shared(inp_pos); + auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); From d84268ebdfeab79aacfa50cd8733c86de253ec0f Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 9 Oct 2025 14:50:52 +0800 Subject: [PATCH 154/156] Fix llama-cli (need to run with --no-warmup) --- ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp | 8 ++++---- ggml/src/ggml-openvino/openvino/translate_session.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index ec9bb0aac59d7..c07a7ccb16e3c 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -42,15 +42,15 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { mask_sliced = context.get_input(mask_name); } else { auto token_len = get_dimensions(q, {2}); + auto kv_len = get_dimensions(k.get_node_shared_ptr(), {2}); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto inp_pos = context.get_input("inp_pos"); - auto shape_of_inp_pos = std::make_shared(inp_pos); - auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); + + auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 830344020c40a..0b16c06fd04b1 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -132,7 +132,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); - add_sliced_mask(tensor_map, ggml_model_decoder); + // add_sliced_mask(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); } From 2b4b32f6e46715cdb1295e759b8b0de2967716db Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 10 Oct 2025 13:17:12 +0800 Subject: [PATCH 155/156] Fix add_sliced_mask; Revert mulmat, softmax; Remove input attention_size, iSWA model not working --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 ++-- .../openvino/op/flash_attn_ext.cpp | 1 - ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 20 +++-------- .../src/ggml-openvino/openvino/op/permute.cpp | 14 ++------ .../src/ggml-openvino/openvino/op/softmax.cpp | 19 ++--------- .../openvino/translate_session.cpp | 34 +++++++++++++------ 6 files changed, 38 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0000319f632aa..7c6bfe7ee74d7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -73,7 +73,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, set_input_output(cur_node); } - add_extra_inputs(); + // add_extra_inputs(); } GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, @@ -336,9 +336,10 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: - // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, + // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned, // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. - // Not used for NPU + // Not used for NPU. + // Update: not used anymore after the optimization of making kvcache dynamic (but breaks iSWA models) int64_t attention_size = -1; int64_t attention_size_swa = -1; for (const auto& node : m_nodes) { diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index c07a7ccb16e3c..9845fe0a02aa5 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 3a1ca341664c5..b4103378ebb1b 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -59,23 +59,13 @@ OutputVector translate_mulmat(const NodeContext& context) { auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); + Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; Output batch_large = A_batch_larger ? A_batch_node : B_batch_node; - - ov::Output broadcast_shape; - ov::Output Z_unsqueezed; - if (context.is_static()) { - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); - broadcast_shape = - std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); - } else { - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2}); - Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); - auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - broadcast_shape = - std::make_shared(ov::OutputVector{one_1d, batch_small, factor_node, Z_last_two_dims}, 0); - } + auto broadcast_shape = + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dims}, 0); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index ea5e417965eb3..5f86f47c1cca3 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -40,15 +40,6 @@ OutputVector translate_permute(const NodeContext& context) { } } else { auto src = context.get_input(0); - Output attention_size; - if (context.is_static()) { - attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); - } else if (op_case == 2) { - attention_size = context.get_input("attention_size"); - } else { - attention_size = context.get_input("attention_size_swa"); - } - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); if (context.is_static()) { @@ -58,9 +49,8 @@ OutputVector translate_permute(const NodeContext& context) { src, ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), false); - auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, zero); - res = std::make_shared(src_slice, - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + res = std::make_shared( + src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { if (src.get_partial_shape().rank() == 3) { src = std::make_shared(src, zero); diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 12db9e82a08cf..1aa3bf76a06bb 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -7,10 +7,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -59,20 +57,9 @@ OutputVector translate_soft_max(const NodeContext& context) { } else { auto token_len = get_dimensions(input_node, {1}); auto mask_node = context.get_input(1); - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); - auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto inp_pos = context.get_input("inp_pos"); - auto shape_of_inp_pos = std::make_shared(inp_pos); - auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); - mask_node_sliced = - std::make_shared(mask_node, zero_2d, stop, one_2d, axes); - if (!(context.is_static())) { - mask_node_sliced = std::make_shared(mask_node_sliced, zero_1d); - } + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); } if (mask_node_sliced.get_element_type() != context.get_output_type(0)) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 0b16c06fd04b1..e35599084e973 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -11,14 +11,15 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include +#include #include #include #include @@ -88,15 +89,27 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { if (is_static) { mask_sliced = mask; } else { - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1}); + auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); - auto shape_of_inp_pos = std::make_shared(inp_pos); - auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}); + + std::shared_ptr kv_len; + { + auto start = ov::op::v0::Constant::create(element::i64, Shape{3}, {0, 0, -1}); + auto stride = ov::op::v0::Constant::create(element::i64, Shape{3}, {1, 1, 1}); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + kv_len = std::make_shared( + inp_pos, start, start, stride, std::vector{0, 0, 0}, std::vector{1, 1, 1}); + } + kv_len = std::make_shared( + kv_len, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + kv_len = std::make_shared(kv_len, ov::element::i64); + kv_len = std::make_shared(kv_len, one_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); + mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); @@ -108,7 +121,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { }; create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static()); - create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); + // swa is not working for the `kv_len` is not correct + // create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); } void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { @@ -132,7 +146,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); - // add_sliced_mask(tensor_map, ggml_model_decoder); + add_sliced_mask(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); } From 66e503bdbbc552a5a432b679e617a4b74b127d91 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Sat, 11 Oct 2025 13:45:39 +0800 Subject: [PATCH 156/156] fix after rebasing --- ggml/src/ggml-openvino/ggml-openvino.cpp | 1 + ggml/src/ggml-openvino/openvino/op/set_rows.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 648acb4e35ede..309fc19b37166 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -70,6 +70,7 @@ static const ggml_backend_i ggml_backend_openvino_interface = { /* .graph_compute = */ ggml_backend_openvino_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .graph_optimize = */ NULL, }; int ggml_backend_openvino_get_device_count() { diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 0b2f29441aec7..001bd087734d9 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -25,7 +25,7 @@ namespace ggml { namespace op { OutputVector translate_set_rows(const NodeContext& context) { - num_inputs_check(context, 2, 2); + num_inputs_check(context, 3, 3); auto data = context.get_input(0); data = std::make_shared(data, context.get_output_type(0));