diff --git a/.github/mergify.yml b/.github/mergify.yml
deleted file mode 100644
index 9e715eb3..00000000
--- a/.github/mergify.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-queue_rules:
-  - name: default
-    conditions:
-      - label=ci:mergify
-
-
-pull_request_rules:
-  - name: push to default merge queue
-    conditions:
-      - base=main
-      - label=ci:mergify
-      - check-success=cla/google
-    actions:
-      queue:
-        name: default
-        require_branch_protection: true
-        method: squash
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
deleted file mode 100644
index d5f5ebc7..00000000
--- a/.github/workflows/ci.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-# YAML schema for GitHub Actions:
-# https://help.github.com/en/actions/automating-your-workflow-with-github-actions/workflow-syntax-for-github-actions
-#
-# Helpful YAML parser to clarify YAML syntax:
-# https://yaml-online-parser.appspot.com/
-#
-#
-# This file contains the workflows that are run prior to merging a pull request.
-
-name: CI
-
-on:
-  pull_request:
-    types: [labeled]
-    branches:
-      - main
-
-#  schedule:
-    # 10am UTC is 3am or 4am PT depending on daylight savings.
-#    - cron: '0 10 * * *'
-
-  # Allow manually triggering of the workflow.
-  workflow_dispatch: {}
-
-jobs:
-  arduino:
-    runs-on: ubuntu-latest
-
-    if: |
-      github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ci:run')) ||
-      (github.event_name == 'schedule' && github.repository == 'tensorflow/tflite-micro-arduino-examples')
-
-    name: Arduino CLI Build
-    steps:
-      - uses: actions/checkout@v2
-      - name: Test
-        run: |
-          ./scripts/install_arduino_cli.sh
-          ./scripts/test_arduino_library.sh "${PWD}"
diff --git a/.github/workflows/remove-labels.yml b/.github/workflows/remove-labels.yml
deleted file mode 100644
index 7e7b6849..00000000
--- a/.github/workflows/remove-labels.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: Remove Labels
-
-on:
-  pull_request_target:
-    types: [labeled]
-
-jobs:
-  label_cleanup:
-    runs-on: ubuntu-latest
-
-    name: remove CI runner labels
-    steps:
-      - name: remove tags
-        uses: actions/github-script@a3e7071a34d7e1f219a8a4de9a5e0a34d1ee1293
-        with:
-          github-token: ${{secrets.GITHUB_TOKEN}}
-          script: |
-            github.issues.removeLabel({
-              issue_number: context.issue.number,
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              name: 'ci:run'
-            })
-            github.issues.removeLabel({
-              issue_number: context.issue.number,
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              name: 'ci:test'
-            })
-        # Prevent erroring out if label doesn't exist
-        continue-on-error: true
diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml
index 3f3fce13..e44ebff5 100644
--- a/.github/workflows/sync.yml
+++ b/.github/workflows/sync.yml
@@ -5,7 +5,7 @@
 # https://yaml-online-parser.appspot.com/
 #
 
-name: (Arduino) Sync from tflite-micro
+name: (RTduino) Sync from tflite-micro
 
 on:
 #  schedule:
@@ -21,7 +21,7 @@ jobs:
     if: |
       github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'tensorflow/tflite-micro-arduino-examples')
+      (github.event_name == 'schedule' && github.repository == 'RTduino-libraries/TensorFlow-Lite-Micro')
 
     steps:
       - uses: actions/setup-python@v2
@@ -31,7 +31,7 @@ jobs:
       - name: Sync the code
         run: |
-          pip3 install six Pillow Wave
+          pip3 install six Pillow Wave numpy
           ./scripts/sync_from_tflite_micro.sh
           git config --local user.name "TFLM-bot"
          git config --local user.email "tflm-github-bot@google.com"
@@ -56,4 +56,4 @@ jobs:
           author: TFLM-bot
"(Arduino) Automated sync from github.com/tensorflow/tflite-micro" labels: bot:sync-tf, ci:run - reviewers: advaitjain + reviewers: mysterywolf diff --git a/scripts/sync_from_tflite_micro.sh b/scripts/sync_from_tflite_micro.sh index c696c9d8..df77d91c 100755 --- a/scripts/sync_from_tflite_micro.sh +++ b/scripts/sync_from_tflite_micro.sh @@ -27,7 +27,7 @@ TEMP_DIR=$(mktemp -d) cd "${TEMP_DIR}" echo Cloning tflite-micro repo to "${TEMP_DIR}" -git clone --depth 1 --single-branch "https://github.com/tensorflow/tflite-micro.git" +git clone --depth 1 --single-branch "https://github.com/RTduino-libraries/tflite-micro.git" -b sync-baseline cd tflite-micro make -f tensorflow/lite/micro/tools/make/Makefile clean_downloads diff --git a/src/tensorflow/lite/builtin_ops.h b/src/tensorflow/lite/builtin_ops.h index 33707308..f9871add 100644 --- a/src/tensorflow/lite/builtin_ops.h +++ b/src/tensorflow/lite/builtin_ops.h @@ -186,6 +186,9 @@ typedef enum { kTfLiteBuiltinAtan2 = 156, kTfLiteBuiltinUnsortedSegmentMin = 157, kTfLiteBuiltinSign = 158, + kTfLiteBuiltinBitcast = 159, + kTfLiteBuiltinBitwiseXor = 160, + kTfLiteBuiltinRightShift = 161, } TfLiteBuiltinOperator; #ifdef __cplusplus diff --git a/src/tensorflow/lite/c/common.h b/src/tensorflow/lite/c/common.h index 0e485812..e3e8001c 100644 --- a/src/tensorflow/lite/c/common.h +++ b/src/tensorflow/lite/c/common.h @@ -38,10 +38,4 @@ limitations under the License. #include "tensorflow/lite/core/c/common.h" -// TfLiteOpaqueDelegate: allows delegation of nodes to alternative backends. -// TfLiteOpaqueDelegate is an abstract type that is intended to have the same -// role as TfLiteDelegate, but without necessarily exposing the implementation -// details of how delegates are implemented. -typedef TfLiteDelegate TfLiteOpaqueDelegate; - #endif // TENSORFLOW_LITE_C_COMMON_H_ diff --git a/src/tensorflow/lite/core/api/flatbuffer_conversions.cpp b/src/tensorflow/lite/core/api/flatbuffer_conversions.cpp index 117bc75e..68b94d95 100644 --- a/src/tensorflow/lite/core/api/flatbuffer_conversions.cpp +++ b/src/tensorflow/lite/core/api/flatbuffer_conversions.cpp @@ -256,6 +256,10 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseElu(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_EMBEDDING_LOOKUP: { + return ParseEmbeddingLookup(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_EXP: { return ParseExp(op, error_reporter, allocator, builtin_data); } @@ -542,6 +546,14 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseZerosLike(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_BITWISE_XOR: { + return ParseBitwiseXor(op, error_reporter, allocator, builtin_data); + } + + case BuiltinOperator_RIGHT_SHIFT: { + return ParseRightShift(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_CAST: { return ParseCast(op, error_reporter, allocator, builtin_data); } @@ -845,6 +857,7 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } + // Below are the ops with no builtin_data structure. // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are // ok for now, since there is no call implementation either. 
@@ -855,7 +868,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_CUSTOM:
     case BuiltinOperator_DENSIFY:
     case BuiltinOperator_DYNAMIC_UPDATE_SLICE:
-    case BuiltinOperator_EMBEDDING_LOOKUP:
     case BuiltinOperator_EQUAL:
     case BuiltinOperator_HASHTABLE_FIND:
     case BuiltinOperator_HASHTABLE_IMPORT:
@@ -885,6 +897,7 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_UNSORTED_SEGMENT_SUM:
     case BuiltinOperator_ATAN2:
     case BuiltinOperator_SIGN:
+    case BuiltinOperator_BITCAST:
     case BuiltinOperator_WHERE:
       return kTfLiteOk;
     case BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES:
@@ -1335,6 +1348,14 @@ TfLiteStatus ParseElu(const Operator*, ErrorReporter*, BuiltinDataAllocator*,
   return kTfLiteOk;
 }
 
+// We have this parse function instead of directly returning kTfLiteOk from the
+// switch-case in ParseOpData because this function is used as part of the
+// selective registration for the OpResolver implementation in micro.
+TfLiteStatus ParseEmbeddingLookup(const Operator*, ErrorReporter*,
+                                  BuiltinDataAllocator*, void**) {
+  return kTfLiteOk;
+}
+
 // We have this parse function instead of directly returning kTfLiteOk from the
 // switch-case in ParseOpData because this function is used as part of the
 // selective registration for the OpResolver implementation in micro.
@@ -2441,6 +2462,22 @@ TfLiteStatus ParseZerosLike(const Operator*, ErrorReporter*,
   return kTfLiteOk;
 }
 
+// We have this parse function instead of directly returning kTfLiteOk from the
+// switch-case in ParseOpData because this function is used as part of the
+// selective registration for the OpResolver implementation in micro.
+TfLiteStatus ParseBitwiseXor(const Operator*, ErrorReporter*,
+                             BuiltinDataAllocator*, void**) {
+  return kTfLiteOk;
+}
+
+// We have this parse function instead of directly returning kTfLiteOk from the
+// switch-case in ParseOpData because this function is used as part of the
+// selective registration for the OpResolver implementation in micro.
+TfLiteStatus ParseRightShift(const Operator*, ErrorReporter*,
+                             BuiltinDataAllocator*, void**) {
+  return kTfLiteOk;
+}
+
 TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
                          ErrorReporter* error_reporter,
                          BuiltinDataAllocator* allocator, void** builtin_data) {
diff --git a/src/tensorflow/lite/core/api/flatbuffer_conversions.h b/src/tensorflow/lite/core/api/flatbuffer_conversions.h
index 4df83d5e..9ffe3971 100644
--- a/src/tensorflow/lite/core/api/flatbuffer_conversions.h
+++ b/src/tensorflow/lite/core/api/flatbuffer_conversions.h
@@ -151,6 +151,11 @@ TfLiteStatus ParseDiv(const Operator* op, ErrorReporter* error_reporter,
 TfLiteStatus ParseElu(const Operator* op, ErrorReporter* error_reporter,
                       BuiltinDataAllocator* allocator, void** builtin_data);
 
+TfLiteStatus ParseEmbeddingLookup(const Operator* op,
+                                  ErrorReporter* error_reporter,
+                                  BuiltinDataAllocator* allocator,
+                                  void** builtin_data);
+
 TfLiteStatus ParseEqual(const Operator* op, ErrorReporter* error_reporter,
                         BuiltinDataAllocator* allocator, void** builtin_data);
 
@@ -407,6 +412,14 @@ TfLiteStatus ParseZerosLike(const Operator* op, ErrorReporter* error_reporter,
                             BuiltinDataAllocator* allocator,
                             void** builtin_data);
 
+TfLiteStatus ParseBitwiseXor(const Operator* op, ErrorReporter* error_reporter,
+                             BuiltinDataAllocator* allocator,
+                             void** builtin_data);
+
+TfLiteStatus ParseRightShift(const Operator* op, ErrorReporter* error_reporter,
+                             BuiltinDataAllocator* allocator,
+                             void** builtin_data);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
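Editor's note on the per-op parse functions above: each builtin gets its own `ParseXxx` symbol, even when there is nothing to parse, so that TFLM's selective registration can reference exactly the parsers a model needs and the linker can discard the rest. A minimal toy sketch of that pattern (the `Registration` table and simplified signatures here are invented for illustration and are not the real TfLite types):

```cpp
#include <cstdio>

// Stand-ins for the real per-op parsers (tflite::ParseBitwiseXor etc.), which
// take flatbuffer, error-reporter, and allocator arguments; trivial here.
static int ParseBitwiseXor(void** builtin_data) { *builtin_data = nullptr; return 0; }
static int ParseRightShift(void** builtin_data) { *builtin_data = nullptr; return 0; }

struct Registration {
  const char* name;
  int (*parse)(void**);  // one distinct symbol per op
};

// Only parsers named in a table like this get linked into the final binary;
// ops handled inline inside one big switch could not be stripped individually.
static const Registration kOps[] = {
    {"BITWISE_XOR", ParseBitwiseXor},
    {"RIGHT_SHIFT", ParseRightShift},
};

int main() {
  for (const Registration& op : kOps) {
    void* builtin_data = nullptr;
    if (op.parse(&builtin_data) == 0) std::printf("registered %s\n", op.name);
  }
  return 0;
}
```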
diff --git a/src/tensorflow/lite/core/c/c_api_types.h b/src/tensorflow/lite/core/c/c_api_types.h
index 3aab43f4..670ec1ee 100644
--- a/src/tensorflow/lite/core/c/c_api_types.h
+++ b/src/tensorflow/lite/core/c/c_api_types.h
@@ -21,6 +21,7 @@ limitations under the License.
 /// "third_party/tensorflow/lite/c/c_api_types.h".
 /// Only the TensorFlow Lite implementation itself should include this
 /// file directly.
+// IWYU pragma: private, include "third_party/tensorflow/lite/c/c_api_types.h"
 
 #ifndef TENSORFLOW_LITE_CORE_C_C_API_TYPES_H_
 #define TENSORFLOW_LITE_CORE_C_C_API_TYPES_H_
diff --git a/src/tensorflow/lite/core/c/common.cpp b/src/tensorflow/lite/core/c/common.cpp
index 00bbcde2..a25abcfb 100644
--- a/src/tensorflow/lite/core/c/common.cpp
+++ b/src/tensorflow/lite/core/c/common.cpp
@@ -98,11 +98,22 @@ TfLiteFloatArray* TfLiteFloatArrayCreate(int size) {
   return ret;
 }
 
+TfLiteFloatArray* TfLiteFloatArrayCopy(const TfLiteFloatArray* src) {
+  if (!src) return nullptr;
+  TfLiteFloatArray* ret = TfLiteFloatArrayCreate(src->size);
+  if (ret) {
+    memcpy(ret->data, src->data, src->size * sizeof(float));
+  }
+  return ret;
+}
+
 void TfLiteFloatArrayFree(TfLiteFloatArray* a) { free(a); }
 
 void TfLiteTensorDataFree(TfLiteTensor* t) {
-  if (t->allocation_type == kTfLiteDynamic ||
-      t->allocation_type == kTfLitePersistentRo) {
+  if (t->allocation_type == kTfLiteVariantObject) {
+    delete reinterpret_cast<VariantData*>(t->data.data);
+  } else if (t->allocation_type == kTfLiteDynamic ||
+             t->allocation_type == kTfLitePersistentRo) {
     if (t->data.raw) {
 #ifdef TF_LITE_TENSORFLOW_PROFILER
       tflite::PauseHeapMonitoring(/*pause=*/true);
@@ -207,11 +218,16 @@ TfLiteStatus TfLiteTensorCopy(const TfLiteTensor* src, TfLiteTensor* dst) {
   if (!src || !dst) return kTfLiteOk;
   if (src->bytes != dst->bytes) return kTfLiteError;
   if (src == dst) return kTfLiteOk;
 
-  dst->type = src->type;
   if (dst->dims) TfLiteIntArrayFree(dst->dims);
   dst->dims = TfLiteIntArrayCopy(src->dims);
-  memcpy(dst->data.raw, src->data.raw, src->bytes);
+  if (src->allocation_type == kTfLiteVariantObject) {
+    if (dst->allocation_type != kTfLiteVariantObject) return kTfLiteError;
+    dst->data.data =
+        reinterpret_cast<VariantData*>(src->data.data)->Clone(dst->data.raw);
+  } else {
+    memcpy(dst->data.raw, src->data.raw, src->bytes);
+  }
   dst->buffer_handle = src->buffer_handle;
   dst->data_is_stale = src->data_is_stale;
   dst->delegate = src->delegate;
diff --git a/src/tensorflow/lite/core/c/common.h b/src/tensorflow/lite/core/c/common.h
index 8ca987d2..9bf98971 100644
--- a/src/tensorflow/lite/core/c/common.h
+++ b/src/tensorflow/lite/core/c/common.h
@@ -38,6 +38,7 @@ limitations under the License.
 /// "third_party/tensorflow/lite/c/common.h".
 /// Only the TensorFlow Lite implementation itself should include this
 /// file directly.
+// IWYU pragma: private, include "third_party/tensorflow/lite/c/common.h"
 
 #ifndef TENSORFLOW_LITE_CORE_C_COMMON_H_
 #define TENSORFLOW_LITE_CORE_C_COMMON_H_
@@ -157,6 +158,10 @@ int TfLiteFloatArrayGetSizeInBytes(int size);
 // This returns a pointer, that you must free using TfLiteFloatArrayFree().
 TfLiteFloatArray* TfLiteFloatArrayCreate(int size);
 
+// Create a copy of an array passed as `src`.
+// You are expected to free memory with TfLiteFloatArrayFree.
+TfLiteFloatArray* TfLiteFloatArrayCopy(const TfLiteFloatArray* src);
+
 // Free memory of array `a`.
 void TfLiteFloatArrayFree(TfLiteFloatArray* a);
 #endif  // TF_LITE_STATIC_MEMORY
@@ -345,6 +350,8 @@ typedef union TfLitePtrUnion {
 //        as constant inputs for downstream ops (also in prepare).
 //  * kTfLiteCustom: Custom memory allocation provided by the user. See
 //    TfLiteCustomAllocation below.
+//  * kTfLiteVariantObject: Allocation is an arbitrary type-erased C++ object.
+//    Allocation and deallocation are done through `new` and `delete`.
 typedef enum TfLiteAllocationType {
   kTfLiteMemNone = 0,
   kTfLiteMmapRo,
@@ -353,6 +360,7 @@
   kTfLiteDynamic,
   kTfLitePersistentRo,
   kTfLiteCustom,
+  kTfLiteVariantObject,
 } TfLiteAllocationType;
 
 // The delegates should use zero or positive integers to represent handles.
@@ -959,12 +967,53 @@ typedef struct TfLiteRegistration {
   // ops. We keep it inside of `TfLiteRegistration` and use it to route
   // callbacks properly.
   TfLiteRegistrationExternal* registration_external;
+
+  // Retrieves asynchronous kernel.
+  //
+  // If the `async_kernel` field is nullptr, it means the operation described
+  // by this TfLiteRegistration object does not support asynchronous execution.
+  // Otherwise, the function that the field points to should only be called for
+  // delegate kernel nodes, i.e. `node` should be a delegate kernel node
+  // created by applying a delegate.
+  // If the function returns nullptr, that means that the underlying delegate
+  // does not support asynchronous execution for this `node`.
+  struct TfLiteAsyncKernel* (*async_kernel)(TfLiteContext* context,
+                                            TfLiteNode* node);
 } TfLiteRegistration;
 
+/// \private
 // Old version of `TfLiteRegistration` to maintain binary backward
 // compatibility.
-// WARNING: This structure is deprecated / not an official part of the API.
-// It should be only used for binary backward compatibility.
+// The legacy registration type must be a POD struct type whose field types
+// must be a prefix of the field types in TfLiteRegistration, and the offset
+// of the first field in TfLiteRegistration that is not present in the legacy
+// registration type must be greater than or equal to the size of the legacy
+// registration type.
+// WARNING: This structure is deprecated / not an official part of the
+// API. It should be only used for binary backward compatibility.
+typedef struct TfLiteRegistration_V2 {
+  void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
+  void (*free)(TfLiteContext* context, void* buffer);
+  TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node);
+  TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node);
+  const char* (*profiling_string)(const TfLiteContext* context,
+                                  const TfLiteNode* node);
+  int32_t builtin_code;
+  const char* custom_name;
+  int version;
+  TfLiteRegistrationExternal* registration_external;
+} TfLiteRegistration_V2;
+
+/// \private
+// Old version of `TfLiteRegistration` to maintain binary backward
+// compatibility.
+// The legacy registration type must be a POD struct type whose field types
+// must be a prefix of the field types in TfLiteRegistration, and the offset
+// of the first field in TfLiteRegistration that is not present in the legacy
+// registration type must be greater than or equal to the size of the legacy
+// registration type.
+// WARNING: This structure is deprecated / not an official part of the
+// API. It should be only used for binary backward compatibility.
 typedef struct TfLiteRegistration_V1 {
   void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
   void (*free)(TfLiteContext* context, void* buffer);
@@ -1155,5 +1204,74 @@ void* TfLiteOpaqueDelegateGetData(const TfLiteOpaqueDelegate* delegate);
 
 #ifdef __cplusplus
 }  // extern "C"
+
+#include <utility>
+
+// `kTfLiteVariant` type tensors encode arbitrary C++ objects behind their
+// `data.data : void*` member. This is the type-erased interface for
+// interacting with such objects at runtime.
+// Deleting or Cloning any `VariantData` will call the destructor and copy
+// constructor of the erased type automatically. For example usage, see
+// `common_test.cc`.
+class VariantData {
+ public:
+  // All variant objects must be able to be destroyed and copied.
+  virtual ~VariantData() = default;
+  // This allows for a "virtual copy-constructor" like pattern.
+  // In most cases, we will be copying from an input to an output tensor.
+  // Often, the output tensor is already allocated so we can pass
+  // a pointer to its buffer for reuse.
+  virtual VariantData* Clone(char* maybe_alloc) const = 0;
+};
+
+// An abstract base class for variant objects. The template parameter
+// is the type we are erasing.
+template <class ErasedDerived>
+class AbstractVariantData : public VariantData {
+ public:
+  VariantData* Clone(char* maybe_alloc) const override {
+    if (maybe_alloc) {
+      // We assume that the output tensor is already a variant of the same
+      // derived type. If the output is still allocated, then it still may
+      // have state that was not destroyed, so we must call the destructor
+      // before using the buffer.
+      // This may actually have a non-negligible effect on performance if the
+      // destructor is complex. In a future optimization we would want to
+      // introduce something like "move to" semantics, allowing for the
+      // underlying implementation to optimize for this case.
+      reinterpret_cast<VariantData*>(maybe_alloc)->~VariantData();
+      return new (maybe_alloc)
+          ErasedDerived(static_cast<const ErasedDerived&>(*this));
+    }
+    return new ErasedDerived(static_cast<const ErasedDerived&>(*this));
+  }
+
+ protected:
+  AbstractVariantData() = default;
+  AbstractVariantData(const AbstractVariantData&) = default;
+  AbstractVariantData(AbstractVariantData&&) = delete;
+};
+
+// Analogous to `TfLiteTensorRealloc` for allocation of tensors whose
+// data member points to an arbitrary C++ object. `VariantType` refers
+// to the erased type of said object and `VariantArgs` refers to
+// a list of argument types with which to construct a new `VariantType`.
+// `VariantArgs` must match a constructor in `VariantType`.
+template <class VariantType, class... VariantArgs>
+TfLiteStatus TfLiteTensorVariantRealloc(TfLiteTensor* t,
+                                        VariantArgs&&... args) {
+  if (t->type != kTfLiteVariant) return kTfLiteError;
+  if (t->data.raw) {
+    reinterpret_cast<VariantData*>(t->data.data)->~VariantData();
+    // For now we assume if `t` is already allocated then it was allocated
+    // with the same `VariantType` as templated.
+    t->data.data =
+        new (t->data.raw) VariantType(std::forward<VariantArgs>(args)...);
+  } else {
+    t->data.data = new VariantType(std::forward<VariantArgs>(args)...);
+  }
+  t->allocation_type = kTfLiteVariantObject;
+  return kTfLiteOk;
+}
+
 #endif  // __cplusplus
 #endif  // TENSORFLOW_LITE_CORE_C_COMMON_H_
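The variant machinery above is easiest to see with a concrete erased type. A minimal sketch: the `IntList` type and the usage comments are invented for illustration; only `VariantData`, `AbstractVariantData`, and `TfLiteTensorVariantRealloc` come from the header being patched.

```cpp
#include <utility>
#include <vector>
#include "tensorflow/lite/core/c/common.h"  // VariantData et al.

// A user-defined payload for a kTfLiteVariant tensor. Deriving from
// AbstractVariantData<IntList> supplies the Clone() that VariantData requires,
// implemented in terms of IntList's copy constructor.
class IntList : public AbstractVariantData<IntList> {
 public:
  explicit IntList(std::vector<int> values) : values_(std::move(values)) {}
  const std::vector<int>& values() const { return values_; }

 private:
  std::vector<int> values_;
};

// Hypothetical usage, assuming `t` is a TfLiteTensor of type kTfLiteVariant:
//   TfLiteTensorVariantRealloc<IntList>(t, std::vector<int>{1, 2, 3});
//   auto* list = static_cast<IntList*>(static_cast<VariantData*>(t->data.data));
//   // TfLiteTensorDataFree(t) later destroys the object via ~VariantData.
```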
diff --git a/src/tensorflow/lite/core/macros.h b/src/tensorflow/lite/core/macros.h
new file mode 100644
index 00000000..8ebc8db2
--- /dev/null
+++ b/src/tensorflow/lite/core/macros.h
@@ -0,0 +1,68 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This provides utility macros and functions that are inherently platform
+// specific.
+#ifndef TENSORFLOW_LITE_CORE_MACROS_H_
+#define TENSORFLOW_LITE_CORE_MACROS_H_
+
+#ifdef __has_builtin
+#define TFLITE_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define TFLITE_HAS_BUILTIN(x) 0
+#endif
+
+#if (!defined(__NVCC__)) && (TFLITE_HAS_BUILTIN(__builtin_expect) || \
+                             (defined(__GNUC__) && __GNUC__ >= 3))
+#define TFLITE_EXPECT_FALSE(cond) __builtin_expect(cond, false)
+#define TFLITE_EXPECT_TRUE(cond) __builtin_expect(!!(cond), true)
+#else
+#define TFLITE_EXPECT_FALSE(cond) (cond)
+#define TFLITE_EXPECT_TRUE(cond) (cond)
+#endif
+
+#ifdef _WIN32
+#define TFLITE_NOINLINE __declspec(noinline)
+#else
+#ifdef __has_attribute
+#if __has_attribute(noinline)
+#define TFLITE_NOINLINE __attribute__((noinline))
+#else
+#define TFLITE_NOINLINE
+#endif  // __has_attribute(noinline)
+#else
+#define TFLITE_NOINLINE
+#endif  // __has_attribute
+#endif  // _WIN32
+
+// Normally we'd use ABSL_HAVE_ATTRIBUTE_WEAK and ABSL_ATTRIBUTE_WEAK, but
+// we avoid the absl dependency for binary size reasons.
+#ifdef __has_attribute
+#define TFLITE_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define TFLITE_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if (TFLITE_HAS_ATTRIBUTE(weak) ||                  \
+     (defined(__GNUC__) && !defined(__clang__))) && \
+    !(defined(__llvm__) && defined(_WIN32)) && !defined(__MINGW32__)
+#undef TFLITE_ATTRIBUTE_WEAK
+#define TFLITE_ATTRIBUTE_WEAK __attribute__((weak))
+#define TFLITE_HAS_ATTRIBUTE_WEAK 1
+#else
+#define TFLITE_ATTRIBUTE_WEAK
+#define TFLITE_HAS_ATTRIBUTE_WEAK 0
+#endif
+
+#endif  // TENSORFLOW_LITE_CORE_MACROS_H_
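A small usage sketch for the new macros; the checked-add function and values are invented for illustration, only the `TFLITE_*` macros come from the file above.

```cpp
#include <cstdio>
#include "tensorflow/lite/core/macros.h"

// Keep the cold error path out of the caller's inline budget.
static TFLITE_NOINLINE void ReportOverflow() {
  std::fprintf(stderr, "overflow\n");
}

static int AddChecked(int a, int b) {
  const long long sum = static_cast<long long>(a) + b;
  // Marking the overflow branch unlikely lets the compiler lay out the
  // non-overflow path as the fall-through hot path.
  if (TFLITE_EXPECT_FALSE(sum > 2147483647LL || sum < -2147483648LL)) {
    ReportOverflow();
    return 0;
  }
  return static_cast<int>(sum);
}

int main() { return AddChecked(1, 2) == 3 ? 0 : 1; }
```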
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/bits.h b/src/tensorflow/lite/experimental/microfrontend/lib/bits.h
deleted file mode 100644
index 04b3ba6f..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/bits.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_BITS_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_BITS_H_
-
-#ifdef __cplusplus
-#include <cstdint>
-
-extern "C" {
-#endif
-
-static inline int CountLeadingZeros32Slow(uint64_t n) {
-  int zeroes = 28;
-  if (n >> 16) zeroes -= 16, n >>= 16;
-  if (n >> 8) zeroes -= 8, n >>= 8;
-  if (n >> 4) zeroes -= 4, n >>= 4;
-  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
-}
-
-static inline int CountLeadingZeros32(uint32_t n) {
-#if defined(_MSC_VER)
-  unsigned long result = 0;  // NOLINT(runtime/int)
-  if (_BitScanReverse(&result, n)) {
-    return 31 - result;
-  }
-  return 32;
-#elif defined(__GNUC__)
-
-  // Handle 0 as a special case because __builtin_clz(0) is undefined.
-  if (n == 0) {
-    return 32;
-  }
-  return __builtin_clz(n);
-#else
-  return CountLeadingZeros32Slow(n);
-#endif
-}
-
-static inline int MostSignificantBit32(uint32_t n) {
-  return 32 - CountLeadingZeros32(n);
-}
-
-static inline int CountLeadingZeros64Slow(uint64_t n) {
-  int zeroes = 60;
-  if (n >> 32) zeroes -= 32, n >>= 32;
-  if (n >> 16) zeroes -= 16, n >>= 16;
-  if (n >> 8) zeroes -= 8, n >>= 8;
-  if (n >> 4) zeroes -= 4, n >>= 4;
-  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
-}
-
-static inline int CountLeadingZeros64(uint64_t n) {
-#if defined(_MSC_VER) && defined(_M_X64)
-  // MSVC does not have __builtin_clzll. Use _BitScanReverse64.
-  unsigned long result = 0;  // NOLINT(runtime/int)
-  if (_BitScanReverse64(&result, n)) {
-    return 63 - result;
-  }
-  return 64;
-#elif defined(_MSC_VER)
-  // MSVC does not have __builtin_clzll. Compose two calls to _BitScanReverse
-  unsigned long result = 0;  // NOLINT(runtime/int)
-  if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
-    return 31 - result;
-  }
-  if (_BitScanReverse(&result, n)) {
-    return 63 - result;
-  }
-  return 64;
-#elif defined(__GNUC__)
-
-  // Handle 0 as a special case because __builtin_clzll(0) is undefined.
-  if (n == 0) {
-    return 64;
-  }
-  return __builtin_clzll(n);
-#else
-  return CountLeadingZeros64Slow(n);
-#endif
-}
-
-static inline int MostSignificantBit64(uint64_t n) {
-  return 64 - CountLeadingZeros64(n);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_BITS_H_
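For readers tracking this removal: the helpers return the 1-based index of the highest set bit (0 for zero input), which the frontend uses to pick fixed-point shifts. A quick self-contained check against the definitions above (re-declaring only the GCC/Clang builtin path for brevity):

```cpp
#include <cassert>
#include <cstdint>

// Same definitions as the removed MostSignificantBit32 helper (GCC path).
static int CountLeadingZeros32(uint32_t n) { return n == 0 ? 32 : __builtin_clz(n); }
static int MostSignificantBit32(uint32_t n) { return 32 - CountLeadingZeros32(n); }

int main() {
  assert(MostSignificantBit32(0u) == 0);
  assert(MostSignificantBit32(1u) == 1);     // 0b1     -> bit 1
  assert(MostSignificantBit32(0x10u) == 5);  // 0b10000 -> bit 5
  assert(MostSignificantBit32(0xFFFFFFFFu) == 32);
  return 0;
}
```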
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/fft.cpp b/src/tensorflow/lite/experimental/microfrontend/lib/fft.cpp
deleted file mode 100644
index bcdd9cc0..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/fft.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
-
-#include <string.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h"
-
-void FftCompute(struct FftState* state, const int16_t* input,
-                int input_scale_shift) {
-  const size_t input_size = state->input_size;
-  const size_t fft_size = state->fft_size;
-
-  int16_t* fft_input = state->input;
-  // First, scale the input by the given shift.
-  size_t i;
-  for (i = 0; i < input_size; ++i) {
-    fft_input[i] = static_cast<int16_t>(static_cast<uint16_t>(input[i])
-                                        << input_scale_shift);
-  }
-  // Zero out whatever else remains in the top part of the input.
-  for (; i < fft_size; ++i) {
-    fft_input[i] = 0;
-  }
-
-  // Apply the FFT.
-  kissfft_fixed16::kiss_fftr(
-      reinterpret_cast<kissfft_fixed16::kiss_fftr_cfg>(state->scratch),
-      state->input,
-      reinterpret_cast<kissfft_fixed16::kiss_fft_cpx*>(state->output));
-}
-
-void FftInit(struct FftState* state) {
-  // All the initialization is done in FftPopulateState()
-}
-
-void FftReset(struct FftState* state) {
-  memset(state->input, 0, state->fft_size * sizeof(*state->input));
-  memset(state->output, 0, (state->fft_size / 2 + 1) * sizeof(*state->output));
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/fft.h b/src/tensorflow/lite/experimental/microfrontend/lib/fft.h
deleted file mode 100644
index aaffa69d..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/fft.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct complex_int16_t {
-  int16_t real;
-  int16_t imag;
-};
-
-struct FftState {
-  int16_t* input;
-  struct complex_int16_t* output;
-  size_t fft_size;
-  size_t input_size;
-  void* scratch;
-  size_t scratch_size;
-};
-
-void FftCompute(struct FftState* state, const int16_t* input,
-                int input_scale_shift);
-
-void FftInit(struct FftState* state);
-
-void FftReset(struct FftState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.cpp b/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.cpp
deleted file mode 100644
index ed3dc8fb..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
-
-#include <stdio.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h"
-
-int FftPopulateState(struct FftState* state, size_t input_size) {
-  state->input_size = input_size;
-  state->fft_size = 1;
-  while (state->fft_size < state->input_size) {
-    state->fft_size <<= 1;
-  }
-
-  state->input = reinterpret_cast<int16_t*>(
-      malloc(state->fft_size * sizeof(*state->input)));
-  if (state->input == nullptr) {
-    fprintf(stderr, "Failed to alloc fft input buffer\n");
-    return 0;
-  }
-
-  state->output = reinterpret_cast<struct complex_int16_t*>(
-      malloc((state->fft_size / 2 + 1) * sizeof(*state->output) * 2));
-  if (state->output == nullptr) {
-    fprintf(stderr, "Failed to alloc fft output buffer\n");
-    return 0;
-  }
-
-  // Ask kissfft how much memory it wants.
-  size_t scratch_size = 0;
-  kissfft_fixed16::kiss_fftr_cfg kfft_cfg = kissfft_fixed16::kiss_fftr_alloc(
-      state->fft_size, 0, nullptr, &scratch_size);
-  if (kfft_cfg != nullptr) {
-    fprintf(stderr, "Kiss memory sizing failed.\n");
-    return 0;
-  }
-  state->scratch = malloc(scratch_size);
-  if (state->scratch == nullptr) {
-    fprintf(stderr, "Failed to alloc fft scratch buffer\n");
-    return 0;
-  }
-  state->scratch_size = scratch_size;
-  // Let kissfft configure the scratch space we just allocated
-  kfft_cfg = kissfft_fixed16::kiss_fftr_alloc(state->fft_size, 0,
-                                              state->scratch, &scratch_size);
-  if (kfft_cfg != state->scratch) {
-    fprintf(stderr, "Kiss memory preallocation strategy failed.\n");
-    return 0;
-  }
-  return 1;
-}
-
-void FftFreeStateContents(struct FftState* state) {
-  free(state->input);
-  free(state->output);
-  free(state->scratch);
-}
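Taken together, the two removed FFT files were used roughly like this; a condensed sketch with error handling and the scale-shift heuristic from frontend.c trimmed.

```cpp
#include <cstddef>
#include <cstdint>
#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
#include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"

void FftRoundTrip(const int16_t* samples, size_t num_samples) {
  struct FftState fft;
  // Rounds num_samples up to the next power of two internally.
  if (!FftPopulateState(&fft, num_samples)) return;
  FftInit(&fft);
  // frontend.c derives the shift from the window's max absolute value to
  // maximize fixed-point resolution; 0 means no extra scaling.
  FftCompute(&fft, samples, /*input_scale_shift=*/0);
  // fft.output now holds fft.fft_size / 2 + 1 complex int16 bins.
  FftFreeStateContents(&fft);
}
```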
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.c b/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.c
deleted file mode 100644
index 80f8738f..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
-
-#include <string.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
-
-void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
-                                         struct complex_int16_t* fft_output,
-                                         int32_t* energy) {
-  const int end_index = state->end_index;
-  int i;
-  energy += state->start_index;
-  fft_output += state->start_index;
-  for (i = state->start_index; i < end_index; ++i) {
-    const int32_t real = fft_output->real;
-    const int32_t imag = fft_output->imag;
-    fft_output++;
-    const uint32_t mag_squared = (real * real) + (imag * imag);
-    *energy++ = mag_squared;
-  }
-}
-
-void FilterbankAccumulateChannels(struct FilterbankState* state,
-                                  const int32_t* energy) {
-  uint64_t* work = state->work;
-  uint64_t weight_accumulator = 0;
-  uint64_t unweight_accumulator = 0;
-
-  const int16_t* channel_frequency_starts = state->channel_frequency_starts;
-  const int16_t* channel_weight_starts = state->channel_weight_starts;
-  const int16_t* channel_widths = state->channel_widths;
-
-  int num_channels_plus_1 = state->num_channels + 1;
-  int i;
-  for (i = 0; i < num_channels_plus_1; ++i) {
-    const int32_t* magnitudes = energy + *channel_frequency_starts++;
-    const int16_t* weights = state->weights + *channel_weight_starts;
-    const int16_t* unweights = state->unweights + *channel_weight_starts++;
-    const int width = *channel_widths++;
-    int j;
-    for (j = 0; j < width; ++j) {
-      weight_accumulator += *weights++ * ((uint64_t)*magnitudes);
-      unweight_accumulator += *unweights++ * ((uint64_t)*magnitudes);
-      ++magnitudes;
-    }
-    *work++ = weight_accumulator;
-    weight_accumulator = unweight_accumulator;
-    unweight_accumulator = 0;
-  }
-}
-
-static uint16_t Sqrt32(uint32_t num) {
-  if (num == 0) {
-    return 0;
-  }
-  uint32_t res = 0;
-  int max_bit_number = 32 - MostSignificantBit32(num);
-  max_bit_number |= 1;
-  uint32_t bit = 1U << (31 - max_bit_number);
-  int iterations = (31 - max_bit_number) / 2 + 1;
-  while (iterations--) {
-    if (num >= res + bit) {
-      num -= res + bit;
-      res = (res >> 1U) + bit;
-    } else {
-      res >>= 1U;
-    }
-    bit >>= 2U;
-  }
-  // Do rounding - if we have the bits.
-  if (num > res && res != 0xFFFF) {
-    ++res;
-  }
-  return res;
-}
-
-static uint32_t Sqrt64(uint64_t num) {
-  // Take a shortcut and just use 32 bit operations if the upper word is all
-  // clear. This will cause a slight off by one issue for numbers close to
-  // 2^32, but it probably isn't going to matter (and gives us a big
-  // performance win).
-  if ((num >> 32) == 0) {
-    return Sqrt32((uint32_t)num);
-  }
-  uint64_t res = 0;
-  int max_bit_number = 64 - MostSignificantBit64(num);
-  max_bit_number |= 1;
-  uint64_t bit = 1ULL << (63 - max_bit_number);
-  int iterations = (63 - max_bit_number) / 2 + 1;
-  while (iterations--) {
-    if (num >= res + bit) {
-      num -= res + bit;
-      res = (res >> 1U) + bit;
-    } else {
-      res >>= 1U;
-    }
-    bit >>= 2U;
-  }
-  // Do rounding - if we have the bits.
-  if (num > res && res != 0xFFFFFFFFLL) {
-    ++res;
-  }
-  return res;
-}
-
-uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) {
-  const int num_channels = state->num_channels;
-  const uint64_t* work = state->work + 1;
-  // Reuse the work buffer since we're fine clobbering it at this point to
-  // hold the output.
-  uint32_t* output = (uint32_t*)state->work;
-  int i;
-  for (i = 0; i < num_channels; ++i) {
-    *output++ = Sqrt64(*work++) >> scale_down_shift;
-  }
-  return (uint32_t*)state->work;
-}
-
-void FilterbankReset(struct FilterbankState* state) {
-  memset(state->work, 0, (state->num_channels + 1) * sizeof(*state->work));
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.h b/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.h
deleted file mode 100644
index 1e6d3885..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_H_
-
-#include <stdint.h>
-#include <stdlib.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
-
-#define kFilterbankBits 12
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct FilterbankState {
-  int num_channels;
-  int start_index;
-  int end_index;
-  int16_t* channel_frequency_starts;
-  int16_t* channel_weight_starts;
-  int16_t* channel_widths;
-  int16_t* weights;
-  int16_t* unweights;
-  uint64_t* work;
-};
-
-// Converts the relevant complex values of an FFT output into energy (the
-// square magnitude).
-void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
-                                         struct complex_int16_t* fft_output,
-                                         int32_t* energy);
-
-// Computes the mel-scale filterbank on the given energy array. Output is
-// cached internally - to fetch it, you need to call FilterbankSqrt.
-void FilterbankAccumulateChannels(struct FilterbankState* state,
-                                  const int32_t* energy);
-
-// Applies an integer square root to the 64 bit intermediate values of the
-// filterbank, and returns a pointer to them. Memory will be invalidated the
-// next time FilterbankAccumulateChannels is called.
-uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift);
-
-void FilterbankReset(struct FilterbankState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_H_
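The three filterbank stages above are meant to run in a fixed order per frame, and the sqrt output aliases the internal work buffer, as the header warns. A condensed sketch of that call order (the wrapper function is invented; the API is from the removed header):

```cpp
#include <cstdint>
#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"

// One frame through the removed filterbank: FFT bins -> energy -> mel
// channels -> integer sqrt. `state` is assumed to have been populated via
// FilterbankPopulateState.
uint32_t* FilterbankFrame(struct FilterbankState* state,
                          struct complex_int16_t* fft_output,
                          int32_t* energy, int scale_down_shift) {
  FilterbankConvertFftComplexToEnergy(state, fft_output, energy);
  FilterbankAccumulateChannels(state, energy);
  // The returned buffer aliases state->work and is invalidated by the next
  // FilterbankAccumulateChannels call, so consume it immediately.
  return FilterbankSqrt(state, scale_down_shift);
}
```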
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c
deleted file mode 100644
index f18ebf54..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h"
-
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-
-#define kFilterbankIndexAlignment 4
-#define kFilterbankChannelBlockSize 4
-
-void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config) {
-  config->num_channels = 32;
-  config->lower_band_limit = 125.0f;
-  config->upper_band_limit = 7500.0f;
-  config->output_scale_shift = 7;
-}
-
-static float FreqToMel(float freq) { return 1127.0 * log1p(freq / 700.0); }
-
-static void CalculateCenterFrequencies(const int num_channels,
-                                       const float lower_frequency_limit,
-                                       const float upper_frequency_limit,
-                                       float* center_frequencies) {
-  assert(lower_frequency_limit >= 0.0f);
-  assert(upper_frequency_limit > lower_frequency_limit);
-
-  const float mel_low = FreqToMel(lower_frequency_limit);
-  const float mel_hi = FreqToMel(upper_frequency_limit);
-  const float mel_span = mel_hi - mel_low;
-  const float mel_spacing = mel_span / ((float)num_channels);
-  int i;
-  for (i = 0; i < num_channels; ++i) {
-    center_frequencies[i] = mel_low + (mel_spacing * (i + 1));
-  }
-}
-
-static void QuantizeFilterbankWeights(const float float_weight,
-                                      int16_t* weight, int16_t* unweight) {
-  *weight = floor(float_weight * (1 << kFilterbankBits) + 0.5);
-  *unweight = floor((1.0 - float_weight) * (1 << kFilterbankBits) + 0.5);
-}
-
-int FilterbankPopulateState(const struct FilterbankConfig* config,
-                            struct FilterbankState* state, int sample_rate,
-                            int spectrum_size) {
-  state->num_channels = config->num_channels;
-  const int num_channels_plus_1 = config->num_channels + 1;
-
-  // How should we align things to index counts given the byte alignment?
-  const int index_alignment =
-      (kFilterbankIndexAlignment < sizeof(int16_t)
-           ? 1
-           : kFilterbankIndexAlignment / sizeof(int16_t));
-
-  state->channel_frequency_starts =
-      malloc(num_channels_plus_1 * sizeof(*state->channel_frequency_starts));
-  state->channel_weight_starts =
-      malloc(num_channels_plus_1 * sizeof(*state->channel_weight_starts));
-  state->channel_widths =
-      malloc(num_channels_plus_1 * sizeof(*state->channel_widths));
-  state->work = malloc(num_channels_plus_1 * sizeof(*state->work));
-
-  float* center_mel_freqs =
-      malloc(num_channels_plus_1 * sizeof(*center_mel_freqs));
-  int16_t* actual_channel_starts =
-      malloc(num_channels_plus_1 * sizeof(*actual_channel_starts));
-  int16_t* actual_channel_widths =
-      malloc(num_channels_plus_1 * sizeof(*actual_channel_widths));
-
-  if (state->channel_frequency_starts == NULL ||
-      state->channel_weight_starts == NULL || state->channel_widths == NULL ||
-      center_mel_freqs == NULL || actual_channel_starts == NULL ||
-      actual_channel_widths == NULL) {
-    free(center_mel_freqs);
-    free(actual_channel_starts);
-    free(actual_channel_widths);
-    fprintf(stderr, "Failed to allocate channel buffers\n");
-    return 0;
-  }
-
-  CalculateCenterFrequencies(num_channels_plus_1, config->lower_band_limit,
-                             config->upper_band_limit, center_mel_freqs);
-
-  // Always exclude DC.
-  const float hz_per_sbin = 0.5 * sample_rate / ((float)spectrum_size - 1);
-  state->start_index = 1.5 + config->lower_band_limit / hz_per_sbin;
-  state->end_index = 0;  // Initialized to zero here, but actually set below.
-
-  // For each channel, we need to figure out what frequencies belong to it, and
-  // how much padding we need to add so that we can efficiently multiply the
-  // weights and unweights for accumulation. To simplify the multiplication
-  // logic, all channels will have some multiplication to do (even if there are
-  // no frequencies that accumulate to that channel) - they will be directed to
-  // a set of zero weights.
-  int chan_freq_index_start = state->start_index;
-  int weight_index_start = 0;
-  int needs_zeros = 0;
-
-  int chan;
-  for (chan = 0; chan < num_channels_plus_1; ++chan) {
-    // Keep jumping frequencies until we overshoot the bound on this channel.
-    int freq_index = chan_freq_index_start;
-    while (FreqToMel((freq_index)*hz_per_sbin) <= center_mel_freqs[chan]) {
-      ++freq_index;
-    }
-
-    const int width = freq_index - chan_freq_index_start;
-    actual_channel_starts[chan] = chan_freq_index_start;
-    actual_channel_widths[chan] = width;
-
-    if (width == 0) {
-      // This channel doesn't actually get anything from the frequencies, it's
-      // always zero. We need then to insert some 'zero' weights into the
-      // output, and just redirect this channel to do a single multiplication
-      // at this point. For simplicity, the zeros are placed at the beginning
-      // of the weights arrays, so we have to go and update all the other
-      // weight_starts to reflect this shift (but only once).
-      state->channel_frequency_starts[chan] = 0;
-      state->channel_weight_starts[chan] = 0;
-      state->channel_widths[chan] = kFilterbankChannelBlockSize;
-      if (!needs_zeros) {
-        needs_zeros = 1;
-        int j;
-        for (j = 0; j < chan; ++j) {
-          state->channel_weight_starts[j] += kFilterbankChannelBlockSize;
-        }
-        weight_index_start += kFilterbankChannelBlockSize;
-      }
-    } else {
-      // How far back do we need to go to ensure that we have the proper
-      // alignment?
-      const int aligned_start =
-          (chan_freq_index_start / index_alignment) * index_alignment;
-      const int aligned_width = (chan_freq_index_start - aligned_start + width);
-      const int padded_width =
-          (((aligned_width - 1) / kFilterbankChannelBlockSize) + 1) *
-          kFilterbankChannelBlockSize;
-
-      state->channel_frequency_starts[chan] = aligned_start;
-      state->channel_weight_starts[chan] = weight_index_start;
-      state->channel_widths[chan] = padded_width;
-      weight_index_start += padded_width;
-    }
-    chan_freq_index_start = freq_index;
-  }
-
-  // Allocate the two arrays to store the weights - weight_index_start
-  // contains the index of what would be the next set of weights that we
-  // would need to add, so that's how many weights we need to allocate.
-  state->weights = calloc(weight_index_start, sizeof(*state->weights));
-  state->unweights = calloc(weight_index_start, sizeof(*state->unweights));
-
-  // If the alloc failed, we also need to nuke the arrays.
-  if (state->weights == NULL || state->unweights == NULL) {
-    free(center_mel_freqs);
-    free(actual_channel_starts);
-    free(actual_channel_widths);
-    fprintf(stderr, "Failed to allocate weights or unweights\n");
-    return 0;
-  }
-
-  // Next pass, compute all the weights. Since everything has been memset to
-  // zero, we only need to fill in the weights that correspond to some
-  // frequency for a channel.
-  const float mel_low = FreqToMel(config->lower_band_limit);
-  for (chan = 0; chan < num_channels_plus_1; ++chan) {
-    int frequency = actual_channel_starts[chan];
-    const int num_frequencies = actual_channel_widths[chan];
-    const int frequency_offset =
-        frequency - state->channel_frequency_starts[chan];
-    const int weight_start = state->channel_weight_starts[chan];
-    const float denom_val = (chan == 0) ? mel_low : center_mel_freqs[chan - 1];
-
-    int j;
-    for (j = 0; j < num_frequencies; ++j, ++frequency) {
-      const float weight =
-          (center_mel_freqs[chan] - FreqToMel(frequency * hz_per_sbin)) /
-          (center_mel_freqs[chan] - denom_val);
-
-      // Make the float into an integer for the weights (and unweights).
-      const int weight_index = weight_start + frequency_offset + j;
-      QuantizeFilterbankWeights(weight, state->weights + weight_index,
-                                state->unweights + weight_index);
-    }
-    if (frequency > state->end_index) {
-      state->end_index = frequency;
-    }
-  }
-
-  free(center_mel_freqs);
-  free(actual_channel_starts);
-  free(actual_channel_widths);
-  if (state->end_index >= spectrum_size) {
-    fprintf(stderr, "Filterbank end_index is above spectrum size.\n");
-    return 0;
-  }
-  return 1;
-}
-
-void FilterbankFreeStateContents(struct FilterbankState* state) {
-  free(state->channel_frequency_starts);
-  free(state->channel_weight_starts);
-  free(state->channel_widths);
-  free(state->weights);
-  free(state->unweights);
-  free(state->work);
-}
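The center-frequency math in FilterbankPopulateState is compact enough to check standalone: channels are spaced evenly on the mel scale between the band limits, mel(f) = 1127 * ln(1 + f/700). A self-contained sketch using the same formula and the default config values from FilterbankFillConfigWithDefaults:

```cpp
#include <cmath>
#include <cstdio>

static float FreqToMel(float freq) {
  return static_cast<float>(1127.0 * std::log1p(freq / 700.0));
}

int main() {
  const int num_channels = 32;              // default num_channels
  const float mel_low = FreqToMel(125.0f);  // default lower_band_limit
  const float mel_hi = FreqToMel(7500.0f);  // default upper_band_limit
  const float mel_spacing = (mel_hi - mel_low) / static_cast<float>(num_channels);
  // Channel i's upper edge sits at mel_low + mel_spacing * (i + 1), matching
  // CalculateCenterFrequencies in the removed file.
  for (int i = 0; i < num_channels; ++i) {
    std::printf("channel %2d: %.1f mel\n", i, mel_low + mel_spacing * (i + 1));
  }
  return 0;
}
```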
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h
deleted file mode 100644
index 781d1024..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct FilterbankConfig {
-  // number of frequency channel buckets for filterbank
-  int num_channels;
-  // maximum frequency to include
-  float upper_band_limit;
-  // minimum frequency to include
-  float lower_band_limit;
-  // unused
-  int output_scale_shift;
-};
-
-// Fills the FilterbankConfig with "sane" defaults.
-void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config);
-
-// Allocates any buffers.
-int FilterbankPopulateState(const struct FilterbankConfig* config,
-                            struct FilterbankState* state, int sample_rate,
-                            int spectrum_size);
-
-// Frees any allocated buffers.
-void FilterbankFreeStateContents(struct FilterbankState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_UTIL_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/frontend.c b/src/tensorflow/lite/experimental/microfrontend/lib/frontend.c
deleted file mode 100644
index 9de2a879..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/frontend.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
-
-#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
-
-struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
-                                             const int16_t* samples,
-                                             size_t num_samples,
-                                             size_t* num_samples_read) {
-  struct FrontendOutput output;
-  output.values = NULL;
-  output.size = 0;
-
-  // Try to apply the window - if it fails, return and wait for more data.
-  if (!WindowProcessSamples(&state->window, samples, num_samples,
-                            num_samples_read)) {
-    return output;
-  }
-
-  // Apply the FFT to the window's output (and scale it so that the fixed
-  // point FFT can have as much resolution as possible).
-  int input_shift =
-      15 - MostSignificantBit32(state->window.max_abs_output_value);
-  FftCompute(&state->fft, state->window.output, input_shift);
-
-  // We can re-use the fft's output buffer to hold the energy.
-  int32_t* energy = (int32_t*)state->fft.output;
-
-  FilterbankConvertFftComplexToEnergy(&state->filterbank, state->fft.output,
-                                      energy);
-
-  FilterbankAccumulateChannels(&state->filterbank, energy);
-  uint32_t* scaled_filterbank = FilterbankSqrt(&state->filterbank, input_shift);
-
-  // Apply noise reduction.
-  NoiseReductionApply(&state->noise_reduction, scaled_filterbank);
-
-  if (state->pcan_gain_control.enable_pcan) {
-    PcanGainControlApply(&state->pcan_gain_control, scaled_filterbank);
-  }
-
-  // Apply the log and scale.
-  int correction_bits =
-      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
-  uint16_t* logged_filterbank =
-      LogScaleApply(&state->log_scale, scaled_filterbank,
-                    state->filterbank.num_channels, correction_bits);
-
-  output.size = state->filterbank.num_channels;
-  output.values = logged_filterbank;
-  return output;
-}
-
-void FrontendReset(struct FrontendState* state) {
-  WindowReset(&state->window);
-  FftReset(&state->fft);
-  FilterbankReset(&state->filterbank);
-  NoiseReductionReset(&state->noise_reduction);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/frontend.h b/src/tensorflow/lite/experimental/microfrontend/lib/frontend.h
deleted file mode 100644
index 883df5fd..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/frontend.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct FrontendState {
-  struct WindowState window;
-  struct FftState fft;
-  struct FilterbankState filterbank;
-  struct NoiseReductionState noise_reduction;
-  struct PcanGainControlState pcan_gain_control;
-  struct LogScaleState log_scale;
-};
-
-struct FrontendOutput {
-  const uint16_t* values;
-  size_t size;
-};
-
-// Main entry point to processing frontend samples. Updates num_samples_read
-// to contain the number of samples that have been consumed from the input
-// array. Returns a struct containing the generated output. If not enough
-// samples were added to generate a feature vector, the returned size will be
-// 0 and the values pointer will be NULL. Note that the output pointer will be
-// invalidated as soon as FrontendProcessSamples is called again, so copy the
-// contents elsewhere if you need to use them later.
-struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
-                                             const int16_t* samples,
-                                             size_t num_samples,
-                                             size_t* num_samples_read);
-
-void FrontendReset(struct FrontendState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_H_
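Before this removal, driving the frontend looked roughly like the sketch below. The loop matters because FrontendProcessSamples may consume only part of the input and returns an empty output until a full window has accumulated; the wrapper function here is invented, the API is from the removed headers.

```cpp
#include <cstddef>
#include <cstdint>
#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"

// `state` is assumed populated via FrontendFillConfigWithDefaults +
// FrontendPopulateState (see frontend_util.c below).
void ExtractFeatures(struct FrontendState* state, const int16_t* audio,
                     size_t num_samples) {
  while (num_samples > 0) {
    size_t num_read = 0;
    struct FrontendOutput out =
        FrontendProcessSamples(state, audio, num_samples, &num_read);
    audio += num_read;
    num_samples -= num_read;
    if (out.size > 0) {
      // out.values is invalidated by the next call, so copy the feature
      // vector here if it is needed later.
    }
  }
}
```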
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
-
-#include <stdio.h>
-#include <string.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
-
-void FrontendFillConfigWithDefaults(struct FrontendConfig* config) {
-  WindowFillConfigWithDefaults(&config->window);
-  FilterbankFillConfigWithDefaults(&config->filterbank);
-  NoiseReductionFillConfigWithDefaults(&config->noise_reduction);
-  PcanGainControlFillConfigWithDefaults(&config->pcan_gain_control);
-  LogScaleFillConfigWithDefaults(&config->log_scale);
-}
-
-int FrontendPopulateState(const struct FrontendConfig* config,
-                          struct FrontendState* state, int sample_rate) {
-  memset(state, 0, sizeof(*state));
-
-  if (!WindowPopulateState(&config->window, &state->window, sample_rate)) {
-    fprintf(stderr, "Failed to populate window state\n");
-    return 0;
-  }
-
-  if (!FftPopulateState(&state->fft, state->window.size)) {
-    fprintf(stderr, "Failed to populate fft state\n");
-    return 0;
-  }
-  FftInit(&state->fft);
-
-  if (!FilterbankPopulateState(&config->filterbank, &state->filterbank,
-                               sample_rate, state->fft.fft_size / 2 + 1)) {
-    fprintf(stderr, "Failed to populate filterbank state\n");
-    return 0;
-  }
-
-  if (!NoiseReductionPopulateState(&config->noise_reduction,
-                                   &state->noise_reduction,
-                                   state->filterbank.num_channels)) {
-    fprintf(stderr, "Failed to populate noise reduction state\n");
-    return 0;
-  }
-
-  int input_correction_bits =
-      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
-  if (!PcanGainControlPopulateState(
-          &config->pcan_gain_control, &state->pcan_gain_control,
-          state->noise_reduction.estimate, state->filterbank.num_channels,
-          state->noise_reduction.smoothing_bits, input_correction_bits)) {
-    fprintf(stderr, "Failed to populate pcan gain control state\n");
-    return 0;
-  }
-
-  if (!LogScalePopulateState(&config->log_scale, &state->log_scale)) {
-    fprintf(stderr, "Failed to populate log scale state\n");
-    return 0;
-  }
-
-  FrontendReset(state);
-
-  // All good, return a true value.
-  return 1;
-}
-
-void FrontendFreeStateContents(struct FrontendState* state) {
-  WindowFreeStateContents(&state->window);
-  FftFreeStateContents(&state->fft);
-  FilterbankFreeStateContents(&state->filterbank);
-  NoiseReductionFreeStateContents(&state->noise_reduction);
-  PcanGainControlFreeStateContents(&state->pcan_gain_control);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h
deleted file mode 100644
index 895ce6cd..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct FrontendConfig {
-  struct WindowConfig window;
-  struct FilterbankConfig filterbank;
-  struct NoiseReductionConfig noise_reduction;
-  struct PcanGainControlConfig pcan_gain_control;
-  struct LogScaleConfig log_scale;
-};
-
-// Fills the FrontendConfig with "sane" defaults.
-void FrontendFillConfigWithDefaults(struct FrontendConfig* config);
-
-// Allocates any buffers.
-int FrontendPopulateState(const struct FrontendConfig* config,
-                          struct FrontendState* state, int sample_rate);
-
-// Frees any allocated buffers.
-void FrontendFreeStateContents(struct FrontendState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_UTIL_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h b/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h
deleted file mode 100644
index f704677d..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_COMMON_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_COMMON_H_
-
-// This header file should be included in all variants of kiss_fft_$type.{h,cc}
-// so that their sub-included source files do not mistakenly wrap libc header
-// files within their kissfft_$type namespaces.
-// E.g., this header avoids kissfft_int16.h containing:
-//   namespace kiss_fft_int16 {
-//   #include "third_party/kissfft/kiss_fft.h"
-//   }
-// where kiss_fft.h contains:
-//   #include <math.h>
-//
-// TRICK: By including the following header files here, their preprocessor
-// header guards prevent them being re-defined inside of the kiss_fft_$type
-// namespaces declared within the kiss_fft_$type.{h,cc} sources.
-// Note that the original kiss_fft*.h files are untouched since they
-// may be used in libraries that include them directly.
-
-#include <limits.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#ifdef FIXED_POINT
-#include <sys/types.h>
-#endif
-
-#ifdef USE_SIMD
-#include <xmmintrin.h>
-#endif
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_COMMON_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.cpp b/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.cpp
deleted file mode 100644
index 54630661..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include
-
-#include "tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h"
-
-#define FIXED_POINT 16
-namespace kissfft_fixed16 {
-#include "third_party/kissfft/kiss_fft.c"
-#include "third_party/kissfft/tools/kiss_fftr.c"
-}  // namespace kissfft_fixed16
-#undef FIXED_POINT
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h b/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h
deleted file mode 100644
index 380307a4..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_INT16_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_INT16_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h"
-
-// Wrap 16-bit kiss fft in its own namespace. Enables us to link an application
-// with different kiss fft resolutions (16/32 bit integer, float, double)
-// without getting a linker error.
-#define FIXED_POINT 16
-namespace kissfft_fixed16 {
-#include "third_party/kissfft/kiss_fft.h"
-#include "third_party/kissfft/tools/kiss_fftr.h"
-}  // namespace kissfft_fixed16
-#undef FIXED_POINT
-#undef kiss_fft_scalar
-#undef KISS_FFT_H
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_INT16_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.c b/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.c
deleted file mode 100644
index f59618e0..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/log_lut.h"
-const uint16_t kLogLut[]
-#ifndef _MSC_VER
-    __attribute__((aligned(4)))
-#endif  // _MSC_VER
-    = {0,    224,  442,  654,  861,  1063, 1259, 1450, 1636, 1817, 1992, 2163,
-       2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848,
-       3960, 4068, 4172, 4272, 4368, 4460, 4549, 4633, 4714, 4791, 4864, 4934,
-       5001, 5063, 5123, 5178, 5231, 5280, 5326, 5368, 5408, 5444, 5477, 5507,
-       5533, 5557, 5578, 5595, 5610, 5622, 5631, 5637, 5640, 5641, 5638, 5633,
-       5626, 5615, 5602, 5586, 5568, 5547, 5524, 5498, 5470, 5439, 5406, 5370,
-       5332, 5291, 5249, 5203, 5156, 5106, 5054, 5000, 4944, 4885, 4825, 4762,
-       4697, 4630, 4561, 4490, 4416, 4341, 4264, 4184, 4103, 4020, 3935, 3848,
-       3759, 3668, 3575, 3481, 3384, 3286, 3186, 3084, 2981, 2875, 2768, 2659,
-       2549, 2437, 2323, 2207, 2090, 1971, 1851, 1729, 1605, 1480, 1353, 1224,
-       1094, 963,  830,  695,  559,  421,  282,  142,  0,    0};
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.h b/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.h
deleted file mode 100644
index b2448a32..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_LUT_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_LUT_H_
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Number of segments in the log lookup table. The table will be kLogSegments+1
-// in length (with some padding).
-#define kLogSegments 128
-#define kLogSegmentsLog2 7
-
-// Scale used by lookup table.
-#define kLogScale 65536
-#define kLogScaleLog2 16
-#define kLogCoeff 45426
-
-extern const uint16_t kLogLut[];
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_LUT_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.c b/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
deleted file mode 100644
index c27a50a6..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/ -#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h" - -#include "tensorflow/lite/experimental/microfrontend/lib/bits.h" -#include "tensorflow/lite/experimental/microfrontend/lib/log_lut.h" - -#define kuint16max 0x0000FFFF - -// The following functions implement integer logarithms of various sizes. The -// approximation is calculated according to method described in -// www.inti.gob.ar/electronicaeinformatica/instrumentacion/utic/ -// publicaciones/SPL2007/Log10-spl07.pdf -// It first calculates log2 of the input and then converts it to natural -// logarithm. - -static uint32_t Log2FractionPart(const uint32_t x, const uint32_t log2x) { - // Part 1 - int32_t frac = x - (1LL << log2x); - if (log2x < kLogScaleLog2) { - frac <<= kLogScaleLog2 - log2x; - } else { - frac >>= log2x - kLogScaleLog2; - } - // Part 2 - const uint32_t base_seg = frac >> (kLogScaleLog2 - kLogSegmentsLog2); - const uint32_t seg_unit = - (((uint32_t)1) << kLogScaleLog2) >> kLogSegmentsLog2; - - const int32_t c0 = kLogLut[base_seg]; - const int32_t c1 = kLogLut[base_seg + 1]; - const int32_t seg_base = seg_unit * base_seg; - const int32_t rel_pos = ((c1 - c0) * (frac - seg_base)) >> kLogScaleLog2; - return frac + c0 + rel_pos; -} - -static uint32_t Log(const uint32_t x, const uint32_t scale_shift) { - const uint32_t integer = MostSignificantBit32(x) - 1; - const uint32_t fraction = Log2FractionPart(x, integer); - const uint32_t log2 = (integer << kLogScaleLog2) + fraction; - const uint32_t round = kLogScale / 2; - const uint32_t loge = (((uint64_t)kLogCoeff) * log2 + round) >> kLogScaleLog2; - // Finally scale to our output scale - const uint32_t loge_scaled = ((loge << scale_shift) + round) >> kLogScaleLog2; - return loge_scaled; -} - -uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal, - int signal_size, int correction_bits) { - const int scale_shift = state->scale_shift; - uint16_t* output = (uint16_t*)signal; - uint16_t* ret = output; - int i; - for (i = 0; i < signal_size; ++i) { - uint32_t value = *signal++; - if (state->enable_log) { - if (correction_bits < 0) { - value >>= -correction_bits; - } else { - value <<= correction_bits; - } - if (value > 1) { - value = Log(value, scale_shift); - } else { - value = 0; - } - } - *output++ = (value < kuint16max) ? value : kuint16max; - } - return ret; -} diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.h b/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.h deleted file mode 100644 index a383f32f..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
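The deleted Log() above computes a fixed-point log2 and converts it to a natural logarithm via kLogCoeff. A quick standalone check of that constant and of the float-domain identity it relies on (a sketch; the test value 12345 is arbitrary):

    #include <math.h>
    #include <stdio.h>

    // kLogCoeff is ln(2) in Q16: round(log(2) * 65536) == 45426, so
    // Log(x, shift) approximates ln(x) scaled by 2^shift.
    int main(void) {
      const double kLogScale = 65536.0;
      printf("round(ln(2)*65536) = %.0f (kLogCoeff = 45426)\n",
             floor(log(2.0) * kLogScale + 0.5));
      // Float reference for the integer pipeline: log2(x) * ln(2) == ln(x).
      const double x = 12345.0;
      printf("ln(%g) = %f, log2(x)*ln(2) = %f\n", x, log(x),
             log2(x) * log(2.0));
      return 0;
    }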
-==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_H_ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct LogScaleState { - int enable_log; - int scale_shift; -}; - -// Applies a fixed point logarithm to the signal and converts it to 16 bit. Note -// that the signal array will be modified. -uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal, - int signal_size, int correction_bits); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_H_ diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.c deleted file mode 100644 index 0e3dd1d1..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.c +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h" - -void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config) { - config->enable_log = 1; - config->scale_shift = 6; -} - -int LogScalePopulateState(const struct LogScaleConfig* config, - struct LogScaleState* state) { - state->enable_log = config->enable_log; - state->scale_shift = config->scale_shift; - return 1; -} diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h deleted file mode 100644 index 11f7d9ee..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_UTIL_H_
-
-#include
-#include
-
-#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct LogScaleConfig {
-  // set to false (0) to disable this module
-  int enable_log;
-  // scale results by 2^(scale_shift)
-  int scale_shift;
-};
-
-// Populates the LogScaleConfig with "sane" default values.
-void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config);
-
-// Allocates any buffers.
-int LogScalePopulateState(const struct LogScaleConfig* config,
-                          struct LogScaleState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_UTIL_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.c b/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.c
deleted file mode 100644
index 16b30e66..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
-
-#include <string.h>
-
-void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal) {
-  int i;
-  for (i = 0; i < state->num_channels; ++i) {
-    const uint32_t smoothing =
-        ((i & 1) == 0) ? state->even_smoothing : state->odd_smoothing;
-    const uint32_t one_minus_smoothing = (1 << kNoiseReductionBits) - smoothing;
-
-    // Update the estimate of the noise.
-    const uint32_t signal_scaled_up = signal[i] << state->smoothing_bits;
-    uint32_t estimate =
-        (((uint64_t)signal_scaled_up * smoothing) +
-         ((uint64_t)state->estimate[i] * one_minus_smoothing)) >>
-        kNoiseReductionBits;
-    state->estimate[i] = estimate;
-
-    // Make sure that we can't get a negative value for the signal - estimate.
-    if (estimate > signal_scaled_up) {
-      estimate = signal_scaled_up;
-    }
-
-    const uint32_t floor =
-        ((uint64_t)signal[i] * state->min_signal_remaining) >>
-        kNoiseReductionBits;
-    const uint32_t subtracted =
-        (signal_scaled_up - estimate) >> state->smoothing_bits;
-    const uint32_t output = subtracted > floor ? subtracted : floor;
-    signal[i] = output;
-  }
-}
-
-void NoiseReductionReset(struct NoiseReductionState* state) {
-  memset(state->estimate, 0, sizeof(*state->estimate) * state->num_channels);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h b/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h
deleted file mode 100644
index 46d3f52e..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_H_ - -#define kNoiseReductionBits 14 - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct NoiseReductionState { - int smoothing_bits; - uint16_t even_smoothing; - uint16_t odd_smoothing; - uint16_t min_signal_remaining; - int num_channels; - uint32_t* estimate; -}; - -// Removes stationary noise from each channel of the signal using a low pass -// filter. -void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal); - -void NoiseReductionReset(struct NoiseReductionState* state); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_H_ diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.c deleted file mode 100644 index a6c9234e..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.c +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
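The estimate update in NoiseReductionApply() above is a per-channel exponential moving average in Q(kNoiseReductionBits) = Q14. A float-domain sketch of the same update, using the default coefficients set by the util code that follows (the signal values are made up):

    #include <stdio.h>

    // Float reference for the Q14 noise-estimate update used above:
    //   estimate' = signal * smoothing + estimate * (1 - smoothing)
    // with smoothing = 0.025 (even channels) or 0.06 (odd channels).
    int main(void) {
      double estimate = 0.0;
      const double smoothing = 0.025;  // even-channel default
      const double signal[] = {100.0, 100.0, 100.0, 100.0};
      for (int i = 0; i < 4; ++i) {
        estimate = signal[i] * smoothing + estimate * (1.0 - smoothing);
        printf("step %d: estimate = %f\n", i, estimate);
      }
      // The fixed-point code stores smoothing * (1 << 14), truncated: 409 here.
      printf("Q14 coefficient for 0.025: %d\n", (int)(0.025 * (1 << 14)));
      return 0;
    }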
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
-
-#include <stdio.h>
-
-void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config) {
-  config->smoothing_bits = 10;
-  config->even_smoothing = 0.025;
-  config->odd_smoothing = 0.06;
-  config->min_signal_remaining = 0.05;
-}
-
-int NoiseReductionPopulateState(const struct NoiseReductionConfig* config,
-                                struct NoiseReductionState* state,
-                                int num_channels) {
-  state->smoothing_bits = config->smoothing_bits;
-  state->odd_smoothing = config->odd_smoothing * (1 << kNoiseReductionBits);
-  state->even_smoothing = config->even_smoothing * (1 << kNoiseReductionBits);
-  state->min_signal_remaining =
-      config->min_signal_remaining * (1 << kNoiseReductionBits);
-  state->num_channels = num_channels;
-  state->estimate = calloc(state->num_channels, sizeof(*state->estimate));
-  if (state->estimate == NULL) {
-    fprintf(stderr, "Failed to alloc estimate buffer\n");
-    return 0;
-  }
-  return 1;
-}
-
-void NoiseReductionFreeStateContents(struct NoiseReductionState* state) {
-  free(state->estimate);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h
deleted file mode 100644
index fa555391..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct NoiseReductionConfig {
-  // scale the signal up by 2^(smoothing_bits) before reduction
-  int smoothing_bits;
-  // smoothing coefficient for even-numbered channels
-  float even_smoothing;
-  // smoothing coefficient for odd-numbered channels
-  float odd_smoothing;
-  // fraction of signal to preserve (1.0 disables this module)
-  float min_signal_remaining;
-};
-
-// Populates the NoiseReductionConfig with "sane" default values.
-void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config);
-
-// Allocates any buffers.
-int NoiseReductionPopulateState(const struct NoiseReductionConfig* config,
-                                struct NoiseReductionState* state,
-                                int num_channels);
-
-// Frees any allocated buffers.
-void NoiseReductionFreeStateContents(struct NoiseReductionState* state); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_UTIL_H_ diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c b/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c deleted file mode 100644 index 22d58767..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h" - -#include "tensorflow/lite/experimental/microfrontend/lib/bits.h" - -int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut) { - if (x <= 2) { - return lut[x]; - } - - const int16_t interval = MostSignificantBit32(x); - lut += 4 * interval - 6; - - const int16_t frac = - ((interval < 11) ? (x << (11 - interval)) : (x >> (interval - 11))) & - 0x3FF; - - int32_t result = ((int32_t)lut[2] * frac) >> 5; - result += (int32_t)((uint32_t)lut[1] << 5); - result *= frac; - result = (result + (1 << 14)) >> 15; - result += lut[0]; - return (int16_t)result; -} - -uint32_t PcanShrink(const uint32_t x) { - if (x < (2 << kPcanSnrBits)) { - return (x * x) >> (2 + 2 * kPcanSnrBits - kPcanOutputBits); - } else { - return (x >> (kPcanSnrBits - kPcanOutputBits)) - (1 << kPcanOutputBits); - } -} - -void PcanGainControlApply(struct PcanGainControlState* state, - uint32_t* signal) { - int i; - for (i = 0; i < state->num_channels; ++i) { - const uint32_t gain = - WideDynamicFunction(state->noise_estimate[i], state->gain_lut); - const uint32_t snr = ((uint64_t)signal[i] * gain) >> state->snr_shift; - signal[i] = PcanShrink(snr); - } -} diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h b/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h deleted file mode 100644 index 3f6222be..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_H_ - -#include -#include - -#define kPcanSnrBits 12 -#define kPcanOutputBits 6 - -#ifdef __cplusplus -extern "C" { -#endif - -// Details at https://research.google/pubs/pub45911.pdf -struct PcanGainControlState { - int enable_pcan; - uint32_t* noise_estimate; - int num_channels; - int16_t* gain_lut; - int32_t snr_shift; -}; - -int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut); - -uint32_t PcanShrink(const uint32_t x); - -void PcanGainControlApply(struct PcanGainControlState* state, uint32_t* signal); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_H_ diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c deleted file mode 100644 index e850d439..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
-
-#include <math.h>
-#include <stdio.h>
-
-#define kint16max 0x00007FFF
-
-void PcanGainControlFillConfigWithDefaults(
-    struct PcanGainControlConfig* config) {
-  config->enable_pcan = 0;
-  config->strength = 0.95;
-  config->offset = 80.0;
-  config->gain_bits = 21;
-}
-
-int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
-                               int32_t input_bits, uint32_t x) {
-  const float x_as_float = ((float)x) / ((uint32_t)1 << input_bits);
-  const float gain_as_float =
-      ((uint32_t)1 << config->gain_bits) *
-      powf(x_as_float + config->offset, -config->strength);
-
-  if (gain_as_float > kint16max) {
-    return kint16max;
-  }
-  return (int16_t)(gain_as_float + 0.5f);
-}
-
-int PcanGainControlPopulateState(const struct PcanGainControlConfig* config,
-                                 struct PcanGainControlState* state,
-                                 uint32_t* noise_estimate,
-                                 const int num_channels,
-                                 const uint16_t smoothing_bits,
-                                 const int32_t input_correction_bits) {
-  state->enable_pcan = config->enable_pcan;
-  if (!state->enable_pcan) {
-    return 1;
-  }
-  state->noise_estimate = noise_estimate;
-  state->num_channels = num_channels;
-  state->gain_lut = malloc(kWideDynamicFunctionLUTSize * sizeof(int16_t));
-  if (state->gain_lut == NULL) {
-    fprintf(stderr, "Failed to allocate gain LUT\n");
-    return 0;
-  }
-  state->snr_shift = config->gain_bits - input_correction_bits - kPcanSnrBits;
-
-  const int32_t input_bits = smoothing_bits - input_correction_bits;
-  state->gain_lut[0] = PcanGainLookupFunction(config, input_bits, 0);
-  state->gain_lut[1] = PcanGainLookupFunction(config, input_bits, 1);
-  state->gain_lut -= 6;
-  int interval;
-  for (interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
-    const uint32_t x0 = (uint32_t)1 << (interval - 1);
-    const uint32_t x1 = x0 + (x0 >> 1);
-    const uint32_t x2 =
-        (interval == kWideDynamicFunctionBits) ? x0 + (x0 - 1) : 2 * x0;
-
-    const int16_t y0 = PcanGainLookupFunction(config, input_bits, x0);
-    const int16_t y1 = PcanGainLookupFunction(config, input_bits, x1);
-    const int16_t y2 = PcanGainLookupFunction(config, input_bits, x2);
-
-    const int32_t diff1 = (int32_t)y1 - y0;
-    const int32_t diff2 = (int32_t)y2 - y0;
-    const int32_t a1 = 4 * diff1 - diff2;
-    const int32_t a2 = diff2 - a1;
-
-    state->gain_lut[4 * interval] = y0;
-    state->gain_lut[4 * interval + 1] = (int16_t)a1;
-    state->gain_lut[4 * interval + 2] = (int16_t)a2;
-  }
-  state->gain_lut += 6;
-  return 1;
-}
-
-void PcanGainControlFreeStateContents(struct PcanGainControlState* state) {
-  free(state->gain_lut);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h
deleted file mode 100644
index d4bfaa2e..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
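PcanGainLookupFunction() above samples the gain curve gain = 2^gain_bits * (x / 2^input_bits + offset)^(-strength), which the LUT then interpolates. A direct float evaluation for a few inputs, using the defaults set above (input_bits = 10 is an assumed value for illustration; the real value depends on the smoothing and correction bits computed by the caller):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    // Float view of the PCAN gain curve with the default config
    // (strength 0.95, offset 80, gain_bits 21).
    int main(void) {
      const float strength = 0.95f;
      const float offset = 80.0f;
      const int gain_bits = 21;
      const int input_bits = 10;  // illustrative assumption
      for (uint32_t x = 0; x <= 4096; x += 1024) {
        const float x_scaled = (float)x / (float)(1u << input_bits);
        const float gain =
            (float)(1u << gain_bits) * powf(x_scaled + offset, -strength);
        printf("x=%u gain=%f\n", x, gain);
      }
      // Values above INT16_MAX are clamped by the real code, as at x=0 here.
      return 0;
    }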
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
-
-#define kWideDynamicFunctionBits 32
-#define kWideDynamicFunctionLUTSize (4 * kWideDynamicFunctionBits - 3)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct PcanGainControlConfig {
-  // set to false (0) to disable this module
-  int enable_pcan;
-  // gain normalization exponent (0.0 disables, 1.0 full strength)
-  float strength;
-  // positive value added in the normalization denominator
-  float offset;
-  // number of fractional bits in the gain
-  int gain_bits;
-};
-
-void PcanGainControlFillConfigWithDefaults(
-    struct PcanGainControlConfig* config);
-
-int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
-                               int32_t input_bits, uint32_t x);
-
-int PcanGainControlPopulateState(const struct PcanGainControlConfig* config,
-                                 struct PcanGainControlState* state,
-                                 uint32_t* noise_estimate,
-                                 const int num_channels,
-                                 const uint16_t smoothing_bits,
-                                 const int32_t input_correction_bits);
-
-void PcanGainControlFreeStateContents(struct PcanGainControlState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_UTIL_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/window.c b/src/tensorflow/lite/experimental/microfrontend/lib/window.c
deleted file mode 100644
index 10da6762..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/window.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
-
-#include <string.h>
-
-int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
-                         size_t num_samples, size_t* num_samples_read) {
-  const int size = state->size;
-
-  // Copy samples from the samples buffer over to our local input.
-  size_t max_samples_to_copy = state->size - state->input_used;
-  if (max_samples_to_copy > num_samples) {
-    max_samples_to_copy = num_samples;
-  }
-  memcpy(state->input + state->input_used, samples,
-         max_samples_to_copy * sizeof(*samples));
-  *num_samples_read = max_samples_to_copy;
-  state->input_used += max_samples_to_copy;
-
-  if (state->input_used < state->size) {
-    // We don't have enough samples to compute a window.
-    return 0;
-  }
-
-  // Apply the window to the input.
-  const int16_t* coefficients = state->coefficients;
-  const int16_t* input = state->input;
-  int16_t* output = state->output;
-  int i;
-  int16_t max_abs_output_value = 0;
-  for (i = 0; i < size; ++i) {
-    int16_t new_value =
-        (((int32_t)*input++) * *coefficients++) >> kFrontendWindowBits;
-    *output++ = new_value;
-    if (new_value < 0) {
-      new_value = -new_value;
-    }
-    if (new_value > max_abs_output_value) {
-      max_abs_output_value = new_value;
-    }
-  }
-  // Shuffle the input down by the step size, and update how much we have used.
-  memmove(state->input, state->input + state->step,
-          sizeof(*state->input) * (state->size - state->step));
-  state->input_used -= state->step;
-  state->max_abs_output_value = max_abs_output_value;
-
-  // Indicate that the output buffer is valid for the next stage.
-  return 1;
-}
-
-void WindowReset(struct WindowState* state) {
-  memset(state->input, 0, state->size * sizeof(*state->input));
-  memset(state->output, 0, state->size * sizeof(*state->output));
-  state->input_used = 0;
-  state->max_abs_output_value = 0;
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/window.h b/src/tensorflow/lite/experimental/microfrontend/lib/window.h
deleted file mode 100644
index bad81514..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/window.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#define kFrontendWindowBits 12
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct WindowState {
-  size_t size;
-  int16_t* coefficients;
-  size_t step;
-
-  int16_t* input;
-  size_t input_used;
-  int16_t* output;
-  int16_t max_abs_output_value;
-};
-
-// Applies a window to the samples coming in, stepping forward at the given
-// rate.
-int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
-                         size_t num_samples, size_t* num_samples_read);
-
-void WindowReset(struct WindowState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/window_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/window_util.c
deleted file mode 100644
index eee6e7b5..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/window_util.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-// Some platforms don't have M_PI
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-void WindowFillConfigWithDefaults(struct WindowConfig* config) {
-  config->size_ms = 25;
-  config->step_size_ms = 10;
-}
-
-int WindowPopulateState(const struct WindowConfig* config,
-                        struct WindowState* state, int sample_rate) {
-  state->size = config->size_ms * sample_rate / 1000;
-  state->step = config->step_size_ms * sample_rate / 1000;
-
-  state->coefficients = malloc(state->size * sizeof(*state->coefficients));
-  if (state->coefficients == NULL) {
-    fprintf(stderr, "Failed to allocate window coefficients\n");
-    return 0;
-  }
-
-  // Populate the window values.
-  const float arg = M_PI * 2.0 / ((float)state->size);
-  int i;
-  for (i = 0; i < state->size; ++i) {
-    float float_value = 0.5 - (0.5 * cos(arg * (i + 0.5)));
-    // Scale it to fixed point and round it.
-    state->coefficients[i] =
-        floor(float_value * (1 << kFrontendWindowBits) + 0.5);
-  }
-
-  state->input_used = 0;
-  state->input = malloc(state->size * sizeof(*state->input));
-  if (state->input == NULL) {
-    fprintf(stderr, "Failed to allocate window input\n");
-    return 0;
-  }
-
-  state->output = malloc(state->size * sizeof(*state->output));
-  if (state->output == NULL) {
-    fprintf(stderr, "Failed to allocate window output\n");
-    return 0;
-  }
-
-  return 1;
-}
-
-void WindowFreeStateContents(struct WindowState* state) {
-  free(state->coefficients);
-  free(state->input);
-  free(state->output);
}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/window_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/window_util.h
deleted file mode 100644
index 68e4de9e..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/window_util.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct WindowConfig {
-  // length of window frame in milliseconds
-  size_t size_ms;
-  // length of step for next frame in milliseconds
-  size_t step_size_ms;
-};
-
-// Populates the WindowConfig with "sane" default values.
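WindowPopulateState() above builds a Hann window in Q(kFrontendWindowBits) = Q12. A float sketch that recomputes a few coefficients the same way (size 400 corresponds to the default 25 ms frame at an assumed 16 kHz sample rate):

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    // coeff[i] = floor((0.5 - 0.5*cos(2*pi*(i + 0.5)/size)) * (1 << 12) + 0.5)
    int main(void) {
      const int size = 400;  // 25 ms at 16 kHz (assumed)
      const float arg = (float)(M_PI * 2.0 / size);
      for (int i = 0; i < size; i += 100) {
        const float w = 0.5f - 0.5f * cosf(arg * (i + 0.5f));
        printf("coeff[%d] = %d\n", i, (int)floorf(w * (1 << 12) + 0.5f));
      }
      return 0;
    }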
-void WindowFillConfigWithDefaults(struct WindowConfig* config);
-
-// Allocates any buffers.
-int WindowPopulateState(const struct WindowConfig* config,
-                        struct WindowState* state, int sample_rate);
-
-// Frees any allocated buffers.
-void WindowFreeStateContents(struct WindowState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_UTIL_H_
diff --git a/src/tensorflow/lite/kernels/internal/common.cpp b/src/tensorflow/lite/kernels/internal/common.cpp
new file mode 100644
index 00000000..1654ab84
--- /dev/null
+++ b/src/tensorflow/lite/kernels/internal/common.cpp
@@ -0,0 +1,55 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+
+int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
+                                      int shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
+                                 x * (1 << left_shift), quantized_multiplier),
+                             right_shift);
+}
+
+int32_t MultiplyByQuantizedMultiplier(int64_t x, int32_t quantized_multiplier,
+                                      int shift) {
+  // Inputs:
+  // - quantized_multiplier has fixed point at bit 31
+  // - shift is -31 to +7 (negative for right shift)
+  //
+  // Assumptions: The following input ranges are assumed
+  // - quantize_scale>=0 (the usual range is (1<<30) to (1<<31)-1)
+  // - scaling is chosen so final scaled result fits in int32_t
+  // - input x is in the range -(1<<47) <= x < (1<<47)
+  assert(quantized_multiplier >= 0);
+  assert(shift >= -31 && shift < 8);
+  assert(x >= -(static_cast<int64_t>(1) << 47) &&
+         x < (static_cast<int64_t>(1) << 47));
+
+  int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
+                                   ? ((quantized_multiplier + (1 << 15)) >> 16)
                                   : 0x7FFF;
+  int total_shift = 15 - shift;
+  x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
+  int32_t result = x >> total_shift;
+  return result;
+}
+
+}  // namespace tflite
diff --git a/src/tensorflow/lite/kernels/internal/common.h b/src/tensorflow/lite/kernels/internal/common.h
index 7d38cd14..a9acb4f2 100644
--- a/src/tensorflow/lite/kernels/internal/common.h
+++ b/src/tensorflow/lite/kernels/internal/common.h
@@ -26,6 +26,7 @@ limitations under the License.
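The 64-bit MultiplyByQuantizedMultiplier() added above computes approximately round(x * m * 2^shift / 2^31), after reducing the Q31 multiplier to 16 bits. A double-precision cross-check (a sketch; the sample inputs are arbitrary but inside the documented ranges):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    // Mirror of the int64_t overload added above, for comparison against a
    // double reference of x * m * 2^shift / 2^31.
    static int32_t MultiplyByQuantizedMultiplier64(int64_t x, int32_t m,
                                                   int shift) {
      int32_t reduced = (m < 0x7FFF0000) ? ((m + (1 << 15)) >> 16) : 0x7FFF;
      int total_shift = 15 - shift;
      x = (x * (int64_t)reduced) + ((int64_t)1 << (total_shift - 1));
      return (int32_t)(x >> total_shift);
    }

    int main(void) {
      const int64_t x = 123456789LL;  // within +/- 2^47
      const int32_t m = 1517882343;   // ~0.7068 in Q31
      const int shift = -7;
      const double ref = (double)x * m * pow(2.0, shift) / 2147483648.0;
      printf("fixed=%d double=%f\n",
             MultiplyByQuantizedMultiplier64(x, m, shift), ref);
      return 0;
    }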
#include
#include "third_party/gemmlowp/fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/core/macros.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/types.h"
@@ -250,42 +251,11 @@ inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
       quantized_multiplier);
 }
-inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  using gemmlowp::RoundingDivideByPOT;
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  int left_shift = shift > 0 ? shift : 0;
-  int right_shift = shift > 0 ? 0 : -shift;
-  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
-                                 x * (1 << left_shift), quantized_multiplier),
-                             right_shift);
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int32_t x, int32_t quantized_multiplier, int shift);
-inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  // Inputs:
-  // - quantized_multiplier has fixed point at bit 31
-  // - shift is -31 to +7 (negative for right shift)
-  //
-  // Assumptions: The following input ranges are assumed
-  // - quantize_scale>=0 (the usual range is (1<<30) to (1<<31)-1)
-  // - scaling is chosen so final scaled result fits in int32_t
-  // - input x is in the range -(1<<47) <= x < (1<<47)
-  assert(quantized_multiplier >= 0);
-  assert(shift >= -31 && shift < 8);
-  assert(x >= -(static_cast<int64_t>(1) << 47) &&
-         x < (static_cast<int64_t>(1) << 47));
-
-  int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
-                                   ? ((quantized_multiplier + (1 << 15)) >> 16)
-                                   : 0x7FFF;
-  int total_shift = 15 - shift;
-  x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
-  int32_t result = x >> total_shift;
-  return result;
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int64_t x, int32_t quantized_multiplier, int shift);
 #ifdef USE_NEON
 // Round uses ARM's rounding shift right.
@@ -328,14 +298,16 @@ template <typename T>
 int CountLeadingZeros(T integer_input) {
   static_assert(std::is_unsigned<T>::value,
                 "Only unsigned integer types handled.");
-#if defined(__GNUC__)
-  return integer_input ? __builtin_clz(integer_input)
-                       : std::numeric_limits<T>::digits;
-#else
   if (integer_input == 0) {
     return std::numeric_limits<T>::digits;
   }
-
+#if defined(__GNUC__)
+  if (std::is_same<T, uint32_t>::value) {
+    return __builtin_clz(integer_input);
+  } else if (std::is_same<T, uint64_t>::value) {
+    return __builtin_clzll(integer_input);
+  }
+#endif
   const T one_in_leading_positive = static_cast<T>(1)
                                     << (std::numeric_limits<T>::digits - 1);
   int leading_zeros = 0;
@@ -344,7 +316,6 @@ int CountLeadingZeros(T integer_input) {
     ++leading_zeros;
   }
   return leading_zeros;
-#endif
 }
 template
@@ -1039,8 +1010,8 @@ inline void NdArrayDescsForElementwiseBroadcast(const Dims<4>& input0_dims,
 // Copies dims to desc, calculating strides.
 template <int N>
-inline void CopyDimsToDesc(const RuntimeShape& input_shape,
-                           NdArrayDesc<N>* desc_out) {
+TFLITE_NOINLINE void CopyDimsToDesc(const RuntimeShape& input_shape,
+                                    NdArrayDesc<N>* desc_out) {
   int desc_stride = 1;
   for (int i = N - 1; i >= 0; --i) {
     desc_out->extents[i] = input_shape.Dims(i);
diff --git a/src/tensorflow/lite/kernels/internal/portable_tensor.h b/src/tensorflow/lite/kernels/internal/portable_tensor.h
index 45135b1f..1eee6217 100644
--- a/src/tensorflow/lite/kernels/internal/portable_tensor.h
+++ b/src/tensorflow/lite/kernels/internal/portable_tensor.h
@@ -23,10 +23,6 @@ limitations under the License.
 namespace tflite {
-inline RuntimeShape GetTensorShape(std::vector<int32_t> data) {
-  return RuntimeShape(data.size(), data.data());
-}
-
 // A list of tensors in a format that can be used by kernels like split and
 // concatenation.
 template <typename T>
diff --git a/src/tensorflow/lite/kernels/internal/portable_tensor_utils.cpp b/src/tensorflow/lite/kernels/internal/portable_tensor_utils.cpp
index a9cfee8e..024043d7 100644
--- a/src/tensorflow/lite/kernels/internal/portable_tensor_utils.cpp
+++ b/src/tensorflow/lite/kernels/internal/portable_tensor_utils.cpp
@@ -70,13 +70,19 @@ void ApplySignbitToVector(const float* __restrict__ vector, int v_size,
 void UnpackDenseInt4IntoInt8(const int8_t* src_buffer, int num_elements,
                              int8_t* dst_buffer) {
-  for (int i = 0; i < num_elements; i += 2) {
+  for (int i = 0; i < num_elements / 2; i++) {
+    int8_t byte = src_buffer[i];
     // Shift left first so that sign is properly extended when shifted right
-    dst_buffer[i] = static_cast<int8_t>(src_buffer[i / 2] << 4) >> 4;
-    // Break early if the tensor has odd length and the higher nibble should be
-    // ignored.
-    if (i + 1 == num_elements) break;
-    dst_buffer[i + 1] = static_cast<int8_t>(src_buffer[i / 2]) >> 4;
+    int8_t lower = static_cast<int8_t>(byte << 4) >> 4;
+    int8_t higher = byte >> 4;
+    dst_buffer[2 * i] = lower;
+    dst_buffer[2 * i + 1] = higher;
+  }
+
+  // If the buffer size is odd, extract the final lower nibble.
+  if (num_elements % 2 != 0) {
+    dst_buffer[num_elements - 1] =
+        static_cast<int8_t>(src_buffer[num_elements / 2] << 4) >> 4;
   }
 }
diff --git a/src/tensorflow/lite/kernels/internal/reference/add.h b/src/tensorflow/lite/kernels/internal/reference/add.h
index ae1f47a8..faffb097 100644
--- a/src/tensorflow/lite/kernels/internal/reference/add.h
+++ b/src/tensorflow/lite/kernels/internal/reference/add.h
@@ -194,18 +194,20 @@ inline void Add(const ArithmeticParams& params,
   }
 }
-template <typename T>
-inline typename std::enable_if<!is_small_integer<T>::value, void>::type
-BroadcastAdd4DSlow(const ArithmeticParams& params,
+template <typename T, bool dummy = false>
+inline typename std::enable_if<!is_small_integer<T>::value || dummy, void>::type
+BroadcastAdd6DSlow(const ArithmeticParams& params,
                    const RuntimeShape& input1_shape, const T* input1_data,
                    const RuntimeShape& input2_shape, const T* input2_data,
                    const RuntimeShape& output_shape, T* output_data) {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
+  NdArrayDesc<6> desc1;
+  NdArrayDesc<6> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                       &desc2);
   const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(4, output_shape);
+      RuntimeShape::ExtendedShape(6, output_shape);
   T activation_min, activation_max;
   GetActivationParams(params, &activation_min, &activation_max);
@@ -221,18 +223,64 @@ BroadcastAdd4DSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
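The rewritten UnpackDenseInt4IntoInt8() above pairs each byte into a low nibble (sign-extended via the shift-left/shift-right trick) and a high nibble, with a tail step for odd lengths. A standalone sketch with a worked example (the packed bytes are made up):

    #include <stdint.h>
    #include <stdio.h>

    // Same nibble-unpacking scheme as above: low nibble first, arithmetic
    // right shift for sign extension, odd tail handled separately.
    static void UnpackInt4(const int8_t* src, int n, int8_t* dst) {
      for (int i = 0; i < n / 2; i++) {
        int8_t byte = src[i];
        dst[2 * i] = (int8_t)(byte << 4) >> 4;  // low nibble, sign-extended
        dst[2 * i + 1] = byte >> 4;             // high nibble
      }
      if (n % 2 != 0) {
        dst[n - 1] = (int8_t)(src[n / 2] << 4) >> 4;
      }
    }

    int main(void) {
      // 0xF1 packs low = 1 and high = -1; 0x07 packs the odd tail value 7.
      const int8_t packed[] = {(int8_t)0xF1, 0x07};
      int8_t out[3];
      UnpackInt4(packed, 3, out);
      printf("%d %d %d\n", out[0], out[1], out[2]);  // prints: 1 -1 7
      return 0;
    }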
- for (int b = 0; b < extended_output_shape.Dims(0); ++b) { - for (int y = 0; y < extended_output_shape.Dims(1); ++y) { - for (int x = 0; x < extended_output_shape.Dims(2); ++x) { - for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, b, y, x, c)] + - input2_data[SubscriptToIndex(desc2, b, y, x, c)], + size_t input1_offset_a = 0; + size_t input2_offset_a = 0; + size_t output_offset_a = 0; + for (int a = 0; a < extended_output_shape.Dims(0); ++a) { + size_t input1_offset_d = input1_offset_a; + size_t input2_offset_d = input2_offset_a; + size_t output_offset_d = output_offset_a; + for (int d = 0; d < extended_output_shape.Dims(1); ++d) { + size_t input1_offset_b = input1_offset_d; + size_t input2_offset_b = input2_offset_d; + size_t output_offset_b = output_offset_d; + for (int b = 0; b < extended_output_shape.Dims(2); ++b) { + size_t input1_offset_y = input1_offset_b; + size_t input2_offset_y = input2_offset_b; + size_t output_offset_y = output_offset_b; + for (int y = 0; y < extended_output_shape.Dims(3); ++y) { + size_t input1_offset_x = input1_offset_y; + size_t input2_offset_x = input2_offset_y; + size_t output_offset_x = output_offset_y; + for (int x = 0; x < extended_output_shape.Dims(4); ++x) { + size_t input1_offset_c = input1_offset_x; + size_t input2_offset_c = input2_offset_x; + size_t output_offset_c = output_offset_x; + for (int c = 0; c < extended_output_shape.Dims(5); ++c) { + output_data[output_offset_c] = ActivationFunctionWithMinMax( + input1_data[input1_offset_c] + input2_data[input2_offset_c], activation_min, activation_max); + input1_offset_c += desc1.strides[5]; + input2_offset_c += desc2.strides[5]; + ++output_offset_c; + } + input1_offset_x += desc1.strides[4]; + input2_offset_x += desc2.strides[4]; + output_offset_x += extended_output_shape.Dims(5); + } + input1_offset_y += desc1.strides[3]; + input2_offset_y += desc2.strides[3]; + output_offset_y += + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_b += desc1.strides[2]; + input2_offset_b += desc2.strides[2]; + output_offset_b += extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } + input1_offset_d += desc1.strides[1]; + input2_offset_d += desc2.strides[1]; + output_offset_d += + extended_output_shape.Dims(2) * extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_a += desc1.strides[0]; + input2_offset_a += desc2.strides[0]; + output_offset_a += + extended_output_shape.Dims(1) * extended_output_shape.Dims(2) * + extended_output_shape.Dims(3) * extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } } @@ -241,16 +289,16 @@ BroadcastAdd4DSlow(const ArithmeticParams& params, // choice of the shift (20 or 15, accordingly - see add.cc for more comments). 
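// The offset-walking loops above trade one SubscriptToIndex computation per
// element for running offsets that advance by the NdArrayDesc strides; a
// broadcast dimension simply carries stride 0. A two-dimensional sketch of
// the invariant (hypothetical shapes, not taken from the patch):

#include <cassert>
#include <cstddef>

int main() {
  // Input shape {1, 4} broadcast against output shape {3, 4}: the broadcast
  // row dimension gets stride 0, so every output row re-reads the same four
  // input elements.
  const int strides[2] = {0, 1};  // {row, col} strides of the input
  const int dims[2] = {3, 4};     // output extents
  size_t offset_row = 0;
  for (int r = 0; r < dims[0]; ++r) {
    size_t offset_col = offset_row;
    for (int c = 0; c < dims[1]; ++c) {
      // Equivalent to SubscriptToIndex(desc, r, c): r * 0 + c * 1.
      assert(offset_col == static_cast<size_t>(c));
      offset_col += strides[1];
    }
    offset_row += strides[0];  // stride 0: stay on the same input row
  }
  return 0;
}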
template <typename T> inline typename std::enable_if<is_small_integer<T>::value, void>::type -BroadcastAdd4DSlow(const ArithmeticParams& params, +BroadcastAdd6DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, const T* input1_data, const RuntimeShape& input2_shape, const T* input2_data, const RuntimeShape& output_shape, T* output_data) { - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; + NdArrayDesc<6> desc1; + NdArrayDesc<6> desc2; NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(4, output_shape); + RuntimeShape::ExtendedShape(6, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -263,44 +311,98 @@ BroadcastAdd4DSlow(const ArithmeticParams& params, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < extended_output_shape.Dims(0); ++b) { - for (int y = 0; y < extended_output_shape.Dims(1); ++y) { - for (int x = 0; x < extended_output_shape.Dims(2); ++x) { - for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - const int32_t input1_val = - params.input1_offset + - input1_data[SubscriptToIndex(desc1, b, y, x, c)]; - const int32_t input2_val = - params.input2_offset + - input2_data[SubscriptToIndex(desc2, b, y, x, c)]; - const int32_t shifted_input1_val = - input1_val * (1 << params.left_shift); - const int32_t shifted_input2_val = - input2_val * (1 << params.left_shift); - const int32_t scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, - params.input1_shift); - const int32_t scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, - params.input2_shift); - const int32_t raw_sum = scaled_input1_val + scaled_input2_val; - const int32_t raw_output = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + - params.output_offset; - const int32_t clamped_output = - std::min(params.quantized_activation_max, - std::max(params.quantized_activation_min, raw_output)); - output_data[Offset(extended_output_shape, b, y, x, c)] = - static_cast<T>(clamped_output); + size_t input1_offset_a = 0; + size_t input2_offset_a = 0; + size_t output_offset_a = 0; + for (int a = 0; a < extended_output_shape.Dims(0); ++a) { + size_t input1_offset_d = input1_offset_a; + size_t input2_offset_d = input2_offset_a; + size_t output_offset_d = output_offset_a; + for (int d = 0; d < extended_output_shape.Dims(1); ++d) { + size_t input1_offset_b = input1_offset_d; + size_t input2_offset_b = input2_offset_d; + size_t output_offset_b = output_offset_d; + for (int b = 0; b < extended_output_shape.Dims(2); ++b) { + size_t input1_offset_y = input1_offset_b; + size_t input2_offset_y = input2_offset_b; + size_t output_offset_y = output_offset_b; + for (int y = 0; y < extended_output_shape.Dims(3); ++y) { + size_t input1_offset_x = input1_offset_y; + size_t input2_offset_x = input2_offset_y; + size_t output_offset_x = output_offset_y; + for (int x = 0; x < extended_output_shape.Dims(4); ++x) { + size_t input1_offset_c = input1_offset_x; + size_t input2_offset_c = input2_offset_x; + size_t output_offset_c = output_offset_x; + for (int c = 0; c < extended_output_shape.Dims(5); ++c) { + const int32_t
input1_val = + params.input1_offset + input1_data[input1_offset_c]; + const int32_t input2_val = + params.input2_offset + input2_data[input2_offset_c]; + const int32_t shifted_input1_val = + input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = + input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, + params.input1_shift); + const int32_t scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, + params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min( + params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[output_offset_c] = static_cast<T>(clamped_output); + input1_offset_c += desc1.strides[5]; + input2_offset_c += desc2.strides[5]; + ++output_offset_c; + } + input1_offset_x += desc1.strides[4]; + input2_offset_x += desc2.strides[4]; + output_offset_x += extended_output_shape.Dims(5); + } + input1_offset_y += desc1.strides[3]; + input2_offset_y += desc2.strides[3]; + output_offset_y += + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_b += desc1.strides[2]; + input2_offset_b += desc2.strides[2]; + output_offset_b += extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } + input1_offset_d += desc1.strides[1]; + input2_offset_d += desc2.strides[1]; + output_offset_d += + extended_output_shape.Dims(2) * extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_a += desc1.strides[0]; + input2_offset_a += desc2.strides[0]; + output_offset_a += + extended_output_shape.Dims(1) * extended_output_shape.Dims(2) * + extended_output_shape.Dims(3) * extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } } +template <typename T> +inline void BroadcastAdd4DSlow( + const ArithmeticParams& params, const RuntimeShape& input1_shape, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, T* output_data) { + return BroadcastAdd6DSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); +} + inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, const RuntimeShape& unswitched_input1_shape, const uint8_t* unswitched_input1_data, diff --git a/src/tensorflow/lite/kernels/internal/reference/comparisons.cpp b/src/tensorflow/lite/kernels/internal/reference/comparisons.cpp new file mode 100644 index 00000000..86b4a6af --- /dev/null +++ b/src/tensorflow/lite/kernels/internal/reference/comparisons.cpp @@ -0,0 +1,37 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/kernels/internal/reference/comparisons.h" + +namespace tflite { +namespace reference_ops { + +BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess( + const RuntimeShape& unextended_input1_shape, + const RuntimeShape& unextended_input2_shape, + const RuntimeShape& unextended_output_shape) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1, + desc2}; +} + +} // namespace reference_ops +} // namespace tflite diff --git a/src/tensorflow/lite/kernels/internal/reference/comparisons.h b/src/tensorflow/lite/kernels/internal/reference/comparisons.h index d3b8c115..35583195 100644 --- a/src/tensorflow/lite/kernels/internal/reference/comparisons.h +++ b/src/tensorflow/lite/kernels/internal/reference/comparisons.h @@ -112,20 +112,11 @@ struct BroadcastComparison4DSlowCommon { NdArrayDesc<4> desc2; }; -inline BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess( +TFLITE_NOINLINE +BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess( const RuntimeShape& unextended_input1_shape, const RuntimeShape& unextended_input2_shape, - const RuntimeShape& unextended_output_shape) { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1, - desc2}; -} + const RuntimeShape& unextended_output_shape); template <typename T, ComparisonFn<T> F> inline void BroadcastComparison4DSlowImpl( diff --git a/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h index 8d9b318c..579964dc 100644 --- a/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +++ b/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h @@ -35,30 +35,31 @@ inline void CheckArithmeticParams(const ArithmeticParams& params) { TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max()); } -inline void ElementWise( - int size, const ArithmeticParams& params, const int8_t* input1_data, - const int8_t* input2_data, int8_t* output_data, - void (*check_arithmetic_params)(const ArithmeticParams&), - int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) { +// TODO(b/270589088): move to a more appropriate file (b/270589088#comment2) +template <typename T> +void ElementWise(int size, const ArithmeticParams& params, const T* input1_data, + const T* input2_data, T* output_data, + void (*check_arithmetic_params)(const ArithmeticParams&), + T (*binary_func)(T, T, const ArithmeticParams&)) { CheckArithmeticParams(params); for (int i = 0; i < size; ++i) { output_data[i] = binary_func(input1_data[i], input2_data[i], params); } } - -inline void BroadcastBinaryFunction4DSlow( +// TODO(b/270589088): move to a more
appropriate file. (b/270589088#comment2) +template <typename T> +void BroadcastBinaryFunction6DSlow( const ArithmeticParams& params, const RuntimeShape& input1_shape, - const int8_t* input1_data, const RuntimeShape& input2_shape, - const int8_t* input2_data, const RuntimeShape& output_shape, - int8_t* output_data, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, T* output_data, void (*check_arithmetic_params)(const ArithmeticParams&), - int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) { - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; + T (*binary_func)(T, T, const ArithmeticParams&)) { + NdArrayDesc<6> desc1; + NdArrayDesc<6> desc2; NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(4, output_shape); + RuntimeShape::ExtendedShape(6, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -71,19 +72,79 @@ inline void BroadcastBinaryFunction4DSlow( // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < extended_output_shape.Dims(0); ++b) { - for (int y = 0; y < extended_output_shape.Dims(1); ++y) { - for (int x = 0; x < extended_output_shape.Dims(2); ++x) { - for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = binary_func( - input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)], params); + size_t input1_offset_a = 0; + size_t input2_offset_a = 0; + size_t output_offset_a = 0; + for (int a = 0; a < extended_output_shape.Dims(0); ++a) { + size_t input1_offset_d = input1_offset_a; + size_t input2_offset_d = input2_offset_a; + size_t output_offset_d = output_offset_a; + for (int d = 0; d < extended_output_shape.Dims(1); ++d) { + size_t input1_offset_b = input1_offset_d; + size_t input2_offset_b = input2_offset_d; + size_t output_offset_b = output_offset_d; + for (int b = 0; b < extended_output_shape.Dims(2); ++b) { + size_t input1_offset_y = input1_offset_b; + size_t input2_offset_y = input2_offset_b; + size_t output_offset_y = output_offset_b; + for (int y = 0; y < extended_output_shape.Dims(3); ++y) { + size_t input1_offset_x = input1_offset_y; + size_t input2_offset_x = input2_offset_y; + size_t output_offset_x = output_offset_y; + for (int x = 0; x < extended_output_shape.Dims(4); ++x) { + size_t input1_offset_c = input1_offset_x; + size_t input2_offset_c = input2_offset_x; + size_t output_offset_c = output_offset_x; + for (int c = 0; c < extended_output_shape.Dims(5); ++c) { + output_data[output_offset_c] = + binary_func(input1_data[input1_offset_c], + input2_data[input2_offset_c], params); + input1_offset_c += desc1.strides[5]; + input2_offset_c += desc2.strides[5]; + ++output_offset_c; + } + input1_offset_x += desc1.strides[4]; + input2_offset_x += desc2.strides[4]; + output_offset_x += extended_output_shape.Dims(5); + } + input1_offset_y += desc1.strides[3]; + input2_offset_y += desc2.strides[3]; + output_offset_y += + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_b += desc1.strides[2]; + input2_offset_b += desc2.strides[2]; + output_offset_b += extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) +
extended_output_shape.Dims(5); } + input1_offset_d += desc1.strides[1]; + input2_offset_d += desc2.strides[1]; + output_offset_d += + extended_output_shape.Dims(2) * extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_a += desc1.strides[0]; + input2_offset_a += desc2.strides[0]; + output_offset_a += + extended_output_shape.Dims(1) * extended_output_shape.Dims(2) * + extended_output_shape.Dims(3) * extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } } +template <typename T> +void BroadcastBinaryFunction4DSlow( + const ArithmeticParams& params, const RuntimeShape& input1_shape, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, T* output_data, + void (*check_arithmetic_params)(const ArithmeticParams&), + T (*binary_func)(T, T, const ArithmeticParams&)) { + BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, + check_arithmetic_params, binary_func); +} + inline int8_t AddFunc(int8_t x, int8_t y, const ArithmeticParams& params) { const int32_t input1_val = params.input1_offset + x; const int32_t input2_val = params.input2_offset + y; @@ -127,6 +188,18 @@ inline void Add(const ArithmeticParams& params, AddElementwise(flat_size, params, input1_data, input2_data, output_data); } +inline void BroadcastAdd6DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int8_t* input1_data, + const RuntimeShape& input2_shape, + const int8_t* input2_data, + const RuntimeShape& output_shape, + int8_t* output_data) { + BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, + CheckArithmeticParams, AddFunc); +} + inline void BroadcastAdd4DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, const int8_t* input1_data, @@ -134,7 +207,7 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params, const int8_t* input2_data, const RuntimeShape& output_shape, int8_t* output_data) { - BroadcastBinaryFunction4DSlow(params, input1_shape, input1_data, input2_shape, + BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, CheckArithmeticParams, AddFunc); } diff --git a/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h b/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h index 09d37b72..7e3f690e 100644 --- a/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +++ b/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h @@ -1,10 +1,10 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -15,65 +15,4 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_ -#include <algorithm> - -#include "tensorflow/lite/kernels/internal/common.h" - -namespace tflite { -namespace reference_integer_ops { - -template <typename integer_type> -inline void Mean(const tflite::MeanParams& op_params, int32_t multiplier, - int32_t shift, const RuntimeShape& unextended_input_shape, - const integer_type* input_data, int32_t input_zero_point, - const RuntimeShape& unextended_output_shape, - integer_type* output_data, int32_t output_zero_point) { - // Current implementation only supports dimension equals 4 and simultaneous - // reduction over width and height. - TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4); - TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4); - const RuntimeShape input_shape = - RuntimeShape::ExtendedShape(4, unextended_input_shape); - const RuntimeShape output_shape = - RuntimeShape::ExtendedShape(4, unextended_output_shape); - const int output_batch = output_shape.Dims(0); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - const int output_depth = output_shape.Dims(3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int num_elements_in_axis = input_width * input_height; - - TFLITE_CHECK_EQ(op_params.axis_count, 2); - TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) || - (op_params.axis[0] == 2 && op_params.axis[1] == 1)); - TFLITE_CHECK_EQ(output_height, 1); - TFLITE_CHECK_EQ(output_width, 1); - - static constexpr int32_t kMinInt = std::numeric_limits<integer_type>::min(); - static constexpr int32_t kMaxInt = std::numeric_limits<integer_type>::max(); - - for (int out_b = 0; out_b < output_batch; ++out_b) { - for (int out_d = 0; out_d < output_depth; ++out_d) { - int32_t acc = 0; - for (int in_h = 0; in_h < input_height; ++in_h) { - for (int in_w = 0; in_w < input_width; ++in_w) { - acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)] - - input_zero_point; - } - } - acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift); - acc = acc > 0 ? (acc + num_elements_in_axis / 2) / num_elements_in_axis - : (acc - num_elements_in_axis / 2) / num_elements_in_axis; - acc += output_zero_point; - acc = std::min(std::max(acc, kMinInt), kMaxInt); - output_data[Offset(output_shape, out_b, 0, 0, out_d)] = - static_cast<integer_type>(acc); - } - } -} - -} // namespace reference_integer_ops -} // namespace tflite - #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_ diff --git a/src/tensorflow/lite/kernels/internal/reference/mul.h b/src/tensorflow/lite/kernels/internal/reference/mul.h index 53197732..2767fef2 100644 --- a/src/tensorflow/lite/kernels/internal/reference/mul.h +++ b/src/tensorflow/lite/kernels/internal/reference/mul.h @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
@@ -56,7 +56,7 @@ inline void Mul(const ArithmeticParams& params, const int flat_size = MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { - output_data[i] = ActivationFunctionWithMinMax( + output_data[i] = ActivationFunctionWithMinMax<T>( input1_data[i] * input2_data[i], output_activation_min, output_activation_max); } @@ -128,14 +128,18 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params, } } -template <typename T> -void BroadcastMul4DSlow(const ArithmeticParams& params, - const RuntimeShape& unextended_input1_shape, - const T* input1_data, - const RuntimeShape& unextended_input2_shape, - const T* input2_data, - const RuntimeShape& unextended_output_shape, - T* output_data) { +template <typename T, bool enable_for_short_integers = false> +inline typename std::enable_if< + !is_small_integer<T>::value || enable_for_short_integers, void>::type +BroadcastMul4DSlow(const ArithmeticParams& params, + const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const T* input2_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { T output_activation_min; T output_activation_max; GetActivationParams(params, &output_activation_min, &output_activation_max); @@ -167,7 +171,7 @@ void BroadcastMul4DSlow(const ArithmeticParams& params, for (int x = 0; x < output_shape.Dims(2); ++x) { for (int c = 0; c < output_shape.Dims(3); ++c) { output_data[Offset(output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax( + ActivationFunctionWithMinMax<T>( input1_data[SubscriptToIndex(desc1, b, y, x, c)] * input2_data[SubscriptToIndex(desc2, b, y, x, c)], output_activation_min, output_activation_max); diff --git a/src/tensorflow/lite/kernels/internal/reference/reduce.h b/src/tensorflow/lite/kernels/internal/reference/reduce.h index c4d7598b..ab4745fc 100644 --- a/src/tensorflow/lite/kernels/internal/reference/reduce.h +++ b/src/tensorflow/lite/kernels/internal/reference/reduce.h @@ -268,11 +268,11 @@ inline bool Mean(const T* input_data, const int* input_dims, return true; } -template <typename T> inline void Mean(const tflite::MeanParams& op_params, const RuntimeShape& unextended_input_shape, - const T* input_data, - const RuntimeShape& unextended_output_shape, T* output_data) { + const float* input_data, + const RuntimeShape& unextended_output_shape, + float* output_data) { ruy::profiler::ScopeLabel label("Mean4D"); // Current implementation only supports dimension equals 4 and simultaneous @@ -312,78 +312,21 @@ inline void Mean(const tflite::MeanParams& op_params, } } -inline void Mean(const tflite::MeanParams& op_params, - const RuntimeShape& unextended_input_shape, - const uint8_t* input_data, int32_t input_zero_point, - float input_scale, const RuntimeShape& unextended_output_shape, - uint8_t* output_data, int32_t output_zero_point, - float output_scale) { - ruy::profiler::ScopeLabel label("Mean4D/Uint8"); - - // Current implementation only supports dimension equals 4 and simultaneous - // reduction over width and height.
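// BroadcastMul4DSlow above is now SFINAE-gated the same way as
// BroadcastAdd6DSlow: small integer element types are excluded unless the
// caller opts in, and ActivationFunctionWithMinMax gets an explicit <T> so
// mixed-type arguments cannot silently promote. A rough sketch of how such a
// guard behaves (the trait below is a stand-in for illustration, not the
// library's exact definition):

#include <cstdint>
#include <type_traits>

template <typename T>
struct is_small_integer
    : std::integral_constant<bool, std::is_same<T, int8_t>::value ||
                                       std::is_same<T, uint8_t>::value ||
                                       std::is_same<T, int16_t>::value ||
                                       std::is_same<T, uint16_t>::value> {};

template <typename T, bool enable_for_short_integers = false>
typename std::enable_if<
    !is_small_integer<T>::value || enable_for_short_integers, T>::type
MulScalar(T a, T b) {
  return a * b;
}

int main() {
  MulScalar(2.0f, 3.0f);           // float: always enabled
  MulScalar<int16_t, true>(2, 3);  // small integer: explicit opt-in required
  // MulScalar<int16_t>(2, 3);     // would fail to compile without opt-in
  return 0;
}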
- TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4); - TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4); - const RuntimeShape input_shape = - RuntimeShape::ExtendedShape(4, unextended_input_shape); - const RuntimeShape output_shape = - RuntimeShape::ExtendedShape(4, unextended_output_shape); - const int output_batch = output_shape.Dims(0); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - const int output_depth = output_shape.Dims(3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const float num_elements_in_axis = input_width * input_height; - - TFLITE_CHECK_EQ(op_params.axis_count, 2); - TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) || - (op_params.axis[0] == 2 && op_params.axis[1] == 1)); - TFLITE_CHECK_EQ(output_height, 1); - TFLITE_CHECK_EQ(output_width, 1); - - constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min(); - constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max(); - - float temp = input_zero_point * input_scale / output_scale; - temp = temp > 0 ? temp + 0.5f : temp - 0.5f; - int32_t bias = output_zero_point - static_cast<int32_t>(temp); - double real_scale = - static_cast<double>(input_scale / (num_elements_in_axis * output_scale)); - - int32_t multiplier; - int shift; - QuantizeMultiplier(real_scale, &multiplier, &shift); - for (int out_b = 0; out_b < output_batch; ++out_b) { - for (int out_d = 0; out_d < output_depth; ++out_d) { - int32_t acc = 0; - for (int in_h = 0; in_h < input_height; ++in_h) { - for (int in_w = 0; in_w < input_width; ++in_w) { - acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; - } - } - acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift); - acc += bias; - acc = std::min(std::max(acc, kMinValue), kMaxValue); - output_data[Offset(output_shape, out_b, 0, 0, out_d)] = - static_cast<uint8_t>(acc); - } - } -} - // Computes the mean of elements across dimensions given in axis. // It does so in two stages, first calculates the sum of elements along the axis // then divides it by the number of element in axis for quantized values. template <typename T, typename U> inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point, - float input_scale, const int* input_dims, - const int input_num_dims, T* output_data, - int32_t output_zero_point, float output_scale, + const int* input_dims, const int input_num_dims, + T* output_data, int32_t output_multiplier, + int output_shift, int32_t output_zero_point, const int* output_dims, const int output_num_dims, const int* axis, const int num_axis_dimensions, bool keep_dims, int* temp_index, int* resolved_axis, U* temp_sum, bool compute_sum) { + const int32_t kMinValue = std::numeric_limits<T>::min(); + const int32_t kMaxValue = std::numeric_limits<T>::max(); const bool uint8_case = std::is_same<T, uint8_t>::value; const bool int16_case = std::is_same<T, int16_t>::value; if (uint8_case) { @@ -430,40 +373,46 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point, } // Calculate mean by dividing output_data by num of aggregated element. - size_t num_elements_in_axis = 1; + int64_t num_elements_in_axis = 1; for (int idx = 0; idx < num_resolved_axis; ++idx) { size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]); // Overflow prevention.
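// The rewritten QuantizedMeanOrSum below drops the float scale/bias math: for
// a mean, the 1/num_elements_in_axis factor is folded into the integer output
// multiplier. Worked example with illustrative values: averaging over
// 3 x 3 = 9 elements with output_multiplier == 1 << 30 (0.5 in Q0.31) and
// output_shift == 0 gives
//   shift             = 63 - CountLeadingZeros(uint64_t{9}) = 3
//   output_multiplier = ((int64_t{1} << 30) << 3) / 9 = 954437176 (~0.444)
//   output_shift      = 0 - 3 = -3
// so the effective scale is 0.444 * 2^-3 ~= 0.0556 == 0.5 / 9. The guard just
// below first keeps the int64_t product of the reduced axis sizes from
// overflowing while num_elements_in_axis is accumulated.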
- if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) { + if (current > static_cast<size_t>(std::numeric_limits<int64_t>::max() / + num_elements_in_axis)) { return false; } num_elements_in_axis *= current; } - if (num_elements_in_axis > 0) { - const float scale = input_scale / output_scale; - if (compute_sum) { - // TODO(b/116341117): Eliminate float and do this completely in 8bit. - const float bias = -input_zero_point * scale * num_elements_in_axis; - for (size_t idx = 0; idx < num_outputs; ++idx) { - const U value = - static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) + - output_zero_point; - output_data[idx] = static_cast<T>(value); - } - } else { - const float bias = -input_zero_point * scale; - for (size_t idx = 0; idx < num_outputs; ++idx) { - float float_mean = static_cast<float>(temp_sum[idx]) / - static_cast<float>(num_elements_in_axis); - float result = TfLiteMin( - TfLiteRound(float_mean * scale + bias) + output_zero_point, - static_cast<float>(std::numeric_limits<T>::max())); - result = TfLiteMax(result, - static_cast<float>(std::numeric_limits<T>::min())); - output_data[idx] = static_cast<T>(result); - } - } + if (num_elements_in_axis == 0) { + return true; + } + + // Readapt output rescaling when calculating the mean to integrate a + // 1/num_elements_in_axis multiplier. + if (!compute_sum) { + TFLITE_DCHECK_GE(num_elements_in_axis, 0); + int shift = + 63 - CountLeadingZeros(static_cast<uint64_t>(num_elements_in_axis)); + // To avoid any overflow risk 'shift' should be <= 32 and to satisfy + // 'MultiplyByQuantizedMultiplier' pre-conditions 'output_shift - shift' + // should be >= -31. Clamp the value at the price of some precision loss. + shift = std::min(shift, 32); + shift = std::min(shift, 31 + output_shift); + output_multiplier = static_cast<int32_t>( + (static_cast<int64_t>(output_multiplier) << shift) / + num_elements_in_axis); + output_shift = output_shift - shift; + } + + for (size_t idx = 0; idx < num_outputs; ++idx) { + const U shifted_sum = + static_cast<U>(temp_sum[idx] - input_zero_point * num_elements_in_axis); + int32_t output = MultiplyByQuantizedMultiplier( + shifted_sum, output_multiplier, output_shift) + + output_zero_point; + output = std::min(std::max(output, kMinValue), kMaxValue); + output_data[idx] = static_cast<T>(output); } return true; } @@ -478,8 +427,8 @@ inline bool QuantizedMeanOrSumExtraArgs( bool keep_dims, int* temp_index, int* resolved_axis, U* temp_sum, bool compute_sum) { return QuantizedMeanOrSum( - input_data, input_zero_point, input_scale, input_dims, input_num_dims, - output_data, output_zero_point, output_scale, output_dims, + input_data, input_zero_point, input_dims, input_num_dims, output_data, + output_multiplier, output_shift, output_zero_point, output_dims, output_num_dims, axis, num_axis_dimensions, keep_dims, temp_index, resolved_axis, temp_sum, compute_sum); } diff --git a/src/tensorflow/lite/kernels/internal/runtime_shape.h b/src/tensorflow/lite/kernels/internal/runtime_shape.h index c2678b57..0e4df2c3 100644 --- a/src/tensorflow/lite/kernels/internal/runtime_shape.h +++ b/src/tensorflow/lite/kernels/internal/runtime_shape.h @@ -15,6 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_ +#include "tensorflow/lite/kernels/internal/compatibility.h" + namespace tflite { template <int N> @@ -27,16 +29,19 @@ class RuntimeShape { public: RuntimeShape& operator=(RuntimeShape const&) = delete; - // RuntimeShape in TFLM supports up to 5 dimensions. + // RuntimeShape in TFLM supports up to 6 dimensions.
// The name kMaxSmallSize comes from the same file of the upstream // tensorflow lite repo and need to be kept the same for max reuse. - static constexpr int kMaxSmallSize = 5; + static constexpr int kMaxSmallSize = 6; RuntimeShape() : size_(0) {} - explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {} + explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) { + TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize); + } RuntimeShape(int shape_size, int32_t value) : size_(shape_size) { + TFLITE_DCHECK_LE(shape_size, kMaxSmallSize); for (int i = 0; i < shape_size; ++i) { SetDim(i, value); } @@ -44,6 +49,7 @@ class RuntimeShape { RuntimeShape(int dimensions_count, const int32_t* dims_data) : size_(dimensions_count) { + // check of dimensions_count handled by ReplaceWith() ReplaceWith(dimensions_count, dims_data); } @@ -69,6 +75,7 @@ class RuntimeShape { static RuntimeShape ExtendedShape(int new_shape_size, const RuntimeShape& shape) { + TFLITE_DCHECK_LE(new_shape_size, kMaxSmallSize); return RuntimeShape(new_shape_size, shape, 1); } int32_t* DimsData() { return dims_; } @@ -76,6 +83,7 @@ class RuntimeShape { const int32_t* DimsDataUpTo5D() const { return dims_; } void ReplaceWith(int dimensions_count, const int32_t* dims_data) { + TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize); size_ = dimensions_count; int32_t* dst_dims = DimsData(); std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.h b/src/tensorflow/lite/kernels/internal/tensor_ctypes.cpp similarity index 50% rename from src/tensorflow/lite/experimental/microfrontend/lib/fft_util.h rename to src/tensorflow/lite/kernels/internal/tensor_ctypes.cpp index 6a471301..6bd58fc1 100644 --- a/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.h +++ b/src/tensorflow/lite/kernels/internal/tensor_ctypes.cpp @@ -1,4 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,23 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_UTIL_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_UTIL_H_ -#include "tensorflow/lite/experimental/microfrontend/lib/fft.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#ifdef __cplusplus -extern "C" { -#endif +#include <vector> -// Prepares and FFT for the given input size. -int FftPopulateState(struct FftState* state, size_t input_size); +namespace tflite { -// Frees any allocated buffers.
-void FftFreeStateContents(struct FftState* state); +RuntimeShape GetTensorShape(const TfLiteTensor* tensor) { + if (tensor == nullptr) { + return RuntimeShape(); + } -#ifdef __cplusplus -} // extern "C" -#endif + TfLiteIntArray* dims = tensor->dims; + const int dims_size = dims->size; + const int32_t* dims_data = reinterpret_cast<const int32_t*>(dims->data); + return RuntimeShape(dims_size, dims_data); +} -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_UTIL_H_ +RuntimeShape GetTensorShape(std::vector<int32_t> data) { + return RuntimeShape(data.size(), data.data()); +} + +} // namespace tflite diff --git a/src/tensorflow/lite/kernels/internal/tensor_ctypes.h b/src/tensorflow/lite/kernels/internal/tensor_ctypes.h index 7e639b91..9a7205c0 100644 --- a/src/tensorflow/lite/kernels/internal/tensor_ctypes.h +++ b/src/tensorflow/lite/kernels/internal/tensor_ctypes.h @@ -15,7 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_ +#include <vector> + #include "tensorflow/lite/core/c/common.h" +#include "tensorflow/lite/core/macros.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -31,16 +34,8 @@ inline const T* GetTensorData(const TfLiteTensor* tensor) { : nullptr; } -inline RuntimeShape GetTensorShape(const TfLiteTensor* tensor) { - if (tensor == nullptr) { - return RuntimeShape(); - } - - TfLiteIntArray* dims = tensor->dims; - const int dims_size = dims->size; - const int32_t* dims_data = reinterpret_cast<const int32_t*>(dims->data); - return RuntimeShape(dims_size, dims_data); -} +TFLITE_NOINLINE RuntimeShape GetTensorShape(const TfLiteTensor* tensor); +RuntimeShape GetTensorShape(std::vector<int32_t> data); } // namespace tflite diff --git a/src/tensorflow/lite/kernels/internal/types.h b/src/tensorflow/lite/kernels/internal/types.h index 77f741bb..043a8513 100644 --- a/src/tensorflow/lite/kernels/internal/types.h +++ b/src/tensorflow/lite/kernels/internal/types.h @@ -1,4 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -659,6 +659,9 @@ struct ArithmeticParams { // int64_t activation params. int64_t int64_activation_min; int64_t int64_activation_max; + // int16_t activation params. + int16_t int16_activation_min; + int16_t int16_activation_max; // Processed output dimensions. // Let input "a" be the one that broadcasts in the faster-changing dimension.
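// GetTensorShape and the other helpers marked TFLITE_NOINLINE above are now
// defined out of line: on size-constrained micro targets this trades a call
// for not duplicating the body at every call site. The macro comes from
// tensorflow/lite/core/macros.h and expands to the compiler's noinline
// attribute roughly as in this sketch; treat the exact guards as an
// assumption rather than the verbatim header:

#if defined(__GNUC__) || defined(__clang__)
#define TFLITE_NOINLINE __attribute__((noinline))
#else
#define TFLITE_NOINLINE
#endif

// Usage mirrors the declarations above:
TFLITE_NOINLINE int HeavyHelper(int x) { return x * x; }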
@@ -1022,6 +1025,18 @@ inline void SetActivationParams(int32_t min, int32_t max, P* params) { params->quantized_activation_max = max; } +template <typename P> +inline void SetActivationParams(uint32_t min, uint32_t max, P* params) { + params->quantized_activation_min = min; + params->quantized_activation_max = max; +} + +template <typename P> +inline void SetActivationParams(int16_t min, int16_t max, P* params) { + params->int16_activation_min = min; + params->int16_activation_max = max; +} + template <typename P> inline void SetActivationParams(int64_t min, int64_t max, P* params) { params->int64_activation_min = min; @@ -1034,6 +1049,18 @@ inline void GetActivationParams(const P& params, int32_t* min, int32_t* max) { *max = params.quantized_activation_max; } +template <typename P> +inline void GetActivationParams(const P& params, uint32_t* min, uint32_t* max) { + *min = params.quantized_activation_min; + *max = params.quantized_activation_max; +} + +template <typename P> +inline void GetActivationParams(const P& params, int16_t* min, int16_t* max) { + *min = params.int16_activation_min; + *max = params.int16_activation_max; +} + template <typename P> inline void GetActivationParams(const P& params, float* min, float* max) { *min = params.float_activation_min; diff --git a/src/tensorflow/lite/micro/fake_micro_context.cpp b/src/tensorflow/lite/micro/fake_micro_context.cpp index 81f74ae3..03ea6dfc 100644 --- a/src/tensorflow/lite/micro/fake_micro_context.cpp +++ b/src/tensorflow/lite/micro/fake_micro_context.cpp @@ -39,16 +39,26 @@ FakeMicroContext::FakeMicroContext(TfLiteTensor* tensors, allocator_(allocator) {} TfLiteTensor* FakeMicroContext::AllocateTempTfLiteTensor(int tensor_index) { - allocated_tensor_count_++; + allocated_temp_count_++; return &tensors_[tensor_index]; } void FakeMicroContext::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) { - allocated_tensor_count_--; + allocated_temp_count_--; } bool FakeMicroContext::IsAllTempTfLiteTensorDeallocated() { - return !allocated_tensor_count_; + return !allocated_temp_count_; +} + +uint8_t* FakeMicroContext::AllocateTempBuffer(size_t size, size_t alignment) { + allocated_temp_count_++; + return allocator_->AllocateTemp(size, alignment); +} + +void FakeMicroContext::DeallocateTempBuffer(uint8_t* buffer) { + allocated_temp_count_--; + allocator_->DeallocateTemp(buffer); } TfLiteEvalTensor* FakeMicroContext::GetEvalTensor(int tensor_index) { diff --git a/src/tensorflow/lite/micro/fake_micro_context.h b/src/tensorflow/lite/micro/fake_micro_context.h index 31b39d38..b068f326 100644 --- a/src/tensorflow/lite/micro/fake_micro_context.h +++ b/src/tensorflow/lite/micro/fake_micro_context.h @@ -21,6 +21,10 @@ limitations under the License. namespace tflite { // A fake of MicroContext for kernel util tests. +// TODO(b/272759060): FakeMicroContext currently inherits from MicroContext. +// Which allow tests to use functions from MicroContext that weren't added to +// FakeMicroContext in tests. This should be looked into further.
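// The new SetActivationParams/GetActivationParams overloads above route by
// argument type: int16_t bounds land in the dedicated int16_activation_*
// fields rather than the int32 ones. A minimal sketch of the dispatch (the
// struct below is a stand-in showing only the relevant fields):

#include <cstdint>

struct Params {
  int32_t quantized_activation_min = 0;
  int32_t quantized_activation_max = 0;
  int16_t int16_activation_min = 0;
  int16_t int16_activation_max = 0;
};

template <typename P>
void SetActivationParams(int32_t min, int32_t max, P* params) {
  params->quantized_activation_min = min;
  params->quantized_activation_max = max;
}

template <typename P>
void SetActivationParams(int16_t min, int16_t max, P* params) {
  params->int16_activation_min = min;
  params->int16_activation_max = max;
}

int main() {
  Params p;
  // Exact-match int16_t arguments select the int16 overload ...
  SetActivationParams(int16_t{-100}, int16_t{100}, &p);
  // ... while int32_t arguments still fill the quantized_activation_* fields.
  SetActivationParams(int32_t{-1000}, int32_t{1000}, &p);
  return 0;
}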
+ class FakeMicroContext : public MicroContext { public: FakeMicroContext(TfLiteTensor* tensors, SingleArenaBufferAllocator* allocator, @@ -35,6 +39,9 @@ class FakeMicroContext : public MicroContext { void DeallocateTempTfLiteTensor(TfLiteTensor* tensor) override; bool IsAllTempTfLiteTensorDeallocated(); + uint8_t* AllocateTempBuffer(size_t size, size_t alignment) override; + void DeallocateTempBuffer(uint8_t* buffer) override; + TfLiteEvalTensor* GetEvalTensor(int tensor_index) override; private: @@ -44,7 +51,7 @@ class FakeMicroContext : public MicroContext { uint8_t* scratch_buffers_[kNumScratchBuffers_]; TfLiteTensor* tensors_; - int allocated_tensor_count_ = 0; + int allocated_temp_count_ = 0; SingleArenaBufferAllocator* allocator_; diff --git a/src/tensorflow/lite/micro/kernels/activations.cpp b/src/tensorflow/lite/micro/kernels/activations.cpp index 716dd6fc..3227ffbf 100644 --- a/src/tensorflow/lite/micro/kernels/activations.cpp +++ b/src/tensorflow/lite/micro/kernels/activations.cpp @@ -109,11 +109,11 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_RELU() { +TfLiteRegistration_V1 Register_RELU() { return tflite::micro::RegisterOp(ReluInit, ReluPrepare, ReluEval); } -TfLiteRegistration Register_RELU6() { +TfLiteRegistration_V1 Register_RELU6() { return tflite::micro::RegisterOp(Relu6Init, Relu6Prepare, Relu6Eval); } diff --git a/src/tensorflow/lite/micro/kernels/add.h b/src/tensorflow/lite/micro/kernels/add.h index 5b7be70c..6ec489c0 100644 --- a/src/tensorflow/lite/micro/kernels/add.h +++ b/src/tensorflow/lite/micro/kernels/add.h @@ -60,17 +60,17 @@ TfLiteStatus CalculateOpDataAdd(TfLiteContext* context, TfLiteAddParams* params, TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node); // Generic must define registration function. -TfLiteRegistration Register_ADD(); +TfLiteRegistration_V1 Register_ADD(); #if defined(ARDUINO) -TfLiteRegistration Register_ADD_INT8(); +TfLiteRegistration_V1 Register_ADD_INT8(); -TfLiteRegistration Register_ADD_INT16(); +TfLiteRegistration_V1 Register_ADD_INT16(); #else // Fallback registration -inline TfLiteRegistration Register_ADD_INT8() { return Register_ADD(); } +inline TfLiteRegistration_V1 Register_ADD_INT8() { return Register_ADD(); } -inline TfLiteRegistration Register_ADD_INT16() { return Register_ADD(); } +inline TfLiteRegistration_V1 Register_ADD_INT16() { return Register_ADD(); } #endif } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/add_common.cpp b/src/tensorflow/lite/micro/kernels/add_common.cpp index b285b800..cc945091 100644 --- a/src/tensorflow/lite/micro/kernels/add_common.cpp +++ b/src/tensorflow/lite/micro/kernels/add_common.cpp @@ -39,6 +39,8 @@ TfLiteStatus CalculateOpDataAdd(TfLiteContext* context, TfLiteAddParams* params, data->requires_broadcast = !HaveSameShapes(input1, input2); if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) { + TFLITE_CHECK_NE(output->quantization.type, kTfLiteNoQuantization); + // 8bit -> 8bit general quantized path, with general rescalings data->input1_offset = -input1->params.zero_point; data->input2_offset = -input2->params.zero_point; @@ -97,6 +99,14 @@ TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_STATUS( CalculateOpDataAdd(context, params, input1, input2, output, data)); + if (output->type == kTfLiteInt32) { + // Only support int32 unquantized add for now. 
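// The kTfLiteNoQuantization checks that follow exist because the int32 add
// path is a plain integer addition: the kernels pin the activation range to
// the full int32 span (as in the CMSIS-NN EvalAdd later in this diff) and
// never rescale, so scales or zero points on an int32 tensor would be
// silently ignored rather than honored.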
+ TF_LITE_ENSURE_EQ(context, input1->quantization.type, + kTfLiteNoQuantization); + TF_LITE_ENSURE_EQ(context, input2->quantization.type, + kTfLiteNoQuantization); + } + micro_context->DeallocateTempTfLiteTensor(input1); micro_context->DeallocateTempTfLiteTensor(input2); micro_context->DeallocateTempTfLiteTensor(output); diff --git a/src/tensorflow/lite/micro/kernels/add_n.cpp b/src/tensorflow/lite/micro/kernels/add_n.cpp index 1139e1a9..eea554be 100644 --- a/src/tensorflow/lite/micro/kernels/add_n.cpp +++ b/src/tensorflow/lite/micro/kernels/add_n.cpp @@ -208,7 +208,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_ADD_N() { +TfLiteRegistration_V1 Register_ADD_N() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/arg_min_max.cpp b/src/tensorflow/lite/micro/kernels/arg_min_max.cpp index 7c78e475..c38c19b3 100644 --- a/src/tensorflow/lite/micro/kernels/arg_min_max.cpp +++ b/src/tensorflow/lite/micro/kernels/arg_min_max.cpp @@ -107,11 +107,11 @@ TfLiteStatus ArgMaxEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_ARG_MAX() { +TfLiteRegistration_V1 Register_ARG_MAX() { return tflite::micro::RegisterOp(nullptr, nullptr, ArgMaxEval); } -TfLiteRegistration Register_ARG_MIN() { +TfLiteRegistration_V1 Register_ARG_MIN() { return tflite::micro::RegisterOp(nullptr, nullptr, ArgMinEval); } diff --git a/src/tensorflow/lite/micro/kernels/assign_variable.cpp b/src/tensorflow/lite/micro/kernels/assign_variable.cpp index f3aa12fa..a29fa57b 100644 --- a/src/tensorflow/lite/micro/kernels/assign_variable.cpp +++ b/src/tensorflow/lite/micro/kernels/assign_variable.cpp @@ -60,9 +60,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { MicroGraph& graph_info = micro_context->graph(); MicroResourceVariables* resources = graph_info.GetResourceVariables(); - TF_LITE_ENSURE_OK(context, - resources->Allocate(input_resource_id_tensor->data.i32[0], - context, input_value)); + // If the data field of this tensor is nullptr, we assume that this is a case + // of using resource variables in another subgraph, and the resource_id + // will be valid during Eval time. In case it wasn't valid, this will + // still be caught during Invoke. More info in b/277231654. + if (input_resource_id_tensor->data.i32 != nullptr) { + TF_LITE_ENSURE_OK(context, + resources->Allocate(input_resource_id_tensor->data.i32[0], + context, input_value)); + } micro_context->DeallocateTempTfLiteTensor(input_value); return kTfLiteOk; @@ -94,7 +100,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. -TfLiteRegistration Register_ASSIGN_VARIABLE() { +TfLiteRegistration_V1 Register_ASSIGN_VARIABLE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/batch_to_space_nd.cpp b/src/tensorflow/lite/micro/kernels/batch_to_space_nd.cpp index 83fb3568..29ca2ff9 100644 --- a/src/tensorflow/lite/micro/kernels/batch_to_space_nd.cpp +++ b/src/tensorflow/lite/micro/kernels/batch_to_space_nd.cpp @@ -105,7 +105,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
-TfLiteRegistration Register_BATCH_TO_SPACE_ND() { +TfLiteRegistration_V1 Register_BATCH_TO_SPACE_ND() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/broadcast_args.cpp b/src/tensorflow/lite/micro/kernels/broadcast_args.cpp index be2672ec..a526971c 100644 --- a/src/tensorflow/lite/micro/kernels/broadcast_args.cpp +++ b/src/tensorflow/lite/micro/kernels/broadcast_args.cpp @@ -83,7 +83,7 @@ TfLiteStatus BroadcastArgsEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_BROADCAST_ARGS() { +TfLiteRegistration_V1 Register_BROADCAST_ARGS() { return tflite::micro::RegisterOp(nullptr, BroadcastArgsPrepare, BroadcastArgsEval); } diff --git a/src/tensorflow/lite/micro/kernels/broadcast_to.cpp b/src/tensorflow/lite/micro/kernels/broadcast_to.cpp index 63a14db2..9a32331f 100644 --- a/src/tensorflow/lite/micro/kernels/broadcast_to.cpp +++ b/src/tensorflow/lite/micro/kernels/broadcast_to.cpp @@ -115,7 +115,7 @@ TfLiteStatus BroadcastToEval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_BROADCAST_TO() { +TfLiteRegistration_V1 Register_BROADCAST_TO() { return tflite::micro::RegisterOp(nullptr, BroadcastToPrepare, BroadcastToEval); } diff --git a/src/tensorflow/lite/micro/kernels/call_once.cpp b/src/tensorflow/lite/micro/kernels/call_once.cpp index 200242b2..9fdf7d05 100644 --- a/src/tensorflow/lite/micro/kernels/call_once.cpp +++ b/src/tensorflow/lite/micro/kernels/call_once.cpp @@ -81,7 +81,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. -TfLiteRegistration Register_CALL_ONCE() { +TfLiteRegistration_V1 Register_CALL_ONCE() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/cast.cpp b/src/tensorflow/lite/micro/kernels/cast.cpp index 0a0204d2..6dd20d1f 100644 --- a/src/tensorflow/lite/micro/kernels/cast.cpp +++ b/src/tensorflow/lite/micro/kernels/cast.cpp @@ -107,7 +107,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_CAST() { +TfLiteRegistration_V1 Register_CAST() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/ceil.cpp b/src/tensorflow/lite/micro/kernels/ceil.cpp index dbcd57c2..5716afef 100644 --- a/src/tensorflow/lite/micro/kernels/ceil.cpp +++ b/src/tensorflow/lite/micro/kernels/ceil.cpp @@ -66,7 +66,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_CEIL() { +TfLiteRegistration_V1 Register_CEIL() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/circular_buffer.cpp b/src/tensorflow/lite/micro/kernels/circular_buffer.cpp index 9779c32d..e598fc5a 100644 --- a/src/tensorflow/lite/micro/kernels/circular_buffer.cpp +++ b/src/tensorflow/lite/micro/kernels/circular_buffer.cpp @@ -108,8 +108,8 @@ TfLiteStatus CircularBufferEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration* Register_CIRCULAR_BUFFER() { - static TfLiteRegistration r = tflite::micro::RegisterOp( +TfLiteRegistration_V1* Register_CIRCULAR_BUFFER() { + static TfLiteRegistration_V1 r = tflite::micro::RegisterOp( CircularBufferInit, CircularBufferPrepare, CircularBufferEval); return &r; } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/add.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/add.cpp index 99ad9142..75cd3a52 100644 --- 
a/src/tensorflow/lite/micro/kernels/cmsis_nn/add.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/add.cpp @@ -198,29 +198,60 @@ TfLiteStatus EvalAddQuantizedInt16(TfLiteContext* context, TfLiteNode* node, return kTfLiteOk; } -void EvalAddFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteAddParams* params, const OpData* data, - const TfLiteEvalTensor* input1, - const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) { - tflite::ArithmeticParams op_params; - SetActivationParams(data->output_activation_min_f32, - data->output_activation_max_f32, &op_params); - if (data->requires_broadcast) { - reference_ops::BroadcastAdd4DSlow( - op_params, tflite::micro::GetTensorShape(input1), - tflite::micro::GetTensorData<float>(input1), - tflite::micro::GetTensorShape(input2), - tflite::micro::GetTensorData<float>(input2), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData<float>(output)); - } else { - reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), - tflite::micro::GetTensorData<float>(input1), - tflite::micro::GetTensorShape(input2), - tflite::micro::GetTensorData<float>(input2), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData<float>(output)); +TfLiteStatus EvalAdd(TfLiteContext* context, TfLiteNode* node, + TfLiteAddParams* params, const OpData* data, + const TfLiteEvalTensor* input1, + const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) { + switch (output->type) { + case kTfLiteFloat32: { + tflite::ArithmeticParams op_params; + SetActivationParams(data->output_activation_min_f32, + data->output_activation_max_f32, &op_params); + if (data->requires_broadcast) { + reference_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData<float>(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData<float>(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<float>(output)); + } else { + reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData<float>(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData<float>(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<float>(output)); + } + } break; + case kTfLiteInt32: { + tflite::ArithmeticParams op_params; + SetActivationParams(std::numeric_limits<int32_t>::lowest(), + std::numeric_limits<int32_t>::max(), &op_params); + if (data->requires_broadcast) { + reference_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData<int32_t>(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData<int32_t>(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<int32_t>(output)); + } else { + reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData<int32_t>(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData<int32_t>(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<int32_t>(output)); + } + } break; + default: + MicroPrintf("Type %s (%d) not supported.", + TfLiteTypeGetName(output->type), output->type); + return kTfLiteError; } + + return kTfLiteOk; } TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, @@ -282,6 +313,14 @@ TfLiteStatus PrepareAdd(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_STATUS( CalculateOpData(context, params, input1, input2, output, data)); + if (output->type == kTfLiteInt32) { + // Only support int32 unquantized add for now.
+ TF_LITE_ENSURE_EQ(context, input1->quantization.type, + kTfLiteNoQuantization); + TF_LITE_ENSURE_EQ(context, input2->quantization.type, + kTfLiteNoQuantization); + } + micro_context->DeallocateTempTfLiteTensor(input1); micro_context->DeallocateTempTfLiteTensor(input2); micro_context->DeallocateTempTfLiteTensor(output); @@ -302,8 +341,9 @@ TfLiteStatus EvalAdd(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); const OpData* data = static_cast<const OpData*>(node->user_data); - if (output->type == kTfLiteFloat32) { - EvalAddFloat(context, node, params, data, input1, input2, output); + if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) { + TF_LITE_ENSURE_OK( + context, EvalAdd(context, node, params, data, input1, input2, output)); } else if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) { TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data, input1, input2, output)); @@ -356,15 +396,15 @@ TfLiteStatus EvalAddInt16(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration Register_ADD() { +TfLiteRegistration_V1 Register_ADD() { return tflite::micro::RegisterOp(InitAdd, PrepareAdd, EvalAdd); } -TfLiteRegistration Register_ADD_INT8() { +TfLiteRegistration_V1 Register_ADD_INT8() { return tflite::micro::RegisterOp(InitAdd, PrepareAdd, EvalAddInt8); } -TfLiteRegistration Register_ADD_INT16() { +TfLiteRegistration_V1 Register_ADD_INT16() { return tflite::micro::RegisterOp(InitAdd, PrepareAdd, EvalAddInt16); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp index 504fd1ee..2655918c 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ limitations under the License.
#include "tensorflow/lite/micro/kernels/conv.h" -#include "third_party/cmsis_nn/Include/arm_nn_types.h" #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" @@ -467,15 +466,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_CONV_2D() { +TfLiteRegistration_V1 Register_CONV_2D() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } -TfLiteRegistration Register_CONV_2D_INT8() { +TfLiteRegistration_V1 Register_CONV_2D_INT8() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt8); } -TfLiteRegistration Register_CONV_2D_INT16() { +TfLiteRegistration_V1 Register_CONV_2D_INT16() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt16x8); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/depthwise_conv.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/depthwise_conv.cpp index ebc760c4..5d46447e 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/depthwise_conv.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/depthwise_conv.cpp @@ -433,15 +433,15 @@ TfLiteStatus EvalInt16x8(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_DEPTHWISE_CONV_2D() { +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } -TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT8() { +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT8() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt8); } -TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT16() { +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT16() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt16x8); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cpp index cacdf04a..0381b071 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cpp @@ -421,15 +421,15 @@ TfLiteStatus EvalInt16(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FULLY_CONNECTED() { +TfLiteRegistration_V1 Register_FULLY_CONNECTED() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } -TfLiteRegistration Register_FULLY_CONNECTED_INT8() { +TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT8() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt8); } -TfLiteRegistration Register_FULLY_CONNECTED_INT16() { +TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT16() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt16); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/mul.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/mul.cpp index 5bbbf11f..8cc3027a 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/mul.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/mul.cpp @@ -169,15 +169,15 @@ TfLiteStatus EvalInt16(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration Register_MUL() { +TfLiteRegistration_V1 Register_MUL() { return tflite::micro::RegisterOp(MulInit, MulPrepare, Eval); } -TfLiteRegistration Register_MUL_INT8() { +TfLiteRegistration_V1 Register_MUL_INT8() { return tflite::micro::RegisterOp(MulInit, MulPrepare, EvalInt8); } -TfLiteRegistration Register_MUL_INT16() { +TfLiteRegistration_V1 Register_MUL_INT16() { return tflite::micro::RegisterOp(MulInit, MulPrepare, EvalInt16); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/pooling.cpp 
b/src/tensorflow/lite/micro/kernels/cmsis_nn/pooling.cpp index ce4ba765..e944ba2c 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/pooling.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/pooling.cpp @@ -319,27 +319,27 @@ TfLiteStatus MaxEvalInt16(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_AVERAGE_POOL_2D_INT8() { +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT8() { return tflite::micro::RegisterOp(Init, AveragePrepare, AverageEvalInt8); } -TfLiteRegistration Register_AVERAGE_POOL_2D_INT16() { +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT16() { return tflite::micro::RegisterOp(Init, AveragePrepare, AverageEvalInt16); } -TfLiteRegistration Register_AVERAGE_POOL_2D() { +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D() { return tflite::micro::RegisterOp(Init, AveragePrepare, AverageEval); } -TfLiteRegistration Register_MAX_POOL_2D_INT8() { +TfLiteRegistration_V1 Register_MAX_POOL_2D_INT8() { return tflite::micro::RegisterOp(Init, MaxPrepare, MaxEvalInt8); } -TfLiteRegistration Register_MAX_POOL_2D_INT16() { +TfLiteRegistration_V1 Register_MAX_POOL_2D_INT16() { return tflite::micro::RegisterOp(Init, MaxPrepare, MaxEvalInt16); } -TfLiteRegistration Register_MAX_POOL_2D() { +TfLiteRegistration_V1 Register_MAX_POOL_2D() { return tflite::micro::RegisterOp(Init, MaxPrepare, MaxEval); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/softmax.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/softmax.cpp index 9efe8943..93ae608d 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/softmax.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/softmax.cpp @@ -190,19 +190,19 @@ TfLiteStatus SoftmaxEvalInt16(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SOFTMAX() { +TfLiteRegistration_V1 Register_SOFTMAX() { return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEval); } -TfLiteRegistration Register_SOFTMAX_INT8() { +TfLiteRegistration_V1 Register_SOFTMAX_INT8() { return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEvalInt8); } -TfLiteRegistration Register_SOFTMAX_INT8_INT16() { +TfLiteRegistration_V1 Register_SOFTMAX_INT8_INT16() { return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEvalInt8_Int16); } -TfLiteRegistration Register_SOFTMAX_INT16() { +TfLiteRegistration_V1 Register_SOFTMAX_INT16() { return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEvalInt16); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp index 6941d223..c3ed8095 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ limitations under the License. 
#include "tensorflow/lite/micro/kernels/svdf.h" -#include "third_party/cmsis_nn/Include/arm_nn_types.h" #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" @@ -213,11 +212,11 @@ TfLiteStatus EvalSvdfInt8(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SVDF() { +TfLiteRegistration_V1 Register_SVDF() { return tflite::micro::RegisterOp(Init, PrepareSvdf, EvalSvdf); } -TfLiteRegistration Register_SVDF_INT8() { +TfLiteRegistration_V1 Register_SVDF_INT8() { return tflite::micro::RegisterOp(Init, PrepareSvdf, EvalSvdfInt8); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cpp new file mode 100644 index 00000000..421d7666 --- /dev/null +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cpp @@ -0,0 +1,683 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Integer version of unidirectional sequence LSTM. Only the standard LSTM +// (defined in the keras LSTM layer, e.g., no peephole etc.) is supported here. +// Currently used by the 8 bits activation case only, except for fallbacks. 
+
+#include <cmath>
+#include <limits>
+
+#include "third_party/cmsis_nn/Include/arm_nnfunctions.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/fully_connected.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/lstm_eval.h"
+#include "tensorflow/lite/micro/kernels/lstm_shared.h"
+#include "tensorflow/lite/micro/kernels/micro_tensor_utils.h"
+
+namespace tflite {
+
+namespace {
+
+struct OpData {
+  OpDataLSTM params_ref;
+  cmsis_nn_lstm_params params_cmsis_nn;
+};
+
+/*Helper Functions*/
+TfLiteStatus PrecomputeZeroPointTimesWeightWithBias(
+    TfLiteContext* context, int32_t zero_point,
+    const TfLiteTensor* weight_tensor, const TfLiteTensor* bias_tensor,
+    int32_t** output) {
+  if (weight_tensor == nullptr) {
+    return kTfLiteOk;
+  }
+
+  const RuntimeShape& weight_shape = GetTensorShape(weight_tensor);
+  TF_LITE_ENSURE_EQ(context, weight_shape.DimensionsCount(), 2);
+  const int row = weight_shape.Dims(0);
+  const int col = weight_shape.Dims(1);
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  *output = static_cast<int32_t*>(
+      context->AllocatePersistentBuffer(context, row * sizeof(int32_t)));
+
+  if (bias_tensor == nullptr) {
+    memset(*output, 0, row * sizeof(int32_t));
+  } else {
+    const int32_t* bias = GetTensorData<int32_t>(bias_tensor);
+    memcpy(*output, bias, row * sizeof(int32_t));
+  }
+
+  if (zero_point != 0) {
+    const int8_t* weight = GetTensorData<int8_t>(weight_tensor);
+    tflite::tensor_utils::MatrixScalarMultiplyAccumulate(weight, zero_point,
+                                                         row, col, *output);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
+                             const LstmTensors& lstm_tensors, OpData* op_data) {
+  const TfLiteTensor* input = lstm_tensors.GetInternalTensor(kLstmInputTensor);
+  const TfLiteTensor* output_state =
+      lstm_tensors.GetInternalTensor(tflite::kLstmOutputStateTensor);
+
+  TF_LITE_ENSURE(context, input->type == kTfLiteInt8);
+
+  op_data->params_cmsis_nn.output_state_offset =
+      output_state->params.zero_point;
+
+  const TfLiteTensor* input_to_forget_weights =
+      lstm_tensors.GetInternalTensor(kLstmInputToForgetWeightsTensor);
+  const TfLiteTensor* input_to_input_weights =
+      lstm_tensors.GetInternalTensor(kLstmInputToInputWeightsTensor);
+  const TfLiteTensor* input_to_output_weights =
+      lstm_tensors.GetInternalTensor(kLstmInputToOutputWeightsTensor);
+  const TfLiteTensor* input_to_cell_weights =
+      lstm_tensors.GetInternalTensor(kLstmInputToCellWeightsTensor);
+  const TfLiteTensor* forget_gate_bias =
+      lstm_tensors.GetInternalTensor(kLstmForgetGateBiasTensor);
+  const TfLiteTensor* cell_state =
+      lstm_tensors.GetInternalTensor(kLstmCellStateTensor);
+
+  const TfLiteTensor* cell_gate_bias =
+      lstm_tensors.GetInternalTensor(kLstmCellGateBiasTensor);
+  const TfLiteTensor* output_gate_bias =
+      lstm_tensors.GetInternalTensor(kLstmOutputGateBiasTensor);
+  const TfLiteTensor* input_gate_bias =
+      lstm_tensors.GetInternalTensor(kLstmInputGateBiasTensor);
+  const TfLiteTensor* recurrent_to_forget_weights =
+      lstm_tensors.GetInternalTensor(kLstmRecurrentToForgetWeightsTensor);
+  const TfLiteTensor* recurrent_to_cell_weights =
+      lstm_tensors.GetInternalTensor(kLstmRecurrentToCellWeightsTensor);
+  const TfLiteTensor* recurrent_to_output_weights =
+      lstm_tensors.GetInternalTensor(kLstmRecurrentToOutputWeightsTensor);
+  const TfLiteTensor* recurrent_to_input_weights =
+      lstm_tensors.GetInternalTensor(kLstmRecurrentToInputWeightsTensor);
+
+  const TfLiteTensor* cell_to_output_weights =
+      lstm_tensors.GetInternalTensor(kLstmCellToOutputWeightsTensor);
+  const TfLiteTensor* forget_layer_norm_coefficients =
+      lstm_tensors.GetInternalTensor(kLstmForgetLayerNormCoefficientsTensor);
+  const TfLiteTensor* projection_weights =
+      lstm_tensors.GetInternalTensor(kLstmProjectionWeightsTensor);
+
+  const bool use_layer_norm = (forget_layer_norm_coefficients != nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+  const bool use_projection = (projection_weights != nullptr);
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool lstm_unsupported_config =
+      use_layer_norm || use_peephole || use_projection || use_cifg;
+  TFLITE_DCHECK(!lstm_unsupported_config);
+
+  // Pre-calculate bias + zero_point * weight.
+  int32_t* input_to_forget_effective_bias = nullptr;
+  int32_t* recurrent_to_forget_effective_bias = nullptr;
+  int32_t* input_to_cell_effective_bias = nullptr;
+  int32_t* recurrent_to_cell_effective_bias = nullptr;
+  int32_t* input_to_output_effective_bias = nullptr;
+  int32_t* recurrent_to_output_effective_bias = nullptr;
+  int32_t* input_to_input_effective_bias = nullptr;
+  int32_t* recurrent_to_input_effective_bias = nullptr;
+
+  const int32_t output_state_zero_point =
+      -op_data->params_cmsis_nn.output_state_offset;
+  const int32_t input_zero_point = -input->params.zero_point;
+
+  TF_LITE_ENSURE_OK(context,
+                    PrecomputeZeroPointTimesWeightWithBias(
+                        context, input_zero_point, input_to_forget_weights,
+                        forget_gate_bias, &input_to_forget_effective_bias));
+
+  TF_LITE_ENSURE_OK(context, PrecomputeZeroPointTimesWeightWithBias(
+                                 context, output_state_zero_point,
+                                 recurrent_to_forget_weights, nullptr,
+                                 &recurrent_to_forget_effective_bias));
+
+  // Modulation gate.
+  TF_LITE_ENSURE_OK(context,
+                    PrecomputeZeroPointTimesWeightWithBias(
+                        context, input_zero_point, input_to_cell_weights,
+                        cell_gate_bias, &input_to_cell_effective_bias));
+  TF_LITE_ENSURE_OK(
+      context, PrecomputeZeroPointTimesWeightWithBias(
+                   context, output_state_zero_point, recurrent_to_cell_weights,
+                   nullptr, &recurrent_to_cell_effective_bias));
+
+  // Output gate.
+  TF_LITE_ENSURE_OK(context,
+                    PrecomputeZeroPointTimesWeightWithBias(
+                        context, input_zero_point, input_to_output_weights,
+                        output_gate_bias, &input_to_output_effective_bias));
+
+  TF_LITE_ENSURE_OK(context, PrecomputeZeroPointTimesWeightWithBias(
+                                 context, output_state_zero_point,
+                                 recurrent_to_output_weights, nullptr,
+                                 &recurrent_to_output_effective_bias));
+
+  // Input gate. The calculation is only meaningful for non-cifg case.
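These PrecomputeZeroPointTimesWeightWithBias calls fold the constant zero-point contribution of each quantized matmul into the bias once, at prepare time. With the sign-flipped zero point $z$ passed in (note the negated offsets above), row $r$ of a weight matrix $W$ gets

$$ b^{\text{eff}}_r = b_r + z \sum_c W_{r,c}, $$

which lets the eval-time inner loop run a plain int8 dot product over the raw quantized activations and still land on the correctly offset accumulator. The input-gate pair below completes the same pattern for the fourth gate.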
+  TF_LITE_ENSURE_OK(context,
+                    PrecomputeZeroPointTimesWeightWithBias(
+                        context, input_zero_point, input_to_input_weights,
+                        input_gate_bias, &input_to_input_effective_bias));
+  TF_LITE_ENSURE_OK(
+      context, PrecomputeZeroPointTimesWeightWithBias(
+                   context, output_state_zero_point, recurrent_to_input_weights,
+                   nullptr, &recurrent_to_input_effective_bias));
+
+  op_data->params_cmsis_nn.i2f_effective_bias = input_to_forget_effective_bias;
+  op_data->params_cmsis_nn.r2f_effective_bias =
+      recurrent_to_forget_effective_bias;
+  op_data->params_cmsis_nn.i2c_effective_bias = input_to_cell_effective_bias;
+  op_data->params_cmsis_nn.r2c_effective_bias =
+      recurrent_to_cell_effective_bias;
+  op_data->params_cmsis_nn.i2o_effective_bias = input_to_output_effective_bias;
+  op_data->params_cmsis_nn.r2o_effective_bias =
+      recurrent_to_output_effective_bias;
+  op_data->params_cmsis_nn.i2i_effective_bias = input_to_input_effective_bias;
+  op_data->params_cmsis_nn.r2i_effective_bias =
+      recurrent_to_input_effective_bias;
+
+  // Get intermediate scales and zero points.
+  float intermediate_scale[5];
+  int32_t intermediate_zp[5];
+  for (int i = 0; i < 4; ++i) {
+    // Q3.12 for activation functions.
+    intermediate_scale[i] = std::pow(2.0f, -12.0f);
+    intermediate_zp[i] = 0;
+  }
+
+  MicroContext* micro_context = GetMicroContext(context);
+  // In the absence of projection, hidden becomes output and this intermediate
+  // is ignored.
+  TfLiteTensor* hidden = micro_context->AllocateTempIntermediateTensor(node, 4);
+  TF_LITE_ENSURE(context, hidden->quantization.type != kTfLiteNoQuantization);
+  auto* hidden_params =
+      static_cast<TfLiteAffineQuantization*>(hidden->quantization.params);
+  intermediate_scale[4] = hidden_params->scale->data[0];
+  intermediate_zp[4] = hidden_params->zero_point->data[0];
+  if (hidden != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(hidden);
+  }
+
+  // Scales.
+  const float default_scale = 1.0;
+  float input_scale = default_scale;
+  float input_to_input_weight_scale = default_scale;
+  float recurrent_to_input_weight_scale = default_scale;
+  float input_to_forget_weight_scale = default_scale;
+  float recurrent_to_forget_weight_scale = default_scale;
+  float input_to_cell_weight_scale = default_scale;
+  float recurrent_to_cell_weight_scale = default_scale;
+  float input_to_output_weight_scale = default_scale;
+  float recurrent_to_output_weight_scale = default_scale;
+  float output_state_scale = default_scale;
+  int cell_scale = 1;
+
+  // Effective scales.
+  float effective_input_to_input_scale = default_scale;
+  float effective_recurrent_to_input_scale = default_scale;
+  float effective_cell_to_input_scale = default_scale;
+  float effective_input_to_forget_scale = default_scale;
+  float effective_recurrent_to_forget_scale = default_scale;
+  float effective_cell_to_forget_scale = default_scale;
+  float effective_input_to_cell_scale = default_scale;
+  float effective_recurrent_to_cell_scale = default_scale;
+  float effective_input_to_output_scale = default_scale;
+  float effective_recurrent_to_output_scale = default_scale;
+  float effective_cell_to_output_scale = default_scale;
+  float effective_hidden_scale = default_scale;
+
+  // Populate scales.
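Before the scales are populated, a quick note on the Q3.12 format chosen for the four gate intermediates above: a real value $x$ is stored as the int16 quantity $q = \operatorname{round}(x \cdot 2^{12})$, so the representable range is $[-8, 8)$ at a resolution of $2^{-12} \approx 2.44 \times 10^{-4}$. That comfortably brackets the $[-1, 1]$ outputs of sigmoid and tanh while leaving headroom for the pre-activation sums fed into them.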
+  input_to_input_weight_scale = input_to_input_weights->params.scale;
+  recurrent_to_input_weight_scale = recurrent_to_input_weights->params.scale;
+
+  output_state_scale = output_state->params.scale;
+
+  input_to_forget_weight_scale = input_to_forget_weights->params.scale;
+  input_to_cell_weight_scale = input_to_cell_weights->params.scale;
+  input_to_output_weight_scale = input_to_output_weights->params.scale;
+  recurrent_to_forget_weight_scale = recurrent_to_forget_weights->params.scale;
+  recurrent_to_cell_weight_scale = recurrent_to_cell_weights->params.scale;
+  recurrent_to_output_weight_scale = recurrent_to_output_weights->params.scale;
+
+  // Check cell state (already used above).
+  TF_LITE_ENSURE(context, CheckedLog2(cell_state->params.scale, &cell_scale));
+  TF_LITE_ENSURE(context, cell_scale <= -9);
+
+  op_data->params_cmsis_nn.cell_state_shift = cell_scale;
+  input_scale = input->params.scale;
+
+  // Calculate effective scales.
+  effective_input_to_input_scale =
+      input_to_input_weight_scale * input_scale / intermediate_scale[0];
+  effective_recurrent_to_input_scale = recurrent_to_input_weight_scale *
+                                       output_state_scale /
+                                       intermediate_scale[0];
+
+  effective_input_to_forget_scale =
+      input_to_forget_weight_scale * input_scale / intermediate_scale[1];
+  effective_recurrent_to_forget_scale = recurrent_to_forget_weight_scale *
+                                        output_state_scale /
+                                        intermediate_scale[1];
+
+  effective_input_to_cell_scale =
+      input_to_cell_weight_scale * input_scale / intermediate_scale[2];
+  effective_recurrent_to_cell_scale = recurrent_to_cell_weight_scale *
+                                      output_state_scale /
+                                      intermediate_scale[2];
+
+  effective_input_to_output_scale =
+      input_to_output_weight_scale * input_scale / intermediate_scale[3];
+  effective_recurrent_to_output_scale = recurrent_to_output_weight_scale *
+                                        output_state_scale /
+                                        intermediate_scale[3];
+
+  effective_hidden_scale =
+      std::pow(2.0f, -15.0f) / intermediate_scale[4] * std::pow(2.0f, -15.0f);
+
+  // Decompose scales.
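"Decompose" here means splitting each real-valued effective scale into an int32 fixed-point multiplier plus a power-of-two shift, which is all the integer kernels can apply at run time. Below is a sketch of what QuantizeMultiplier (declared in quantization_util.h) does under the usual Q31 convention; treat it as an illustration rather than the exact library source.

#include <cmath>
#include <cstdint>

// Decompose real_multiplier ~= multiplier * 2^(shift - 31), with
// multiplier an int32 in [2^30, 2^31) for maximum precision.
void DecomposeScale(double real_multiplier, int32_t* multiplier, int* shift) {
  if (real_multiplier == 0.0) {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(real_multiplier, shift);  // q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {  // rounding pushed q up to 1.0: renormalize
    q_fixed /= 2;
    ++*shift;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
}

At eval time the runtime then computes roughly (x * multiplier) >> (31 - shift) with rounding, which is why every scaling struct filled in below stores exactly a multiplier and a shift.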
+  int shift_output;
+  QuantizeMultiplier(
+      static_cast<double>(effective_input_to_input_scale),
+      &op_data->params_cmsis_nn.input_to_input_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.input_to_input_scaling.shift =
+      static_cast<int32_t>(shift_output);
+
+  QuantizeMultiplier(
+      static_cast<double>(effective_recurrent_to_input_scale),
+      &op_data->params_cmsis_nn.recurrent_to_input_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.recurrent_to_input_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_cell_to_input_scale),
+                     &op_data->params_cmsis_nn.cell_to_input_scaling.multiplier,
+                     &shift_output);
+  op_data->params_cmsis_nn.cell_to_input_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_input_to_forget_scale),
+      &op_data->params_cmsis_nn.input_to_forget_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.input_to_forget_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_recurrent_to_forget_scale),
+      &op_data->params_cmsis_nn.recurrent_to_forget_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.recurrent_to_forget_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_cell_to_forget_scale),
+      &op_data->params_cmsis_nn.cell_to_forget_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.cell_to_forget_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_input_to_cell_scale),
+                     &op_data->params_cmsis_nn.input_to_cell_scaling.multiplier,
+                     &shift_output);
+  op_data->params_cmsis_nn.input_to_cell_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_recurrent_to_cell_scale),
+      &op_data->params_cmsis_nn.recurrent_to_cell_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.recurrent_to_cell_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_input_to_output_scale),
+      &op_data->params_cmsis_nn.input_to_output_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.input_to_output_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_recurrent_to_output_scale),
+      &op_data->params_cmsis_nn.recurrent_to_output_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.recurrent_to_output_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_cell_to_output_scale),
+      &op_data->params_cmsis_nn.cell_to_output_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.cell_to_output_scaling.shift =
+      static_cast<int32_t>(shift_output);
+
+  op_data->params_cmsis_nn.projection_scaling.shift =
+      static_cast<int32_t>(shift_output);
+
+  QuantizeMultiplier(static_cast<double>(effective_hidden_scale),
+                     &op_data->params_cmsis_nn.hidden_scaling.multiplier,
+                     &shift_output);
+  op_data->params_cmsis_nn.hidden_scaling.shift =
+      static_cast<int32_t>(shift_output);
+
+  op_data->params_cmsis_nn.hidden_offset = intermediate_zp[4];
+
+  op_data->params_cmsis_nn.activation.min = std::numeric_limits<int16_t>::min();
+  op_data->params_cmsis_nn.activation.max = std::numeric_limits<int16_t>::max();
+
+  return kTfLiteOk;
+}
+
+template <typename CellType>
+TfLiteStatus CMSIS_NN_EvalInteger8x8_16Lstm(
+    const OpData& op_data, const LSTMKernelContents& kernel_content,
+    const LSTMBuffers<CellType>& buffers) {
+  const OpDataLSTM& op_data_lstm = op_data.params_ref;
+  const TfLiteEvalTensor* input =
+      kernel_content.GetInternalTensor(tflite::kLstmInputTensor);
+  const TfLiteEvalTensor* input_gate_bias =
+      kernel_content.GetInternalTensor(tflite::kLstmInputGateBiasTensor);
+  const TfLiteEvalTensor* forget_gate_bias =
+      kernel_content.GetInternalTensor(tflite::kLstmForgetGateBiasTensor);
+  const TfLiteEvalTensor* cell_gate_bias =
+      kernel_content.GetInternalTensor(tflite::kLstmCellGateBiasTensor);
+  const TfLiteEvalTensor* output_gate_bias =
+      kernel_content.GetInternalTensor(tflite::kLstmOutputGateBiasTensor);
+  const TfLiteEvalTensor* input_to_output_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmInputToOutputWeightsTensor);
+  const TfLiteEvalTensor* recurrent_to_output_weights =
+      kernel_content.GetInternalTensor(
+          tflite::kLstmRecurrentToOutputWeightsTensor);
+  const TfLiteEvalTensor* input_to_input_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmInputToInputWeightsTensor);
+  const TfLiteEvalTensor* input_to_forget_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmInputToForgetWeightsTensor);
+  const TfLiteEvalTensor* input_to_cell_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmInputToCellWeightsTensor);
+  const TfLiteEvalTensor* recurrent_to_input_weights =
+      kernel_content.GetInternalTensor(
+          tflite::kLstmRecurrentToInputWeightsTensor);
+  const TfLiteEvalTensor* recurrent_to_forget_weights =
+      kernel_content.GetInternalTensor(
+          tflite::kLstmRecurrentToForgetWeightsTensor);
+  const TfLiteEvalTensor* recurrent_to_cell_weights =
+      kernel_content.GetInternalTensor(
+          tflite::kLstmRecurrentToCellWeightsTensor);
+  const TfLiteEvalTensor* cell_to_input_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmCellToInputWeightsTensor);
+  const TfLiteEvalTensor* cell_to_forget_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmCellToForgetWeightsTensor);
+  const TfLiteEvalTensor* cell_to_output_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmCellToOutputWeightsTensor);
+  const TfLiteEvalTensor* cell_state =
+      kernel_content.GetInternalTensor(tflite::kLstmCellStateTensor);
+  const TfLiteEvalTensor* output_state =
+      kernel_content.GetInternalTensor(tflite::kLstmOutputStateTensor);
+  const TfLiteEvalTensor* output = kernel_content.output_tensor;
+
+  TFLITE_DCHECK(input->dims->size >= 2 && input->dims->size <= 3);
+
+  cmsis_nn_lstm_context scratch_buffers;
+  scratch_buffers.input_gate = reinterpret_cast<int16_t*>(buffers.buffer0);
+  scratch_buffers.forget_gate = reinterpret_cast<int16_t*>(buffers.buffer1);
+  scratch_buffers.cell_gate = reinterpret_cast<int16_t*>(buffers.buffer2);
+  scratch_buffers.output_gate = reinterpret_cast<int16_t*>(buffers.buffer3);
+
+  cmsis_nn_lstm_params cmsis_lstm_params = op_data.params_cmsis_nn;
+  cmsis_lstm_params.time_major = op_data_lstm.size_info.time_major;
+  cmsis_lstm_params.clip.cell =
+      op_data_lstm.cell_state_info.quantized_cell_clip;
+
+  cmsis_lstm_params.input_gate_bias = const_cast<int32_t*>(
+      tflite::micro::GetOptionalTensorData<int32_t>(input_gate_bias));
+  cmsis_lstm_params.forget_gate_bias = const_cast<int32_t*>(
+      tflite::micro::GetOptionalTensorData<int32_t>(forget_gate_bias));
+  cmsis_lstm_params.cell_gate_bias = const_cast<int32_t*>(
+      tflite::micro::GetOptionalTensorData<int32_t>(cell_gate_bias));
+  cmsis_lstm_params.output_gate_bias = const_cast<int32_t*>(
+      tflite::micro::GetOptionalTensorData<int32_t>(output_gate_bias));
+
+  const bool time_major = op_data_lstm.size_info.time_major;
+  const int n_input = input->dims->data[input->dims->size - 1];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  int max_time, n_batch;
+  if (input->dims->size == 2) {
+    max_time = 1;
+    n_batch = input->dims->data[0];
+  } else {
+    max_time = (time_major) ? input->dims->data[0] : input->dims->data[1];
+    n_batch = (time_major) ? input->dims->data[1] : input->dims->data[0];
+  }
+
+  cmsis_nn_lstm_dims lstm_dims;
+  lstm_dims.num_inputs = n_input;
+  lstm_dims.num_outputs = n_output;
+  lstm_dims.num_batches = n_batch;
+  lstm_dims.max_time = max_time;
+
+  arm_lstm_unidirectional_s16_s8(
+      &scratch_buffers,
+      const_cast<int8_t*>(tflite::micro::GetTensorData<int8_t>(input)),
+      &lstm_dims,
+      const_cast<int8_t*>(
+          tflite::micro::GetOptionalTensorData<int8_t>(input_to_input_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          input_to_forget_weights)),
+      const_cast<int8_t*>(
+          tflite::micro::GetOptionalTensorData<int8_t>(input_to_cell_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          input_to_output_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          recurrent_to_input_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          recurrent_to_forget_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          recurrent_to_cell_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          recurrent_to_output_weights)),
+      const_cast<int16_t*>(
+          tflite::micro::GetOptionalTensorData<int16_t>(cell_to_input_weights)),
+      const_cast<int16_t*>(tflite::micro::GetOptionalTensorData<int16_t>(
+          cell_to_forget_weights)),
+      const_cast<int16_t*>(tflite::micro::GetOptionalTensorData<int16_t>(
+          cell_to_output_weights)),
+      nullptr, &cmsis_lstm_params,
+      const_cast<int8_t*>(tflite::micro::GetTensorData<int8_t>(output_state)),
+      const_cast<int16_t*>(tflite::micro::GetTensorData<int16_t>(cell_state)),
+      const_cast<int8_t*>(tflite::micro::GetTensorData<int8_t>(output)));
+
+  return kTfLiteOk;
+}
+
+/*Kernel functions*/
+
+void* UnidirectionalSequenceLstmInit(TfLiteContext* context, const char* buffer,
+                                     size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+}
+
+TfLiteStatus UnidirectionalSequenceLstmPrepare(TfLiteContext* context,
+                                               TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 24);
+
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  TFLITE_DCHECK(node->user_data != nullptr);
+
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  OpDataLSTM* op_data_lstm = &op_data->params_ref;
+
+  const auto* builtin_data =
+      static_cast<TfLiteUnidirectionalSequenceLSTMParams*>(node->builtin_data);
+  // All TempTfLiteTensors will be deallocated through the destructor.
+  LstmTensors lstm_tensors(context, node);
+  TF_LITE_ENSURE_OK(context, lstm_tensors.ValidateTensorStatus(context));
+
+  op_data_lstm->cell_gate_nonlinear_type = builtin_data->activation;
+  op_data_lstm->size_info =
+      CreateLstmSizeInfo(builtin_data->time_major,
+                         lstm_tensors.GetInternalTensor(kLstmInputTensor)->dims,
+                         lstm_tensors.HiddenStateTensor()->dims);
+
+  const TfLiteTensor* input = lstm_tensors.GetInternalTensor(kLstmInputTensor);
+  const auto activation_type = input->type;
+
+  if (kTfLiteInt8 == activation_type) {
+    TF_LITE_ENSURE_STATUS(
+        CalculateOpData(context, node, lstm_tensors, op_data));
+  }
+
+  TF_LITE_ENSURE_OK(context, ValidateTensorSize(context, lstm_tensors,
+                                                op_data_lstm->size_info));
+
+  // Create cell state information and gate parameters (Fully Connected and
+  // Mul).
+  auto cell_state_type =
+      lstm_tensors.GetInternalTensor(kLstmCellStateTensor)->type;
+  if (cell_state_type == kTfLiteFloat32) {
+    op_data_lstm->cell_state_info =
+        CreateLstmCellStateInfoFloat(builtin_data->cell_clip);
+    TF_LITE_ENSURE_OK(context, PrepareGateParametersFloat(context, lstm_tensors,
+                                                          op_data_lstm));
+  } else if (cell_state_type == kTfLiteInt16) {
+    op_data_lstm->cell_state_info = CreateLstmCellStateInfo(
+        lstm_tensors.CellStateTensor()->params.scale, builtin_data->cell_clip);
+    TF_LITE_ENSURE_OK(context, PrepareGateParametersInteger(
+                                   context, lstm_tensors, op_data_lstm));
+  } else {
+    MicroPrintf(
+        "Cell state type %s (%d) not supported. The quantized Unidirectional "
+        "Sequence LSTM Op only supports int16 cell state",
+        TfLiteTypeGetName(cell_state_type), cell_state_type);
+    return kTfLiteError;
+  }
+  // Request buffers (four buffers).
+  for (size_t i = 0; i < 4; i++) {
+    TF_LITE_ENSURE_OK(context, context->RequestScratchBufferInArena(
+                                   context,
+                                   op_data_lstm->size_info.batch_size *
+                                       op_data_lstm->size_info.state_dimension *
+                                       TfLiteTypeGetSize(cell_state_type),
+                                   &(op_data_lstm->buffer_indices[i])));
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus UnidirectionalSequenceLstmEval(TfLiteContext* context,
+                                            TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData& op_data = *reinterpret_cast<const OpData*>(node->user_data);
+  const OpDataLSTM& op_data_lstm = op_data.params_ref;
+
+  auto kernel_content = CreateLSTMKernelContent(context, node);
+
+  const auto activation_type =
+      kernel_content.internal_tensors[kLstmInputTensor]->type;
+  const auto weight_type =
+      kernel_content.internal_tensors[kLstmInputToInputWeightsTensor]->type;
+
+  switch (activation_type) {
+    case kTfLiteFloat32: {
+      LSTMBuffers<float> buffers =
+          CreateLSTMBuffers<float>(context, op_data_lstm.buffer_indices);
+      EvalLstm<float, float, float, float>(op_data_lstm, kernel_content,
+                                           buffers);
+      break;
+    }
+    case kTfLiteInt8: {
+      switch (weight_type) {
+        case kTfLiteInt8: {
+          // 8(activation)x8(weight)->16(cell) LSTM with 32 bits bias
+          LSTMBuffers<int16_t> buffers =
+              CreateLSTMBuffers<int16_t>(context, op_data_lstm.buffer_indices);
+          return CMSIS_NN_EvalInteger8x8_16Lstm<int16_t>(
+              op_data, kernel_content, buffers);
+          break;
+        }
+        default: {
+          MicroPrintf("Filter type %s (%d) not supported.",
+                      TfLiteTypeGetName(weight_type), weight_type);
+          return kTfLiteError;
+        }
+      }
+      break;
+    }
+    case kTfLiteInt16: {
+      switch (weight_type) {
+        case kTfLiteInt8: {
+          // 16(activation)x8(weight)->16(cell) LSTM with 64 bits bias
+          LSTMBuffers<int16_t> buffers =
+              CreateLSTMBuffers<int16_t>(context, op_data_lstm.buffer_indices);
+          EvalLstm<int16_t, int8_t, int16_t, int64_t>(op_data_lstm,
+                                                      kernel_content, buffers);
+          break;
+        }
+        default: {
+          MicroPrintf("Filter type %s (%d) not supported.",
+                      TfLiteTypeGetName(weight_type), weight_type);
+          return kTfLiteError;
+        }
+      }
+      break;
+    }
+    default: {
+      MicroPrintf("Input type %s (%d) not supported.",
+                  TfLiteTypeGetName(activation_type), activation_type);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus UnidirectionalSequenceLstmEvalInt8(TfLiteContext* context,
+                                                TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData& op_data = *reinterpret_cast<const OpData*>(node->user_data);
+  const OpDataLSTM& op_data_lstm = op_data.params_ref;
+  auto kernel_content = CreateLSTMKernelContent(context, node);
+  const auto activation_type =
+      kernel_content.internal_tensors[kLstmInputTensor]->type;
+  const auto weight_type =
+      kernel_content.internal_tensors[kLstmInputToInputWeightsTensor]->type;
+
+  TFLITE_DCHECK(weight_type == kTfLiteInt8 &&
+                "Only int8 filter type supported.");
+
+  if (activation_type == kTfLiteInt8) {
+    LSTMBuffers<int16_t> buffers =
+        CreateLSTMBuffers<int16_t>(context, op_data_lstm.buffer_indices);
+
+    return CMSIS_NN_EvalInteger8x8_16Lstm<int16_t>(op_data, kernel_content,
+                                                   buffers);
+  } else {
+    MicroPrintf("Input type %s (%d) not supported.",
+                TfLiteTypeGetName(activation_type), activation_type);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM() {
+  return tflite::micro::RegisterOp(UnidirectionalSequenceLstmInit,
+                                   UnidirectionalSequenceLstmPrepare,
+                                   UnidirectionalSequenceLstmEval);
+}
+
+TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM_INT8() {
+  return tflite::micro::RegisterOp(UnidirectionalSequenceLstmInit,
+                                   UnidirectionalSequenceLstmPrepare,
+                                   UnidirectionalSequenceLstmEvalInt8);
+}
+
+}  // namespace tflite
diff --git a/src/tensorflow/lite/micro/kernels/comparisons.cpp b/src/tensorflow/lite/micro/kernels/comparisons.cpp
index 31ab9259..76a820a8 100644
--- a/src/tensorflow/lite/micro/kernels/comparisons.cpp
+++ b/src/tensorflow/lite/micro/kernels/comparisons.cpp
@@ -579,27 +579,27 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 }  // namespace
 
-TfLiteRegistration Register_EQUAL() {
+TfLiteRegistration_V1 Register_EQUAL() {
   return tflite::micro::RegisterOp(Init, Prepare, EqualEval);
 }
 
-TfLiteRegistration Register_NOT_EQUAL() {
+TfLiteRegistration_V1 Register_NOT_EQUAL() {
   return tflite::micro::RegisterOp(Init, Prepare, NotEqualEval);
 }
 
-TfLiteRegistration Register_GREATER() {
+TfLiteRegistration_V1 Register_GREATER() {
   return tflite::micro::RegisterOp(Init, Prepare, GreaterEval);
 }
 
-TfLiteRegistration Register_GREATER_EQUAL() {
+TfLiteRegistration_V1 Register_GREATER_EQUAL() {
   return tflite::micro::RegisterOp(Init, Prepare, GreaterEqualEval);
 }
 
-TfLiteRegistration Register_LESS() {
+TfLiteRegistration_V1 Register_LESS() {
   return tflite::micro::RegisterOp(Init, Prepare, LessEval);
 }
 
-TfLiteRegistration Register_LESS_EQUAL() {
+TfLiteRegistration_V1 Register_LESS_EQUAL() {
   return tflite::micro::RegisterOp(Init, Prepare, LessEqualEval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/concatenation.cpp b/src/tensorflow/lite/micro/kernels/concatenation.cpp
index 59157564..4e1a7968 100644
--- a/src/tensorflow/lite/micro/kernels/concatenation.cpp
+++ b/src/tensorflow/lite/micro/kernels/concatenation.cpp
@@ -18,7 +18,6 @@ limitations under the License.
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/internal/portable_tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -252,7 +251,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_CONCATENATION() { +TfLiteRegistration_V1 Register_CONCATENATION() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/conv.h b/src/tensorflow/lite/micro/kernels/conv.h index d50ddc6f..5ad35bca 100644 --- a/src/tensorflow/lite/micro/kernels/conv.h +++ b/src/tensorflow/lite/micro/kernels/conv.h @@ -76,37 +76,39 @@ TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node, TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_CONV_2D(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_CONV_2D(); #if defined(XTENSA) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 activations and int8 weights and always calls the reference // implementation. -TfLiteRegistration Register_CONV_2D_INT8REF(); +TfLiteRegistration_V1 Register_CONV_2D_INT8REF(); #else -inline TfLiteRegistration Register_CONV_2D_INT8REF() { +inline TfLiteRegistration_V1 Register_CONV_2D_INT8REF() { return Register_CONV_2D(); } #endif #if defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 activations and int8 weights and uses the latency optimized // implementations. -TfLiteRegistration Register_CONV_2D_INT8(); +TfLiteRegistration_V1 Register_CONV_2D_INT8(); -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int16 activations and int8 weights and uses the latency optimized // implementations. 
-TfLiteRegistration Register_CONV_2D_INT16(); +TfLiteRegistration_V1 Register_CONV_2D_INT16(); #else -inline TfLiteRegistration Register_CONV_2D_INT8() { return Register_CONV_2D(); } +inline TfLiteRegistration_V1 Register_CONV_2D_INT8() { + return Register_CONV_2D(); +} -inline TfLiteRegistration Register_CONV_2D_INT16() { +inline TfLiteRegistration_V1 Register_CONV_2D_INT16() { return Register_CONV_2D(); } #endif diff --git a/src/tensorflow/lite/micro/kernels/conv_test.h b/src/tensorflow/lite/micro/kernels/conv_test.h index aa7ea443..5ea0261e 100644 --- a/src/tensorflow/lite/micro/kernels/conv_test.h +++ b/src/tensorflow/lite/micro/kernels/conv_test.h @@ -28,35 +28,37 @@ namespace testing { TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, float* output_data); + TfLiteRegistration_V1 registration, float* output_data); TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, int8_t* output_data); + TfLiteRegistration_V1 registration, + int8_t* output_data); TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, uint8_t* output_data); + TfLiteRegistration_V1 registration, + uint8_t* output_data); TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, const float* expected_output_data, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, + TfLiteRegistration_V1 registration, float* output_data, float tolerance = 1e-5); TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, const int8_t* expected_output_data, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, + TfLiteRegistration_V1 registration, int8_t* output_data, float tolerance = 1e-5); TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, const uint8_t* expected_output_data, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, + TfLiteRegistration_V1 registration, uint8_t* output_data, float tolerance = 1e-5); TfLiteStatus TestConvFloat(int* input_dims_data, const float* input_data, @@ -65,7 +67,8 @@ TfLiteStatus TestConvFloat(int* input_dims_data, const float* input_data, int* output_dims_data, const float* expected_output_data, TfLiteConvParams* conv_params, - TfLiteRegistration registration, float* output_data); + TfLiteRegistration_V1 registration, + float* output_data); TfLiteStatus TestConvQuantizedPerLayer( int* input_dims_data, const float* input_data, uint8_t* input_quantized, @@ -74,7 +77,7 @@ TfLiteStatus TestConvQuantizedPerLayer( const float* bias_data, int32_t* bias_quantized, int* output_dims_data, const float* expected_output_data, uint8_t* expected_output_quantized, float output_scale, TfLiteConvParams* conv_params, - TfLiteRegistration registration, uint8_t* output_data); + TfLiteRegistration_V1 registration, uint8_t* output_data); TfLiteStatus TestConvQuantizedPerChannel( int* input_dims_data, const float* input_data, int8_t* input_quantized, @@ -84,7 +87,7 @@ TfLiteStatus TestConvQuantizedPerChannel( float* bias_scales, int* bias_zero_points, int* output_dims_data, const float* expected_output_data, int8_t* expected_output_data_quantized, float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - TfLiteRegistration registration, int8_t* output_data, + 
TfLiteRegistration_V1 registration, int8_t* output_data, TfLiteType tensor_weight_type = kTfLiteNoType); TfLiteStatus TestConvQuantizedPerChannel( @@ -96,7 +99,7 @@ TfLiteStatus TestConvQuantizedPerChannel( int* bias_zero_points, int* output_dims_data, const float* expected_output_data, int16_t* expected_output_data_quantized, float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - TfLiteRegistration registration, int16_t* output_data); + TfLiteRegistration_V1 registration, int16_t* output_data); TfLiteStatus TestConvQuantizedPerChannel( int* input_dims_data, const float* input_data, int16_t* input_quantized, @@ -106,7 +109,7 @@ TfLiteStatus TestConvQuantizedPerChannel( float* bias_scales, int* bias_zero_points, int* output_dims_data, const float* expected_output_data, int16_t* expected_output_data_quantized, float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - TfLiteRegistration registration, int16_t* output_data); + TfLiteRegistration_V1 registration, int16_t* output_data); } // namespace testing } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/cumsum.cpp b/src/tensorflow/lite/micro/kernels/cumsum.cpp index 4f8a9659..1b005e6a 100644 --- a/src/tensorflow/lite/micro/kernels/cumsum.cpp +++ b/src/tensorflow/lite/micro/kernels/cumsum.cpp @@ -168,7 +168,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_CUMSUM() { +TfLiteRegistration_V1 Register_CUMSUM() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/depth_to_space.cpp b/src/tensorflow/lite/micro/kernels/depth_to_space.cpp index 7f229fbf..932e295c 100644 --- a/src/tensorflow/lite/micro/kernels/depth_to_space.cpp +++ b/src/tensorflow/lite/micro/kernels/depth_to_space.cpp @@ -135,7 +135,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_DEPTH_TO_SPACE() { +TfLiteRegistration_V1 Register_DEPTH_TO_SPACE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/depthwise_conv.h b/src/tensorflow/lite/micro/kernels/depthwise_conv.h index 589a02a5..72d4bf02 100644 --- a/src/tensorflow/lite/micro/kernels/depthwise_conv.h +++ b/src/tensorflow/lite/micro/kernels/depthwise_conv.h @@ -49,28 +49,28 @@ TfLiteStatus CalculateOpDataDepthwiseConv( TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_DEPTHWISE_CONV_2D(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D(); #if defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 activations and int8 weights and uses the latency optimized // implementations. 
-TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT8(); +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT8(); -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int16 activations and int8 weights and uses the latency optimized // implementations. -TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT16(); +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT16(); #else -inline TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT8() { +inline TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT8() { return Register_DEPTHWISE_CONV_2D(); } -inline TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT16() { +inline TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT16() { return Register_DEPTHWISE_CONV_2D(); } #endif diff --git a/src/tensorflow/lite/micro/kernels/depthwise_conv_common.cpp b/src/tensorflow/lite/micro/kernels/depthwise_conv_common.cpp index 2a0ae2f4..6d5f6c27 100644 --- a/src/tensorflow/lite/micro/kernels/depthwise_conv_common.cpp +++ b/src/tensorflow/lite/micro/kernels/depthwise_conv_common.cpp @@ -188,6 +188,13 @@ TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node) { affine_quantization->zero_point->size); } + TF_LITE_ENSURE_MSG( + context, + input->type == filter->type || + (input->type == kTfLiteInt8 && + (filter->type == kTfLiteInt4 || filter->type == kTfLiteInt8)), + "Hybrid models are not supported on TFLite Micro."); + if (filter->type == kTfLiteInt4) { int filter_size = RuntimeShape(filter->dims->size, diff --git a/src/tensorflow/lite/micro/kernels/dequantize.cpp b/src/tensorflow/lite/micro/kernels/dequantize.cpp index f51db508..1a62176f 100644 --- a/src/tensorflow/lite/micro/kernels/dequantize.cpp +++ b/src/tensorflow/lite/micro/kernels/dequantize.cpp @@ -80,7 +80,7 @@ TfLiteStatus DequantizeEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration Register_DEQUANTIZE() { +TfLiteRegistration_V1 Register_DEQUANTIZE() { return tflite::micro::RegisterOp(DequantizeInit, DequantizePrepare, DequantizeEval); } diff --git a/src/tensorflow/lite/micro/kernels/detection_postprocess.cpp b/src/tensorflow/lite/micro/kernels/detection_postprocess.cpp index 7aadbbf8..3a750549 100644 --- a/src/tensorflow/lite/micro/kernels/detection_postprocess.cpp +++ b/src/tensorflow/lite/micro/kernels/detection_postprocess.cpp @@ -799,8 +799,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration* Register_DETECTION_POSTPROCESS() { - static TfLiteRegistration r = tflite::micro::RegisterOp(Init, Prepare, Eval); +TfLiteRegistration_V1* Register_DETECTION_POSTPROCESS() { + static TfLiteRegistration_V1 r = + tflite::micro::RegisterOp(Init, Prepare, Eval); return &r; } diff --git a/src/tensorflow/lite/micro/kernels/div.cpp b/src/tensorflow/lite/micro/kernels/div.cpp index 5c986126..8771ebc0 100644 --- a/src/tensorflow/lite/micro/kernels/div.cpp +++ b/src/tensorflow/lite/micro/kernels/div.cpp @@ -201,7 +201,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_DIV() { +TfLiteRegistration_V1 Register_DIV() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/elementwise.cpp b/src/tensorflow/lite/micro/kernels/elementwise.cpp index 81b27039..1f3b5ecb 100644 --- a/src/tensorflow/lite/micro/kernels/elementwise.cpp +++ b/src/tensorflow/lite/micro/kernels/elementwise.cpp @@ -1,4 +1,4 @@ -/* Copyright 
2022 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,9 +25,6 @@ limitations under the License.
 #include "tensorflow/lite/micro/micro_utils.h"
 
 namespace tflite {
-namespace ops {
-namespace micro {
-namespace elementwise {
 namespace {
 
 constexpr int kAbsNameId = 0;
@@ -351,9 +348,11 @@ TfLiteStatus RsqrtEval(TfLiteContext* context, TfLiteNode* node) {
           context, node, [](float f) { return 1.f / std::sqrt(f); },
           /*validate_input_func=*/nullptr, type);
     case kTfLiteInt8:
-      return EvalImplQuantized<int8_t>(context, node,
-                                       elementwise::RsqrtEvalQuantized,
-                                       elementwise::validate_input_func, type);
+      return EvalImplQuantized<int8_t>(context, node, RsqrtEvalQuantized,
+                                       validate_input_func, type);
+    case kTfLiteInt16:
+      return EvalImplQuantized<int16_t>(context, node, RsqrtEvalQuantized,
+                                        validate_input_func, type);
 
     default:
       MicroPrintf("Current data type %s is not supported.",
@@ -371,60 +370,47 @@ TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 }  // namespace
-}  // namespace elementwise
 
-TfLiteRegistration Register_ABS() {
+TfLiteRegistration_V1 Register_ABS() {
   return tflite::micro::RegisterOp(
-      elementwise::ElementWiseAbsRsqrtInit,
-      elementwise::PrepareAbsRsqrt,
-      elementwise::AbsEval);
+      ElementWiseAbsRsqrtInit, PrepareAbsRsqrt,
+      AbsEval);
 }
 
-TfLiteRegistration Register_SIN() {
+TfLiteRegistration_V1 Register_SIN() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::SinEval);
+      nullptr, GenericPrepare, SinEval);
 }
 
-TfLiteRegistration Register_COS() {
+TfLiteRegistration_V1 Register_COS() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::CosEval);
+      nullptr, GenericPrepare, CosEval);
 }
 
-TfLiteRegistration Register_LOG() {
+TfLiteRegistration_V1 Register_LOG() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::LogEval);
+      nullptr, GenericPrepare, LogEval);
 }
 
-TfLiteRegistration Register_SQRT() {
+TfLiteRegistration_V1 Register_SQRT() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::SqrtEval);
+      nullptr, GenericPrepare, SqrtEval);
 }
 
-TfLiteRegistration Register_RSQRT() {
+TfLiteRegistration_V1 Register_RSQRT() {
   return tflite::micro::RegisterOp(
-      elementwise::ElementWiseAbsRsqrtInit,
-      elementwise::PrepareAbsRsqrt,
-      elementwise::RsqrtEval);
+      ElementWiseAbsRsqrtInit,
+      PrepareAbsRsqrt, RsqrtEval);
 }
 
-TfLiteRegistration Register_SQUARE() {
+TfLiteRegistration_V1 Register_SQUARE() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::SquareEval);
+      nullptr, GenericPrepare, SquareEval);
 }
 
-TfLiteRegistration Register_LOGICAL_NOT() {
+TfLiteRegistration_V1 Register_LOGICAL_NOT() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::LogicalNotEval);
+      nullptr, GenericPrepare, LogicalNotEval);
 }
 
-}  // namespace micro
-}  // namespace ops
 }  // namespace tflite
 
diff --git a/src/tensorflow/lite/micro/kernels/elu.cpp b/src/tensorflow/lite/micro/kernels/elu.cpp
index c4786d6f..482baed2 100644
--- a/src/tensorflow/lite/micro/kernels/elu.cpp
+++ b/src/tensorflow/lite/micro/kernels/elu.cpp
@@ -144,7 +144,7 @@ TfLiteStatus EluEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 }  // namespace
 
-TfLiteRegistration Register_ELU() {
+TfLiteRegistration_V1 Register_ELU() {
   return
tflite::micro::RegisterOp(EluInit, EluPrepare, EluEval); } diff --git a/src/tensorflow/lite/micro/kernels/ethosu.cpp b/src/tensorflow/lite/micro/kernels/ethosu.cpp index c305121e..1b792fb8 100644 --- a/src/tensorflow/lite/micro/kernels/ethosu.cpp +++ b/src/tensorflow/lite/micro/kernels/ethosu.cpp @@ -20,7 +20,7 @@ limitations under the License. namespace tflite { -TfLiteRegistration* Register_ETHOSU() { return nullptr; } +TfLiteRegistration_V1* Register_ETHOSU() { return nullptr; } const char* GetString_ETHOSU() { return ""; } diff --git a/src/tensorflow/lite/micro/kernels/ethosu.h b/src/tensorflow/lite/micro/kernels/ethosu.h index cfbb0d3f..93ef1d5b 100644 --- a/src/tensorflow/lite/micro/kernels/ethosu.h +++ b/src/tensorflow/lite/micro/kernels/ethosu.h @@ -19,7 +19,7 @@ limitations under the License. namespace tflite { -TfLiteRegistration* Register_ETHOSU(); +TfLiteRegistration_V1* Register_ETHOSU(); const char* GetString_ETHOSU(); diff --git a/src/tensorflow/lite/micro/kernels/exp.cpp b/src/tensorflow/lite/micro/kernels/exp.cpp index a835ee0a..44a39f45 100644 --- a/src/tensorflow/lite/micro/kernels/exp.cpp +++ b/src/tensorflow/lite/micro/kernels/exp.cpp @@ -72,7 +72,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_EXP() { +TfLiteRegistration_V1 Register_EXP() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/expand_dims.cpp b/src/tensorflow/lite/micro/kernels/expand_dims.cpp index ad45dd88..4c98ef9d 100644 --- a/src/tensorflow/lite/micro/kernels/expand_dims.cpp +++ b/src/tensorflow/lite/micro/kernels/expand_dims.cpp @@ -142,7 +142,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_EXPAND_DIMS() { +TfLiteRegistration_V1 Register_EXPAND_DIMS() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/fill.cpp b/src/tensorflow/lite/micro/kernels/fill.cpp index 6a3f4998..a759a0fe 100644 --- a/src/tensorflow/lite/micro/kernels/fill.cpp +++ b/src/tensorflow/lite/micro/kernels/fill.cpp @@ -133,7 +133,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FILL() { +TfLiteRegistration_V1 Register_FILL() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/floor.cpp b/src/tensorflow/lite/micro/kernels/floor.cpp index 207b5c4b..bf6404c3 100644 --- a/src/tensorflow/lite/micro/kernels/floor.cpp +++ b/src/tensorflow/lite/micro/kernels/floor.cpp @@ -41,7 +41,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FLOOR() { +TfLiteRegistration_V1 Register_FLOOR() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/floor_div.cpp b/src/tensorflow/lite/micro/kernels/floor_div.cpp index f143d28a..d70080e7 100644 --- a/src/tensorflow/lite/micro/kernels/floor_div.cpp +++ b/src/tensorflow/lite/micro/kernels/floor_div.cpp @@ -123,7 +123,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FLOOR_DIV() { +TfLiteRegistration_V1 Register_FLOOR_DIV() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/floor_mod.cpp b/src/tensorflow/lite/micro/kernels/floor_mod.cpp index 939a4dd7..aa53b157 100644 --- a/src/tensorflow/lite/micro/kernels/floor_mod.cpp +++ 
b/src/tensorflow/lite/micro/kernels/floor_mod.cpp @@ -121,7 +121,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FLOOR_MOD() { +TfLiteRegistration_V1 Register_FLOOR_MOD() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/fully_connected.h b/src/tensorflow/lite/micro/kernels/fully_connected.h index 3ecf27e4..cbeaf4c5 100644 --- a/src/tensorflow/lite/micro/kernels/fully_connected.h +++ b/src/tensorflow/lite/micro/kernels/fully_connected.h @@ -68,15 +68,15 @@ TfLiteStatus CalculateOpDataFullyConnected( TfLiteType data_type, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output, OpDataFullyConnected* data); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_FULLY_CONNECTED(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_FULLY_CONNECTED(); #if defined(ARDUINO) || defined(HEXAGON) || defined(XTENSA) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8. -TfLiteRegistration Register_FULLY_CONNECTED_INT8(); +TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT8(); #else // Note that while this block gets used for both reference and optimized kernels @@ -84,16 +84,16 @@ TfLiteRegistration Register_FULLY_CONNECTED_INT8(); // define fallback implementation that allow reference kernels to still be used // from applications that call a more specific kernel variant. -inline TfLiteRegistration Register_FULLY_CONNECTED_INT8() { +inline TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT8() { return Register_FULLY_CONNECTED(); } #endif #if defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int16. -TfLiteRegistration Register_FULLY_CONNECTED_INT16(); +TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT16(); #else // Note that while this block gets used for both reference and optimized kernels @@ -101,7 +101,7 @@ TfLiteRegistration Register_FULLY_CONNECTED_INT16(); // define fallback implementation that allow reference kernels to still be used // from applications that call a more specific kernel variant. 
-inline TfLiteRegistration Register_FULLY_CONNECTED_INT16() { +inline TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT16() { return Register_FULLY_CONNECTED(); } diff --git a/src/tensorflow/lite/micro/kernels/gather.cpp b/src/tensorflow/lite/micro/kernels/gather.cpp index 4ec53473..9c858957 100644 --- a/src/tensorflow/lite/micro/kernels/gather.cpp +++ b/src/tensorflow/lite/micro/kernels/gather.cpp @@ -217,7 +217,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_GATHER() { +TfLiteRegistration_V1 Register_GATHER() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/gather_nd.cpp b/src/tensorflow/lite/micro/kernels/gather_nd.cpp index 3a02e815..27307d1a 100644 --- a/src/tensorflow/lite/micro/kernels/gather_nd.cpp +++ b/src/tensorflow/lite/micro/kernels/gather_nd.cpp @@ -205,7 +205,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_GATHER_ND() { +TfLiteRegistration_V1 Register_GATHER_ND() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/hard_swish.cpp b/src/tensorflow/lite/micro/kernels/hard_swish.cpp index a0b3f7c6..8e3a9cde 100644 --- a/src/tensorflow/lite/micro/kernels/hard_swish.cpp +++ b/src/tensorflow/lite/micro/kernels/hard_swish.cpp @@ -67,7 +67,7 @@ TfLiteStatus HardSwishEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_HARD_SWISH() { +TfLiteRegistration_V1 Register_HARD_SWISH() { return tflite::micro::RegisterOp(HardSwishInit, tflite::HardSwishPrepare, HardSwishEval); } diff --git a/src/tensorflow/lite/micro/kernels/if.cpp b/src/tensorflow/lite/micro/kernels/if.cpp index 39eca8b4..a23bfc53 100644 --- a/src/tensorflow/lite/micro/kernels/if.cpp +++ b/src/tensorflow/lite/micro/kernels/if.cpp @@ -114,7 +114,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
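Nearly every hunk in this sync rewrites the same registration idiom, so one concrete instance may help: tflite::micro::RegisterOp takes optional init/prepare function pointers and a mandatory invoke, and fills in the remaining TfLiteRegistration_V1 fields with defaults (see the kernel_util.cpp hunk further down). A sketch with a hypothetical pass-through op — all names here are invented for illustration:

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace {
// Hypothetical kernel hooks: no init needed, Prepare and Eval just succeed.
TfLiteStatus PassThroughPrepare(TfLiteContext* context, TfLiteNode* node) {
  return kTfLiteOk;
}
TfLiteStatus PassThroughEval(TfLiteContext* context, TfLiteNode* node) {
  return kTfLiteOk;
}
}  // namespace

// init may be nullptr, exactly as in the Register_EXP/Register_FILL hunks above.
TfLiteRegistration_V1 Register_PASS_THROUGH() {
  return tflite::micro::RegisterOp(nullptr, PassThroughPrepare, PassThroughEval);
}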
-TfLiteRegistration Register_IF() { +TfLiteRegistration_V1 Register_IF() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/kernel_runner.cpp b/src/tensorflow/lite/micro/kernels/kernel_runner.cpp index 070f32a5..6ec2e350 100644 --- a/src/tensorflow/lite/micro/kernels/kernel_runner.cpp +++ b/src/tensorflow/lite/micro/kernels/kernel_runner.cpp @@ -34,7 +34,7 @@ void ClearBufferApi(TfLiteContext* context_) { context_->RequestScratchBufferInArena = nullptr; } -KernelRunner::KernelRunner(const TfLiteRegistration& registration, +KernelRunner::KernelRunner(const TfLiteRegistration_V1& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, TfLiteIntArray* outputs, void* builtin_data, TfLiteIntArray* intermediates) @@ -94,7 +94,7 @@ TfLiteStatus KernelRunner::Invoke() { context_.GetScratchBuffer = MicroContextGetScratchBuffer; if (registration_.invoke == nullptr) { - MicroPrintf("TfLiteRegistration missing invoke function pointer!"); + MicroPrintf("TfLiteRegistration_V1 missing invoke function pointer!"); return kTfLiteError; } @@ -110,7 +110,7 @@ TfLiteStatus KernelRunner::Free() { context_.GetScratchBuffer = MicroContextGetScratchBuffer; if (registration_.free == nullptr) { - MicroPrintf("TfLiteRegistration missing free function pointer!"); + MicroPrintf("TfLiteRegistration_V1 missing free function pointer!"); return kTfLiteError; } diff --git a/src/tensorflow/lite/micro/kernels/kernel_runner.h b/src/tensorflow/lite/micro/kernels/kernel_runner.h index c7d53c3a..64eac8a6 100644 --- a/src/tensorflow/lite/micro/kernels/kernel_runner.h +++ b/src/tensorflow/lite/micro/kernels/kernel_runner.h @@ -25,7 +25,7 @@ limitations under the License. namespace tflite { namespace micro { -// Helper class to perform a simulated kernel (i.e. TfLiteRegistration) +// Helper class to perform a simulated kernel (i.e. TfLiteRegistration_V1) // lifecycle (init, prepare, invoke). All internal allocations are handled by // this class. Simply pass in the registration, list of required tensors, inputs // array, outputs array, and any pre-builtin data. Calling Invoke() will @@ -33,22 +33,22 @@ namespace micro { // output provided during construction. class KernelRunner { public: - KernelRunner(const TfLiteRegistration& registration, TfLiteTensor* tensors, + KernelRunner(const TfLiteRegistration_V1& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, TfLiteIntArray* outputs, void* builtin_data, TfLiteIntArray* intermediates = nullptr); - // Calls init and prepare on the kernel (i.e. TfLiteRegistration) struct. Any - // exceptions will be DebugLog'd and returned as a status code. + // Calls init and prepare on the kernel (i.e. TfLiteRegistration_V1) struct. + // Any exceptions will be DebugLog'd and returned as a status code. TfLiteStatus InitAndPrepare(const char* init_data = nullptr, size_t length = 0); - // Calls init, prepare, and invoke on a given TfLiteRegistration pointer. - // After successful invoke, results will be available in the output tensor as - // passed into the constructor of this class. + // Calls invoke on a given TfLiteRegistration_V1 pointer. After successful + // invoke, results will be available in the output tensor as passed into the + // constructor of this class. TfLiteStatus Invoke(); - // Calls Free on a given TfLiteRegistration pointer(if it's implemented). + // Calls Free on a given TfLiteRegistration_V1 pointer(if it's implemented). // After successful Free, kTfLiteOk status will be returned. 
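The KernelRunner hunks here only touch the registration type, but the lifecycle the class simulates is easy to misread from the diff alone. Assuming tensors and the inputs/outputs arrays are prepared the way the existing kernel tests do (that setup is elided here), usage looks roughly like:

// Sketch in the style of the micro kernel tests: construct the runner around
// a registration, run init+prepare, then invoke; results land in the output
// tensor passed in at construction time.
tflite::micro::KernelRunner runner(registration, tensors, tensors_size,
                                   inputs_array, outputs_array,
                                   /*builtin_data=*/nullptr);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, runner.InitAndPrepare());
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, runner.Invoke());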
If Free is not // implemented for a given kernel kTfLiteError will be returned. TfLiteStatus Free(); @@ -68,7 +68,7 @@ class KernelRunner { TfLiteContext context_ = {}; TfLiteNode node_ = {}; - const TfLiteRegistration& registration_; + const TfLiteRegistration_V1& registration_; SingleArenaBufferAllocator* allocator_; MockMicroGraph mock_micro_graph_; diff --git a/src/tensorflow/lite/micro/kernels/kernel_util.cpp b/src/tensorflow/lite/micro/kernels/kernel_util.cpp index 76031b87..6d766672 100644 --- a/src/tensorflow/lite/micro/kernels/kernel_util.cpp +++ b/src/tensorflow/lite/micro/kernels/kernel_util.cpp @@ -38,7 +38,7 @@ int ValidateTensorIndexing(const TfLiteContext* context, int index, } // namespace -TfLiteRegistration RegisterOp( +TfLiteRegistration_V1 RegisterOp( void* (*init)(TfLiteContext* context, const char* buffer, size_t length), TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node), TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node), @@ -50,8 +50,7 @@ TfLiteRegistration RegisterOp( /*profiling_string=*/nullptr, /*builtin_code=*/0, /*custom_name=*/nullptr, - /*version=*/0, - /*registration_external=*/nullptr}; + /*version=*/0}; } // Returns a mutable tensor for a given input index. is_variable must be checked diff --git a/src/tensorflow/lite/micro/kernels/kernel_util.h b/src/tensorflow/lite/micro/kernels/kernel_util.h index f30ae44c..191ab2db 100644 --- a/src/tensorflow/lite/micro/kernels/kernel_util.h +++ b/src/tensorflow/lite/micro/kernels/kernel_util.h @@ -28,7 +28,7 @@ limitations under the License. namespace tflite { namespace micro { -TfLiteRegistration RegisterOp( +TfLiteRegistration_V1 RegisterOp( void* (*init)(TfLiteContext* context, const char* buffer, size_t length), TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node), TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node), diff --git a/src/tensorflow/lite/micro/kernels/l2_pool_2d.cpp b/src/tensorflow/lite/micro/kernels/l2_pool_2d.cpp index d4225e46..794f2b67 100644 --- a/src/tensorflow/lite/micro/kernels/l2_pool_2d.cpp +++ b/src/tensorflow/lite/micro/kernels/l2_pool_2d.cpp @@ -135,7 +135,7 @@ TfLiteStatus L2Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_L2_POOL_2D() { +TfLiteRegistration_V1 Register_L2_POOL_2D() { return tflite::micro::RegisterOp(nullptr, L2Prepare, L2Eval); } diff --git a/src/tensorflow/lite/micro/kernels/l2norm.cpp b/src/tensorflow/lite/micro/kernels/l2norm.cpp index 97f372aa..6dbf93c7 100644 --- a/src/tensorflow/lite/micro/kernels/l2norm.cpp +++ b/src/tensorflow/lite/micro/kernels/l2norm.cpp @@ -14,7 +14,6 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/internal/portable_tensor.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h" #include "tensorflow/lite/kernels/internal/reference/l2normalization.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -132,10 +131,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_L2NORM_REF() { +TfLiteRegistration_V1 Register_L2NORM_REF() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } -TfLiteRegistration Register_L2_NORMALIZATION() { return Register_L2NORM_REF(); } +TfLiteRegistration_V1 Register_L2_NORMALIZATION() { + return Register_L2NORM_REF(); +} } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/leaky_relu.cpp b/src/tensorflow/lite/micro/kernels/leaky_relu.cpp index 7b51ebcb..1873e3cc 100644 --- a/src/tensorflow/lite/micro/kernels/leaky_relu.cpp +++ b/src/tensorflow/lite/micro/kernels/leaky_relu.cpp @@ -87,7 +87,7 @@ TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } -TfLiteRegistration Register_LEAKY_RELU() { +TfLiteRegistration_V1 Register_LEAKY_RELU() { return tflite::micro::RegisterOp(LeakyReluInit, LeakyReluPrepare, LeakyReluEval); } diff --git a/src/tensorflow/lite/micro/kernels/log_softmax.cpp b/src/tensorflow/lite/micro/kernels/log_softmax.cpp index 0b1838c3..1ce04c65 100644 --- a/src/tensorflow/lite/micro/kernels/log_softmax.cpp +++ b/src/tensorflow/lite/micro/kernels/log_softmax.cpp @@ -141,7 +141,7 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_LOG_SOFTMAX() { +TfLiteRegistration_V1 Register_LOG_SOFTMAX() { return tflite::micro::RegisterOp(nullptr, LogSoftmaxPrepare, LogSoftmaxEval); } diff --git a/src/tensorflow/lite/micro/kernels/logical.cpp b/src/tensorflow/lite/micro/kernels/logical.cpp index c85e0c5b..415c85c5 100644 --- a/src/tensorflow/lite/micro/kernels/logical.cpp +++ b/src/tensorflow/lite/micro/kernels/logical.cpp @@ -33,11 +33,11 @@ TfLiteStatus LogicalAndEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_LOGICAL_OR() { +TfLiteRegistration_V1 Register_LOGICAL_OR() { return tflite::micro::RegisterOp(nullptr, nullptr, LogicalOrEval); } -TfLiteRegistration Register_LOGICAL_AND() { +TfLiteRegistration_V1 Register_LOGICAL_AND() { return tflite::micro::RegisterOp(nullptr, nullptr, LogicalAndEval); } diff --git a/src/tensorflow/lite/micro/kernels/logistic.cpp b/src/tensorflow/lite/micro/kernels/logistic.cpp index 108206ad..f968771c 100644 --- a/src/tensorflow/lite/micro/kernels/logistic.cpp +++ b/src/tensorflow/lite/micro/kernels/logistic.cpp @@ -105,7 +105,7 @@ TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_LOGISTIC() { +TfLiteRegistration_V1 Register_LOGISTIC() { return tflite::micro::RegisterOp(LogisticInit, LogisticPrepare, LogisticEval); } } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/lstm_eval.cpp b/src/tensorflow/lite/micro/kernels/lstm_eval.cpp index 4666e908..93d6bc7e 100644 --- a/src/tensorflow/lite/micro/kernels/lstm_eval.cpp +++ b/src/tensorflow/lite/micro/kernels/lstm_eval.cpp @@ -27,6 +27,81 @@ limitations under the License. 
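Before the LSTM hunks that follow, a compact map of the 24-tensor layout they validate may save a trip to lstm_shared.h. This is a reading of the checks below against the upstream header, not new behavior:

// Index layout assumed by ValidateTensorStatus (per lstm_shared.h):
//   0        input
//   1..8     input-to-gate and recurrent-to-gate weights (one shared type)
//   9..11    peephole weights        -> must be absent (unsupported variant)
//   12..15   the four gate biases    (one shared type)
//   16..17   projection weight/bias  -> must be absent
//   18       output (hidden) state   -> must be a variable tensor
//   19       cell state              -> must be a variable tensor
//   20..23   layer-norm coefficients -> must be absent
constexpr int kLstmInternalTensorCount = 24;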
#include "tensorflow/lite/kernels/internal/types.h" namespace tflite { + +LstmTensors::LstmTensors(TfLiteContext* context, TfLiteNode* node) { + micro_context_ = GetMicroContext(context); + // 24 internal tensors. see lstm_shared.h for tensor names + for (size_t i = 0; i < 24; i++) { + internal_tensors_[i] = micro_context_->AllocateTempInputTensor(node, i); + } + output_tensor_ = + micro_context_->AllocateTempOutputTensor(node, kLstmOutputTensor); +} + +LstmTensors::~LstmTensors() { + for (size_t i = 0; i < 24; i++) { + if (internal_tensors_[i] != nullptr) { + micro_context_->DeallocateTempTfLiteTensor(internal_tensors_[i]); + } + } + micro_context_->DeallocateTempTfLiteTensor(output_tensor_); +} + +// Verify the LSTM internal tensor properties (e.g., type checks) +// Input/output/states/fc weights tensors are required for kernel evaulation. +// The state tensors should be variables. Variants of the standard LSTM +// are not supported here, therefore their corresponding tensors should be +// invalid +TfLiteStatus LstmTensors::ValidateTensorStatus(TfLiteContext* context) const { + // Verify certain tensor properties + // input tensor + TF_LITE_ENSURE(context, internal_tensors_[kLstmInputTensor] != nullptr); + // hidden state + TF_LITE_ENSURE(context, internal_tensors_[kLstmOutputStateTensor] != nullptr); + TF_LITE_ENSURE(context, + internal_tensors_[kLstmOutputStateTensor]->is_variable); + // hidden state becomes input so they must have the same type + TF_LITE_ENSURE_EQ(context, internal_tensors_[kLstmOutputStateTensor]->type, + internal_tensors_[kLstmInputTensor]->type); + // cell state + TF_LITE_ENSURE(context, internal_tensors_[kLstmCellStateTensor] != nullptr); + TF_LITE_ENSURE(context, internal_tensors_[kLstmCellStateTensor]->is_variable); + // output + TF_LITE_ENSURE(context, output_tensor_ != nullptr); + // output type is the same as the input type (activations) + TF_LITE_ENSURE_EQ(context, output_tensor_->type, + internal_tensors_[kLstmInputTensor]->type); + + // weight tensors (1-9, see lstm_shared for index definition) + const auto weight_type = + internal_tensors_[kLstmInputToForgetWeightsTensor]->type; + for (size_t i = 1; i < 9; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] != nullptr); + TF_LITE_ENSURE_EQ(context, internal_tensors_[i]->type, weight_type); + } + + // bias tensors (12-15, see lstm_shared for index definition) + const auto bias_type = internal_tensors_[kLstmForgetGateBiasTensor]->type; + for (size_t i = 12; i < 16; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] != nullptr); + TF_LITE_ENSURE_EQ(context, internal_tensors_[i]->type, bias_type); + } + // Tensors from LSTM variants are invalid + // No peephole + for (size_t i = 9; i < 12; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); + } + // No projection + for (size_t i = 16; i < 18; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); + } + // No internal layer norm + for (size_t i = 20; i < 24; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); + } + return kTfLiteOk; +} + namespace lstm_internal { const int32_t kInt16Max = std::numeric_limits::max(); @@ -70,17 +145,15 @@ void Tanh(int32_t cell_state_scale_power, const RuntimeShape& input_data_shape, int16_t* input_data, const RuntimeShape& output_data_shape, int16_t* output_data) { int32_t tanh_input_left_shift = (15 + cell_state_scale_power) - 3; + int32_t input_multiplier = 0; if (tanh_input_left_shift < 0) /* handling negative shift value */ { - int32_t i; tanh_input_left_shift = 
-tanh_input_left_shift; - for (i = 0; i < input_data_shape.FlatSize(); i++) { - input_data[i] = input_data[i] >> tanh_input_left_shift; - } - tanh_input_left_shift = 0; + input_multiplier = 3; } - reference_integer_ops::Tanh(0, tanh_input_left_shift, input_data_shape, - input_data, output_data_shape, output_data); + reference_integer_ops::Tanh(input_multiplier, tanh_input_left_shift, + input_data_shape, input_data, output_data_shape, + output_data); } void Tanh(int32_t cell_state_scale_power, const RuntimeShape& input_data_shape, diff --git a/src/tensorflow/lite/micro/kernels/lstm_eval.h b/src/tensorflow/lite/micro/kernels/lstm_eval.h index ebede610..62bc6354 100644 --- a/src/tensorflow/lite/micro/kernels/lstm_eval.h +++ b/src/tensorflow/lite/micro/kernels/lstm_eval.h @@ -29,6 +29,130 @@ limitations under the License. #include "tensorflow/lite/micro/micro_log.h" namespace tflite { + +// Interface to access all the TempTfLiteTensors of the LSTM kernel during the +// preparation phase. Can only be constructed through the constructor to avoid +// memory leakage. All TempTfLiteTensors will be deallocated through the +// destructor. +class LstmTensors { + public: + LstmTensors(const LstmTensors& other) = delete; + LstmTensors& operator=(const LstmTensors& other) = delete; + + LstmTensors(TfLiteContext* context, TfLiteNode* node); + ~LstmTensors(); + + // Verify the LSTM internal tensor properties (e.g., type checks) + // Input/output/states/fc weights tensors are required for kernel evaluation. + // The state tensors should be variables. Variants of the standard LSTM + // are not supported here, therefore their corresponding tensors should be + // invalid + TfLiteStatus ValidateTensorStatus(TfLiteContext* context) const; + + // Internal tensors. see lstm_shared.h for tensor names + const TfLiteTensor* GetInternalTensor(const int tensor_index) const { + return internal_tensors_[tensor_index]; + } + + const TfLiteTensor* HiddenStateTensor() const { + return internal_tensors_[kLstmOutputStateTensor]; + } + const TfLiteTensor* CellStateTensor() const { + return internal_tensors_[kLstmCellStateTensor]; + } + const TfLiteTensor* OutputTensor() const { return output_tensor_; } + + private: + // see lstm_shared.h for tensor names + MicroContext* micro_context_; + TfLiteTensor* internal_tensors_[24]; + TfLiteTensor* output_tensor_; +}; + +// Deduce the size information (Batch (B), Time Steps (T), Input dimension (I), +// State dimension (S)) that defines the LSTM using the input and hidden state +// tensor +LstmSizeInfo CreateLstmSizeInfo( + const bool time_major, const TfLiteIntArray* input_tensor_shape, + const TfLiteIntArray* hidden_state_tensor_shape); + +TfLiteStatus ValidateWeightTensorSize(TfLiteContext* context, + const TfLiteTensor* tensor, int dim1_size, + int dim2_size); + +TfLiteStatus ValidateBiasTensorSize(TfLiteContext* context, + const TfLiteTensor* tensor, int size); + +// Go through every tensor and make sure its shape matches the kernel +// configuration +TfLiteStatus ValidateTensorSize(TfLiteContext* context, + const LstmTensors& tensors, + const LstmSizeInfo& size_info); + +// Wrapper function to create gate parameters for the four internal LSTM gates +TfLiteStatus CreateGateParams( + TfLiteContext* context, + /*Input tensors*/ + const TfLiteTensor* input, const TfLiteTensor* input_weight, + const TfLiteTensor* input_bias, + /*Hidden state tensors*/ + const TfLiteTensor* hidden_state, const TfLiteTensor* hidden_state_weight, + const TfLiteTensor* hidden_state_bias, + /*Scale of the
fc output (input to non-linear activation)*/ + const float nonlinear_activation_input_scale, const TfLiteType cell_type, + const tflite::GateParameters& gate_params); + +// Create parameters for element-wise multiplication that happens in a) cell +// state update; b) hidden state update +// Note that all the output of gates are symmetrically quantized so only scales +// are required for input. However, during the hidden state update phase, the +// output is the updated hidden state, which is asymmetrically quantized. Thus +// output may require zero point +tflite::ArithmeticParams CreateInterGateMulParams(const float input1_scale, + const float input2_scale, + const float output_scale, + const TfLiteType output_type, + const int output_zp = 0); + +// Create the additional information about the cell state, which includes: +// cell_state_scale_power: used in integer nonlinear function (e.g., tanh) +// quantized_cell_clip: quantized cell clip range +CellStateInfo CreateLstmCellStateInfo(const float cell_state_scale, + const float cell_clip); + +CellStateInfo CreateLstmCellStateInfoFloat(const float cell_clip); +tflite::FullyConnectedParams CreateFCParamsFloat(); + +tflite::GateParameters CreateGateParamsFloat(); + +tflite::ArithmeticParams CreateInterGateMulParamsFloat(); + +TfLiteStatus PrepareGateParametersFloat(TfLiteContext* context, + const LstmTensors& lstm_tensors, + OpDataLSTM* op_data_lstm); + +TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, + const LstmTensors& lstm_tensors, + OpDataLSTM* op_data_lstm); + +LSTMKernelContents CreateLSTMKernelContent(TfLiteContext* context, + TfLiteNode* node); + +template <typename CellType> +LSTMBuffers<CellType> CreateLSTMBuffers(TfLiteContext* context, + const int* buffer_indices) { + LSTMBuffers<CellType> buffers; + buffers.buffer0 = reinterpret_cast<CellType*>( + context->GetScratchBuffer(context, buffer_indices[0])); + buffers.buffer1 = reinterpret_cast<CellType*>( + context->GetScratchBuffer(context, buffer_indices[1])); + buffers.buffer2 = reinterpret_cast<CellType*>( + context->GetScratchBuffer(context, buffer_indices[2])); + buffers.buffer3 = reinterpret_cast<CellType*>( + context->GetScratchBuffer(context, buffer_indices[3])); + return buffers; +} + // Since LSTM includes multiple intermediate stages, introducing the internal // namespace to expose them for testing namespace lstm_internal { @@ -269,7 +393,7 @@ template <typename ActivationType, typename WeightType, typename CellType, void LstmStep(const LstmStepManager& step_info, const OpDataLSTM& op_data, LSTMKernelContents& kernel_content, - LSTMBuffers<CellType>& buffers) { + const LSTMBuffers<CellType>& buffers) { /*Step1: Calculate gate outputs to prepare cell state update*/ CellType* gate_internal_buffer = buffers.buffer3; CellType* forget_gate_output = buffers.buffer0; @@ -385,7 +509,7 @@ template <typename ActivationType, typename WeightType, typename CellType, TfLiteStatus EvalLstm(const OpDataLSTM& op_data, LSTMKernelContents& kernel_content, - LSTMBuffers<CellType>& buffers) { + const LSTMBuffers<CellType>& buffers) { lstm_internal::LstmStepManager step_info(&op_data.size_info); const auto& size_info = op_data.size_info; // time is the first dimension, enable batch computation diff --git a/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.cpp b/src/tensorflow/lite/micro/kernels/lstm_eval_common.cpp similarity index 53% rename from src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.cpp rename to src/tensorflow/lite/micro/kernels/lstm_eval_common.cpp index e671abec..9631b4c1 100644 --- a/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.cpp +++ b/src/tensorflow/lite/micro/kernels/lstm_eval_common.cpp @@ -13,130 +13,13 @@ See the License for the specific language governing
permissions and limitations under the License. ==============================================================================*/ -// Integer version of unidirectional sequence lstm. Only the standard LSTM -// (defined in the keras LSTM layer, e.g., no peephole etc.) is supported here. -// Currently used by the 16 bits activation case only - -#include -#include - #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/fully_connected.h" -#include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/lstm_eval.h" -#include "tensorflow/lite/micro/kernels/lstm_shared.h" namespace tflite { -namespace { -/*Helper Functions*/ - -// Interface to access all the TempTfLiteTensors of the LSTM kernel during the -// preparation phase. Can only be constructed through the constructor to avoid -// memory leakage. All TempTfLiteTensors will be deallocated through the -// destructor. -class LstmTensors { - public: - LstmTensors(const LstmTensors& other) = delete; - LstmTensors& operator=(const LstmTensors& other) = delete; - - LstmTensors(TfLiteContext* context, TfLiteNode* node) { - micro_context_ = GetMicroContext(context); - // 24 internal tensors. see lstm_shared.h for tensor names - for (size_t i = 0; i < 24; i++) { - internal_tensors_[i] = micro_context_->AllocateTempInputTensor(node, i); - } - output_tensor_ = - micro_context_->AllocateTempOutputTensor(node, kLstmOutputTensor); - } - - ~LstmTensors() { - for (size_t i = 0; i < 24; i++) { - if (internal_tensors_[i] != nullptr) { - micro_context_->DeallocateTempTfLiteTensor(internal_tensors_[i]); - } - } - micro_context_->DeallocateTempTfLiteTensor(output_tensor_); - } - - // Verify the LSTM internal tensor properties (e.g., type checks) - // Input/output/states/fc weights tensors are required for kernel evaulation. - // The state tensors should be variables. 
Variants of the standard LSTM - // are not supported here, therefore their corresponding tensors should be - // invalid - TfLiteStatus ValidateTensorStatus(TfLiteContext* context) const { - // Verify certain tensor properties - // input tensor - TF_LITE_ENSURE(context, internal_tensors_[kLstmInputTensor] != nullptr); - // hidden state - TF_LITE_ENSURE(context, - internal_tensors_[kLstmOutputStateTensor] != nullptr); - TF_LITE_ENSURE(context, - internal_tensors_[kLstmOutputStateTensor]->is_variable); - // hidden state becomes input so they must have the same type - TF_LITE_ENSURE_EQ(context, internal_tensors_[kLstmOutputStateTensor]->type, - internal_tensors_[kLstmInputTensor]->type); - // cell state - TF_LITE_ENSURE(context, internal_tensors_[kLstmCellStateTensor] != nullptr); - TF_LITE_ENSURE(context, - internal_tensors_[kLstmCellStateTensor]->is_variable); - // output - TF_LITE_ENSURE(context, output_tensor_ != nullptr); - // output type is the same as the input type (activations) - TF_LITE_ENSURE_EQ(context, output_tensor_->type, - internal_tensors_[kLstmInputTensor]->type); - - // weight tensors (1-9, see lstm_shared for index definition) - const auto weight_type = - internal_tensors_[kLstmInputToForgetWeightsTensor]->type; - for (size_t i = 1; i < 9; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] != nullptr); - TF_LITE_ENSURE_EQ(context, internal_tensors_[i]->type, weight_type); - } - - // bias tensors (12-15, see lstm_shared for index definition) - const auto bias_type = internal_tensors_[kLstmForgetGateBiasTensor]->type; - for (size_t i = 12; i < 16; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] != nullptr); - TF_LITE_ENSURE_EQ(context, internal_tensors_[i]->type, bias_type); - } - // Tensors from LSTM variants are invalid - // No peephole - for (size_t i = 9; i < 12; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); - } - // No projection - for (size_t i = 16; i < 18; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); - } - // No internal layer norm - for (size_t i = 20; i < 24; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); - } - return kTfLiteOk; - } - - // Internal tensors. 
see lstm_shared.h for tensor names - const TfLiteTensor* GetInternalTensor(const int tensor_index) const { - return internal_tensors_[tensor_index]; - } - - const TfLiteTensor* HiddenStateTensor() const { - return internal_tensors_[kLstmOutputStateTensor]; - } - const TfLiteTensor* CellStateTensor() const { - return internal_tensors_[kLstmCellStateTensor]; - } - const TfLiteTensor* OutputTensor() const { return output_tensor_; } - - private: - // see lstm_shared.h for tensor names - MicroContext* micro_context_; - TfLiteTensor* internal_tensors_[24]; - TfLiteTensor* output_tensor_; -}; - // Deduce the size information (Batch (B), Time Steps (T), Input dimension (I), // State dimension (S)) that defines the LSTM using the input and hidden state // tensor @@ -269,7 +152,7 @@ tflite::ArithmeticParams CreateInterGateMulParams(const float input1_scale, const float input2_scale, const float output_scale, const TfLiteType output_type, - const int output_zp = 0) { + const int output_zp) { tflite::ArithmeticParams op_params = {}; if (output_type == kTfLiteInt16) { op_params.quantized_activation_min = std::numeric_limits::min(); @@ -310,6 +193,7 @@ CellStateInfo CreateLstmCellStateInfo(const float cell_state_scale, static_cast(cell_state_scale), -32768.0), 32767.0)); + return cell_state_info; } @@ -344,26 +228,26 @@ tflite::ArithmeticParams CreateInterGateMulParamsFloat() { TfLiteStatus PrepareGateParametersFloat(TfLiteContext* context, const LstmTensors& lstm_tensors, - OpDataLSTM* op_data) { + OpDataLSTM* op_data_lstm) { // Gate Parameters - op_data->forget_gate_parameters = CreateGateParamsFloat(); - op_data->input_gate_parameters = CreateGateParamsFloat(); - op_data->cell_gate_parameters = CreateGateParamsFloat(); - op_data->output_gate_parameters = CreateGateParamsFloat(); + op_data_lstm->forget_gate_parameters = CreateGateParamsFloat(); + op_data_lstm->input_gate_parameters = CreateGateParamsFloat(); + op_data_lstm->cell_gate_parameters = CreateGateParamsFloat(); + op_data_lstm->output_gate_parameters = CreateGateParamsFloat(); // Inter gate multiplication parameters - op_data->inter_gate_parameters.forget_cell_mul_params = + op_data_lstm->inter_gate_parameters.forget_cell_mul_params = CreateInterGateMulParamsFloat(); - op_data->inter_gate_parameters.input_mul_params = + op_data_lstm->inter_gate_parameters.input_mul_params = CreateInterGateMulParamsFloat(); - op_data->inter_gate_parameters.output_mul_params = + op_data_lstm->inter_gate_parameters.output_mul_params = CreateInterGateMulParamsFloat(); return kTfLiteOk; } TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, const LstmTensors& lstm_tensors, - OpDataLSTM* op_data) { - float nonlinear_input_scale = 0.00024414062; // 2^-12 Q3.12 -> Q0.15 + OpDataLSTM* op_data_lstm) { + float nonlinear_input_scale = 0.000244140625; // 2^-12 Q3.12 -> Q0.15 TF_LITE_ENSURE_OK( context, CreateGateParams( @@ -373,7 +257,7 @@ TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, lstm_tensors.GetInternalTensor(kLstmOutputStateTensor), lstm_tensors.GetInternalTensor(kLstmRecurrentToForgetWeightsTensor), /*hidden_state_bias=*/nullptr, nonlinear_input_scale, kTfLiteInt16, - op_data->forget_gate_parameters)); + op_data_lstm->forget_gate_parameters)); TF_LITE_ENSURE_OK( context, CreateGateParams( @@ -383,7 +267,7 @@ TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, lstm_tensors.GetInternalTensor(kLstmOutputStateTensor), lstm_tensors.GetInternalTensor(kLstmRecurrentToInputWeightsTensor), /*hidden_state_bias=*/nullptr, 
nonlinear_input_scale, kTfLiteInt16, - op_data->input_gate_parameters)); + op_data_lstm->input_gate_parameters)); TF_LITE_ENSURE_OK( context, CreateGateParams( @@ -393,7 +277,7 @@ TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, lstm_tensors.GetInternalTensor(kLstmOutputStateTensor), lstm_tensors.GetInternalTensor(kLstmRecurrentToCellWeightsTensor), /*hidden_state_bias=*/nullptr, nonlinear_input_scale, kTfLiteInt16, - op_data->cell_gate_parameters)); + op_data_lstm->cell_gate_parameters)); TF_LITE_ENSURE_OK( context, CreateGateParams( @@ -403,25 +287,26 @@ TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, lstm_tensors.GetInternalTensor(kLstmOutputStateTensor), lstm_tensors.GetInternalTensor(kLstmRecurrentToOutputWeightsTensor), /*hidden_state_bias=*/nullptr, nonlinear_input_scale, kTfLiteInt16, - op_data->output_gate_parameters)); + op_data_lstm->output_gate_parameters)); // Inter gate multiplication parameters - float nonlinear_output_scale = 0.00003051757; // 2^-15 Q3.12 -> Q0.15 + float nonlinear_output_scale = 0.000030517578125; // 2^-15 Q3.12 -> Q0.15 float cell_state_scale = lstm_tensors.CellStateTensor()->params.scale; // forget gate output (nonlinear output) x cell state -> cell state - op_data->inter_gate_parameters.forget_cell_mul_params = + op_data_lstm->inter_gate_parameters.forget_cell_mul_params = CreateInterGateMulParams(nonlinear_output_scale, cell_state_scale, cell_state_scale, kTfLiteInt16); // input gate output x cell gate output -> cell state - op_data->inter_gate_parameters.input_mul_params = + op_data_lstm->inter_gate_parameters.input_mul_params = CreateInterGateMulParams(nonlinear_output_scale, nonlinear_output_scale, cell_state_scale, kTfLiteInt16); // tanh output x output gate output -> hidden state (potentially asymmetric) - op_data->inter_gate_parameters.output_mul_params = CreateInterGateMulParams( - nonlinear_output_scale, nonlinear_output_scale, - lstm_tensors.HiddenStateTensor()->params.scale, - lstm_tensors.HiddenStateTensor()->type, - lstm_tensors.HiddenStateTensor()->params.zero_point); + op_data_lstm->inter_gate_parameters.output_mul_params = + CreateInterGateMulParams( + nonlinear_output_scale, nonlinear_output_scale, + lstm_tensors.HiddenStateTensor()->params.scale, + lstm_tensors.HiddenStateTensor()->type, + lstm_tensors.HiddenStateTensor()->params.zero_point); return kTfLiteOk; } @@ -438,152 +323,4 @@ LSTMKernelContents CreateLSTMKernelContent(TfLiteContext* context, return kernel_content; } -template -LSTMBuffers CreateLSTMBuffers(TfLiteContext* context, - const int* buffer_indices) { - LSTMBuffers buffers; - buffers.buffer0 = reinterpret_cast( - context->GetScratchBuffer(context, buffer_indices[0])); - buffers.buffer1 = reinterpret_cast( - context->GetScratchBuffer(context, buffer_indices[1])); - buffers.buffer2 = reinterpret_cast( - context->GetScratchBuffer(context, buffer_indices[2])); - buffers.buffer3 = reinterpret_cast( - context->GetScratchBuffer(context, buffer_indices[3])); - return buffers; -} - -/*Kernel functions*/ - -void* UnidirectionalSequenceLstmInit(TfLiteContext* context, const char* buffer, - size_t length) { - TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); - return context->AllocatePersistentBuffer(context, sizeof(OpDataLSTM)); -} - -TfLiteStatus UnidirectionalSequenceLstmPrepare(TfLiteContext* context, - TfLiteNode* node) { - TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); - TF_LITE_ENSURE_EQ(context, node->inputs->size, 24); - - TFLITE_DCHECK(node->builtin_data != nullptr); - 
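One small but meaningful fix in the hunks above: the gate-scale literals are replaced by their exact values. 2^-12 and 2^-15 are exactly representable in binary floating point, so spelling them out in full (or computing them) removes the rounding slack of the old truncated decimals. A sketch of the equivalent compile-time form (names here are illustrative):

// Exact binary fractions for the Q3.12 -> Q0.15 rescaling constants.
constexpr float kNonlinearInputScale = 1.0f / (1 << 12);   // 0.000244140625
constexpr float kNonlinearOutputScale = 1.0f / (1 << 15);  // 0.000030517578125
static_assert(kNonlinearInputScale == 0.000244140625f,
              "2^-12 is exact in IEEE-754 float");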
TFLITE_DCHECK(node->user_data != nullptr); - - OpDataLSTM* op_data = reinterpret_cast(node->user_data); - const auto* builtin_data = - static_cast(node->builtin_data); - // All TempTfLiteTensors will be deallocated through the destructor. - LstmTensors lstm_tensors(context, node); - TF_LITE_ENSURE_OK(context, lstm_tensors.ValidateTensorStatus(context)); - - op_data->cell_gate_nonlinear_type = builtin_data->activation; - op_data->size_info = - CreateLstmSizeInfo(builtin_data->time_major, - lstm_tensors.GetInternalTensor(kLstmInputTensor)->dims, - lstm_tensors.HiddenStateTensor()->dims); - TF_LITE_ENSURE_OK( - context, ValidateTensorSize(context, lstm_tensors, op_data->size_info)); - - // Create cell state information and gate parameters (Fully Connected and Mul) - auto cell_state_type = - lstm_tensors.GetInternalTensor(kLstmCellStateTensor)->type; - if (cell_state_type == kTfLiteFloat32) { - op_data->cell_state_info = - CreateLstmCellStateInfoFloat(builtin_data->cell_clip); - TF_LITE_ENSURE_OK( - context, PrepareGateParametersFloat(context, lstm_tensors, op_data)); - } else if (cell_state_type == kTfLiteInt16) { - op_data->cell_state_info = CreateLstmCellStateInfo( - lstm_tensors.CellStateTensor()->params.scale, builtin_data->cell_clip); - TF_LITE_ENSURE_OK( - context, PrepareGateParametersInteger(context, lstm_tensors, op_data)); - } else { - MicroPrintf( - "Cell state type %s (%d) not supported. The quantized Unidirectional " - "Sequence LSTM Op only support int16 cell state", - TfLiteTypeGetName(cell_state_type), cell_state_type); - return kTfLiteError; - } - // request buffers (four buffers) - for (size_t i = 0; i < 4; i++) { - TF_LITE_ENSURE_OK(context, context->RequestScratchBufferInArena( - context, - op_data->size_info.batch_size * - op_data->size_info.state_dimension * - TfLiteTypeGetSize(cell_state_type), - &(op_data->buffer_indices[i]))); - } - return kTfLiteOk; -} - -TfLiteStatus UnidirectionalSequenceLstmEval(TfLiteContext* context, - TfLiteNode* node) { - TFLITE_DCHECK(node->user_data != nullptr); - const OpDataLSTM& op_data = *reinterpret_cast(node->user_data); - auto kernel_content = CreateLSTMKernelContent(context, node); - - const auto activation_type = - kernel_content.internal_tensors[kLstmInputTensor]->type; - const auto weight_type = - kernel_content.internal_tensors[kLstmInputToInputWeightsTensor]->type; - - switch (activation_type) { - case kTfLiteFloat32: { - LSTMBuffers buffers = - CreateLSTMBuffers(context, op_data.buffer_indices); - EvalLstm(op_data, kernel_content, buffers); - break; - } - case kTfLiteInt8: { - switch (weight_type) { - case kTfLiteInt8: { - // 8(activation)x8(weight)->16(cell) LSTM with 32 bits bias - LSTMBuffers buffers = - CreateLSTMBuffers(context, op_data.buffer_indices); - EvalLstm(op_data, kernel_content, - buffers); - break; - } - default: { - MicroPrintf("Filter type %s (%d) not supported.", - TfLiteTypeGetName(weight_type), activation_type); - return kTfLiteError; - } - } - break; - } - case kTfLiteInt16: { - switch (weight_type) { - case kTfLiteInt8: { - // 16(activation)x8(weight)->16(cell) LSTM with 64 bits bias - LSTMBuffers buffers = - CreateLSTMBuffers(context, op_data.buffer_indices); - EvalLstm(op_data, kernel_content, - buffers); - break; - } - default: { - MicroPrintf("Filter type %s (%d) not supported.", - TfLiteTypeGetName(weight_type), weight_type); - return kTfLiteError; - } - } - break; - } - default: { - MicroPrintf("Input type %s (%d) not supported.", - TfLiteTypeGetName(activation_type), activation_type); - return 
kTfLiteError; - } - } - return kTfLiteOk; -} - -} // namespace - -TfLiteRegistration Register_UNIDIRECTIONAL_SEQUENCE_LSTM() { - return tflite::micro::RegisterOp(UnidirectionalSequenceLstmInit, - UnidirectionalSequenceLstmPrepare, - UnidirectionalSequenceLstmEval); -} } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/maximum_minimum.cpp b/src/tensorflow/lite/micro/kernels/maximum_minimum.cpp index b7b9cba8..434e4efa 100644 --- a/src/tensorflow/lite/micro/kernels/maximum_minimum.cpp +++ b/src/tensorflow/lite/micro/kernels/maximum_minimum.cpp @@ -109,12 +109,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_MAXIMUM() { +TfLiteRegistration_V1 Register_MAXIMUM() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } -TfLiteRegistration Register_MINIMUM() { +TfLiteRegistration_V1 Register_MINIMUM() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/micro_ops.h b/src/tensorflow/lite/micro/kernels/micro_ops.h index 252efc62..14b874d0 100644 --- a/src/tensorflow/lite/micro/kernels/micro_ops.h +++ b/src/tensorflow/lite/micro/kernels/micro_ops.h @@ -31,108 +31,107 @@ namespace tflite { // (https://abseil.io/tips/130). Any new ops (or cleanup of existing ops should // have their Register function declarations in the tflite namespace. -TfLiteRegistration Register_ADD(); -TfLiteRegistration Register_ADD_N(); -TfLiteRegistration Register_ARG_MAX(); -TfLiteRegistration Register_ARG_MIN(); -TfLiteRegistration Register_ASSIGN_VARIABLE(); -TfLiteRegistration Register_AVERAGE_POOL_2D(); -TfLiteRegistration Register_BATCH_TO_SPACE_ND(); -TfLiteRegistration Register_BROADCAST_ARGS(); -TfLiteRegistration Register_BROADCAST_TO(); -TfLiteRegistration Register_CALL_ONCE(); -TfLiteRegistration Register_CAST(); -TfLiteRegistration Register_CEIL(); +TfLiteRegistration_V1 Register_ABS(); +TfLiteRegistration_V1 Register_ADD(); +TfLiteRegistration_V1 Register_ADD_N(); +TfLiteRegistration_V1 Register_ARG_MAX(); +TfLiteRegistration_V1 Register_ARG_MIN(); +TfLiteRegistration_V1 Register_ASSIGN_VARIABLE(); +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D(); +TfLiteRegistration_V1 Register_BATCH_TO_SPACE_ND(); +TfLiteRegistration_V1 Register_BROADCAST_ARGS(); +TfLiteRegistration_V1 Register_BROADCAST_TO(); +TfLiteRegistration_V1 Register_CALL_ONCE(); +TfLiteRegistration_V1 Register_CAST(); +TfLiteRegistration_V1 Register_CEIL(); // TODO(b/160234179): Change custom OPs to also return by value. 
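The deleted eval body above (its shared pieces now live in lstm_eval_common.cpp and the header templates) dispatched on activation and weight type. The flattened text dropped the template argument lists, so here is that mapping restored as comments for reference — the parameters are ActivationType, WeightType, CellType, BiasType in order:

// float activations              -> EvalLstm<float, float, float, float>
// int8 act., int8 weights        -> EvalLstm<int8_t, int8_t, int16_t, int32_t>
//   (16-bit cell state, 32-bit bias)
// int16 act., int8 weights       -> EvalLstm<int16_t, int8_t, int16_t, int64_t>
//   (16-bit cell state, 64-bit bias)
// Any other combination returns kTfLiteError with a MicroPrintf diagnostic.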
-TfLiteRegistration* Register_CIRCULAR_BUFFER(); -TfLiteRegistration Register_CONCATENATION(); -TfLiteRegistration Register_CONV_2D(); -TfLiteRegistration Register_CUMSUM(); -TfLiteRegistration Register_DEPTH_TO_SPACE(); -TfLiteRegistration Register_DEPTHWISE_CONV_2D(); -TfLiteRegistration Register_DEQUANTIZE(); -TfLiteRegistration Register_DIV(); -TfLiteRegistration Register_ELU(); -TfLiteRegistration Register_EQUAL(); -TfLiteRegistration* Register_ETHOSU(); -TfLiteRegistration Register_EXP(); -TfLiteRegistration Register_EXPAND_DIMS(); -TfLiteRegistration Register_FILL(); -TfLiteRegistration Register_FLOOR(); -TfLiteRegistration Register_FLOOR_DIV(); -TfLiteRegistration Register_FLOOR_MOD(); -TfLiteRegistration Register_FULLY_CONNECTED(); -TfLiteRegistration Register_GATHER(); -TfLiteRegistration Register_GATHER_ND(); -TfLiteRegistration Register_GREATER(); -TfLiteRegistration Register_GREATER_EQUAL(); -TfLiteRegistration Register_HARD_SWISH(); -TfLiteRegistration Register_IF(); -TfLiteRegistration Register_L2_NORMALIZATION(); -TfLiteRegistration Register_L2_POOL_2D(); -TfLiteRegistration Register_LEAKY_RELU(); -TfLiteRegistration Register_LESS(); -TfLiteRegistration Register_LESS_EQUAL(); -TfLiteRegistration Register_LOG_SOFTMAX(); -TfLiteRegistration Register_LOGICAL_AND(); -TfLiteRegistration Register_LOGICAL_OR(); -TfLiteRegistration Register_LOGISTIC(); -TfLiteRegistration Register_MAX_POOL_2D(); -TfLiteRegistration Register_MAXIMUM(); -TfLiteRegistration Register_MEAN(); -TfLiteRegistration Register_MINIMUM(); -TfLiteRegistration Register_MIRROR_PAD(); -TfLiteRegistration Register_MUL(); -TfLiteRegistration Register_NEG(); -TfLiteRegistration Register_NOT_EQUAL(); -TfLiteRegistration Register_PACK(); -TfLiteRegistration Register_PAD(); -TfLiteRegistration Register_PADV2(); -TfLiteRegistration Register_PRELU(); -TfLiteRegistration Register_QUANTIZE(); -TfLiteRegistration Register_READ_VARIABLE(); -TfLiteRegistration Register_REDUCE_MAX(); -TfLiteRegistration Register_RELU(); -TfLiteRegistration Register_RELU6(); -TfLiteRegistration Register_RESIZE_BILINEAR(); -TfLiteRegistration Register_RESIZE_NEAREST_NEIGHBOR(); -TfLiteRegistration Register_SELECT_V2(); -TfLiteRegistration Register_SHAPE(); -TfLiteRegistration Register_SLICE(); -TfLiteRegistration Register_SOFTMAX(); -TfLiteRegistration Register_SPACE_TO_BATCH_ND(); -TfLiteRegistration Register_SPACE_TO_DEPTH(); -TfLiteRegistration Register_SPLIT(); -TfLiteRegistration Register_SPLIT_V(); -TfLiteRegistration Register_SQUARED_DIFFERENCE(); -TfLiteRegistration Register_SQUEEZE(); -TfLiteRegistration Register_STRIDED_SLICE(); -TfLiteRegistration Register_SUB(); -TfLiteRegistration Register_SUM(); -TfLiteRegistration Register_SVDF(); -TfLiteRegistration Register_TANH(); -TfLiteRegistration Register_TRANSPOSE(); -TfLiteRegistration Register_TRANSPOSE_CONV(); +TfLiteRegistration_V1* Register_CIRCULAR_BUFFER(); +TfLiteRegistration_V1 Register_CONCATENATION(); +TfLiteRegistration_V1 Register_CONV_2D(); +TfLiteRegistration_V1 Register_COS(); +TfLiteRegistration_V1 Register_CUMSUM(); +TfLiteRegistration_V1 Register_DEPTH_TO_SPACE(); +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D(); +TfLiteRegistration_V1 Register_DEQUANTIZE(); +TfLiteRegistration_V1 Register_DIV(); +TfLiteRegistration_V1 Register_ELU(); +TfLiteRegistration_V1 Register_EQUAL(); +TfLiteRegistration_V1* Register_ETHOSU(); +TfLiteRegistration_V1 Register_EXP(); +TfLiteRegistration_V1 Register_EXPAND_DIMS(); +TfLiteRegistration_V1 Register_FILL(); +TfLiteRegistration_V1 
Register_FLOOR(); +TfLiteRegistration_V1 Register_FLOOR_DIV(); +TfLiteRegistration_V1 Register_FLOOR_MOD(); +TfLiteRegistration_V1 Register_FULLY_CONNECTED(); +TfLiteRegistration_V1 Register_GATHER(); +TfLiteRegistration_V1 Register_GATHER_ND(); +TfLiteRegistration_V1 Register_GREATER(); +TfLiteRegistration_V1 Register_GREATER_EQUAL(); +TfLiteRegistration_V1 Register_HARD_SWISH(); +TfLiteRegistration_V1 Register_IF(); +TfLiteRegistration_V1 Register_L2_NORMALIZATION(); +TfLiteRegistration_V1 Register_L2_POOL_2D(); +TfLiteRegistration_V1 Register_LEAKY_RELU(); +TfLiteRegistration_V1 Register_LESS(); +TfLiteRegistration_V1 Register_LESS_EQUAL(); +TfLiteRegistration_V1 Register_LOG(); +TfLiteRegistration_V1 Register_LOG_SOFTMAX(); +TfLiteRegistration_V1 Register_LOGICAL_AND(); +TfLiteRegistration_V1 Register_LOGICAL_NOT(); +TfLiteRegistration_V1 Register_LOGICAL_OR(); +TfLiteRegistration_V1 Register_LOGISTIC(); +TfLiteRegistration_V1 Register_MAX_POOL_2D(); +TfLiteRegistration_V1 Register_MAXIMUM(); +TfLiteRegistration_V1 Register_MEAN(); +TfLiteRegistration_V1 Register_MINIMUM(); +TfLiteRegistration_V1 Register_MIRROR_PAD(); +TfLiteRegistration_V1 Register_MUL(); +TfLiteRegistration_V1 Register_NEG(); +TfLiteRegistration_V1 Register_NOT_EQUAL(); +TfLiteRegistration_V1 Register_PACK(); +TfLiteRegistration_V1 Register_PAD(); +TfLiteRegistration_V1 Register_PADV2(); +TfLiteRegistration_V1 Register_PRELU(); +TfLiteRegistration_V1 Register_QUANTIZE(); +TfLiteRegistration_V1 Register_READ_VARIABLE(); +TfLiteRegistration_V1 Register_REDUCE_MAX(); +TfLiteRegistration_V1 Register_RELU(); +TfLiteRegistration_V1 Register_RELU6(); +TfLiteRegistration_V1 Register_RESIZE_BILINEAR(); +TfLiteRegistration_V1 Register_RESIZE_NEAREST_NEIGHBOR(); +TfLiteRegistration_V1 Register_RSQRT(); +TfLiteRegistration_V1 Register_SELECT_V2(); +TfLiteRegistration_V1 Register_SHAPE(); +TfLiteRegistration_V1 Register_SIN(); +TfLiteRegistration_V1 Register_SLICE(); +TfLiteRegistration_V1 Register_SOFTMAX(); +TfLiteRegistration_V1 Register_SPACE_TO_BATCH_ND(); +TfLiteRegistration_V1 Register_SPACE_TO_DEPTH(); +TfLiteRegistration_V1 Register_SPLIT(); +TfLiteRegistration_V1 Register_SPLIT_V(); +TfLiteRegistration_V1 Register_SQRT(); +TfLiteRegistration_V1 Register_SQUARE(); +TfLiteRegistration_V1 Register_SQUARED_DIFFERENCE(); +TfLiteRegistration_V1 Register_SQUEEZE(); +TfLiteRegistration_V1 Register_STRIDED_SLICE(); +TfLiteRegistration_V1 Register_SUB(); +TfLiteRegistration_V1 Register_SUM(); +TfLiteRegistration_V1 Register_SVDF(); +TfLiteRegistration_V1 Register_TANH(); +TfLiteRegistration_V1 Register_TRANSPOSE(); +TfLiteRegistration_V1 Register_TRANSPOSE_CONV(); // TODO(b/230666079): resolve conflict with xtensa implementation -TfLiteRegistration Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); -TfLiteRegistration Register_UNPACK(); -TfLiteRegistration Register_VAR_HANDLE(); -TfLiteRegistration Register_WHILE(); -TfLiteRegistration Register_ZEROS_LIKE(); +TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); +TfLiteRegistration_V1 Register_UNPACK(); +TfLiteRegistration_V1 Register_VAR_HANDLE(); +TfLiteRegistration_V1 Register_WHILE(); +TfLiteRegistration_V1 Register_ZEROS_LIKE(); namespace ops { namespace micro { - -TfLiteRegistration Register_ABS(); -TfLiteRegistration Register_COS(); -TfLiteRegistration Register_LOG(); -TfLiteRegistration Register_LOGICAL_NOT(); -TfLiteRegistration Register_RESHAPE(); -TfLiteRegistration Register_ROUND(); -TfLiteRegistration Register_RSQRT(); -TfLiteRegistration Register_SIN(); 
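Beyond the type rename, this micro_ops.h hunk finishes migrating kernels out of the nested namespace (per the Abseil tip cited above): ABS, COS, LOG, LOGICAL_NOT, RSQRT, SIN, SQRT and SQUARE move from tflite::ops::micro into tflite, leaving only RESHAPE and ROUND behind. Call sites change accordingly; an illustrative before/after:

// Before this sync:
//   TfLiteRegistration r = tflite::ops::micro::Register_ABS();
// After this sync:
TfLiteRegistration_V1 r = tflite::Register_ABS();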
-TfLiteRegistration Register_SQRT(); -TfLiteRegistration Register_SQUARE(); +TfLiteRegistration_V1 Register_RESHAPE(); +TfLiteRegistration_V1 Register_ROUND(); } // namespace micro } // namespace ops } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/mirror_pad.cpp b/src/tensorflow/lite/micro/kernels/mirror_pad.cpp index 90d3bd9e..c6ee1da7 100644 --- a/src/tensorflow/lite/micro/kernels/mirror_pad.cpp +++ b/src/tensorflow/lite/micro/kernels/mirror_pad.cpp @@ -208,7 +208,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_MIRROR_PAD() { +TfLiteRegistration_V1 Register_MIRROR_PAD() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/mul.h b/src/tensorflow/lite/micro/kernels/mul.h index d0148643..d64248db 100644 --- a/src/tensorflow/lite/micro/kernels/mul.h +++ b/src/tensorflow/lite/micro/kernels/mul.h @@ -61,13 +61,13 @@ void EvalMulFloatReference(TfLiteContext* context, TfLiteNode* node, TfLiteEvalTensor* output); // Generic must define registration function. -TfLiteRegistration Register_MUL(); +TfLiteRegistration_V1 Register_MUL(); #if defined(ARDUINO) -TfLiteRegistration Register_MUL_INT8(); +TfLiteRegistration_V1 Register_MUL_INT8(); #else // Fallback registration -inline TfLiteRegistration Register_MUL_INT8() { return Register_MUL(); } +inline TfLiteRegistration_V1 Register_MUL_INT8() { return Register_MUL(); } #endif } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/neg.cpp b/src/tensorflow/lite/micro/kernels/neg.cpp index db26f6c6..cde9979f 100644 --- a/src/tensorflow/lite/micro/kernels/neg.cpp +++ b/src/tensorflow/lite/micro/kernels/neg.cpp @@ -50,7 +50,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_NEG() { +TfLiteRegistration_V1 Register_NEG() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/pack.cpp b/src/tensorflow/lite/micro/kernels/pack.cpp index 5a4eb4f5..4c2a9724 100644 --- a/src/tensorflow/lite/micro/kernels/pack.cpp +++ b/src/tensorflow/lite/micro/kernels/pack.cpp @@ -105,7 +105,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_PACK() { +TfLiteRegistration_V1 Register_PACK() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/pad.cpp b/src/tensorflow/lite/micro/kernels/pad.cpp index 579df1a6..f169e45e 100644 --- a/src/tensorflow/lite/micro/kernels/pad.cpp +++ b/src/tensorflow/lite/micro/kernels/pad.cpp @@ -18,7 +18,6 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/internal/portable_tensor.h" #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" @@ -218,12 +217,12 @@ TfLiteStatus PadPrepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration Register_PAD() { +TfLiteRegistration_V1 Register_PAD() { return tflite::micro::RegisterOp(Init, PadPrepare, Eval); } // Also register Pad as PadV2. 
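As the comment above notes, PADV2 is an alias: both registrations bind the same Init/PadPrepare/Eval triple, and (assuming the kernel matches upstream pad.cpp) Eval treats PadV2's third constant_values input as optional. Resolving either op therefore costs no extra kernel code:

TfLiteRegistration_V1 pad_reg = tflite::Register_PAD();
TfLiteRegistration_V1 padv2_reg = tflite::Register_PADV2();  // same kernel underneath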
-TfLiteRegistration Register_PADV2() { +TfLiteRegistration_V1 Register_PADV2() { return tflite::micro::RegisterOp(Init, PadPrepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/pooling.h b/src/tensorflow/lite/micro/kernels/pooling.h index fd0d2c93..e7e0b4dc 100644 --- a/src/tensorflow/lite/micro/kernels/pooling.h +++ b/src/tensorflow/lite/micro/kernels/pooling.h @@ -113,27 +113,27 @@ void MaxPoolingEvalQuantized(TfLiteContext* context, TfLiteNode* node, } #if defined(ARDUINO) || defined(XTENSA) -TfLiteRegistration Register_AVERAGE_POOL_2D_INT8(); +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT8(); -TfLiteRegistration Register_MAX_POOL_2D_INT8(); +TfLiteRegistration_V1 Register_MAX_POOL_2D_INT8(); -TfLiteRegistration Register_AVERAGE_POOL_2D_INT16(); +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT16(); -TfLiteRegistration Register_MAX_POOL_2D_INT16(); +TfLiteRegistration_V1 Register_MAX_POOL_2D_INT16(); #else -inline TfLiteRegistration Register_AVERAGE_POOL_2D_INT8() { +inline TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT8() { return tflite::Register_AVERAGE_POOL_2D(); } -inline TfLiteRegistration Register_MAX_POOL_2D_INT8() { +inline TfLiteRegistration_V1 Register_MAX_POOL_2D_INT8() { return tflite::Register_MAX_POOL_2D(); } -inline TfLiteRegistration Register_AVERAGE_POOL_2D_INT16() { +inline TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT16() { return tflite::Register_AVERAGE_POOL_2D(); } -inline TfLiteRegistration Register_MAX_POOL_2D_INT16() { +inline TfLiteRegistration_V1 Register_MAX_POOL_2D_INT16() { return tflite::Register_MAX_POOL_2D(); } #endif diff --git a/src/tensorflow/lite/micro/kernels/prelu.cpp b/src/tensorflow/lite/micro/kernels/prelu.cpp index f4294723..62e8eb9d 100644 --- a/src/tensorflow/lite/micro/kernels/prelu.cpp +++ b/src/tensorflow/lite/micro/kernels/prelu.cpp @@ -68,7 +68,7 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { } } -TfLiteRegistration Register_PRELU() { +TfLiteRegistration_V1 Register_PRELU() { return tflite::micro::RegisterOp(PreluInit, PreluPrepare, PreluEval); } diff --git a/src/tensorflow/lite/micro/kernels/quantize.cpp b/src/tensorflow/lite/micro/kernels/quantize.cpp index b5eb9c3c..0e3336d9 100644 --- a/src/tensorflow/lite/micro/kernels/quantize.cpp +++ b/src/tensorflow/lite/micro/kernels/quantize.cpp @@ -33,7 +33,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } // namespace -TfLiteRegistration Register_QUANTIZE() { +TfLiteRegistration_V1 Register_QUANTIZE() { return tflite::micro::RegisterOp(Init, PrepareQuantizeReference, EvalQuantizeReference); } diff --git a/src/tensorflow/lite/micro/kernels/read_variable.cpp b/src/tensorflow/lite/micro/kernels/read_variable.cpp index 600a1bdd..d173bc5f 100644 --- a/src/tensorflow/lite/micro/kernels/read_variable.cpp +++ b/src/tensorflow/lite/micro/kernels/read_variable.cpp @@ -80,7 +80,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
-TfLiteRegistration Register_READ_VARIABLE() { +TfLiteRegistration_V1 Register_READ_VARIABLE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/reduce.cpp b/src/tensorflow/lite/micro/kernels/reduce.cpp index b4734f93..810d96f6 100644 --- a/src/tensorflow/lite/micro/kernels/reduce.cpp +++ b/src/tensorflow/lite/micro/kernels/reduce.cpp @@ -57,15 +57,15 @@ TfLiteStatus EvalSum(TfLiteContext* context, TfLiteNode* node) { static_cast(node->user_data)); } -TfLiteRegistration Register_MEAN() { +TfLiteRegistration_V1 Register_MEAN() { return tflite::micro::RegisterOp(InitReduce, PrepareMeanOrSum, EvalMean); } -TfLiteRegistration Register_REDUCE_MAX() { +TfLiteRegistration_V1 Register_REDUCE_MAX() { return tflite::micro::RegisterOp(InitReduce, PrepareMax, EvalMax); } -TfLiteRegistration Register_SUM() { +TfLiteRegistration_V1 Register_SUM() { return tflite::micro::RegisterOp(InitReduce, PrepareMeanOrSum, EvalSum); } diff --git a/src/tensorflow/lite/micro/kernels/reduce.h b/src/tensorflow/lite/micro/kernels/reduce.h index 5956974e..3b70665d 100644 --- a/src/tensorflow/lite/micro/kernels/reduce.h +++ b/src/tensorflow/lite/micro/kernels/reduce.h @@ -56,9 +56,9 @@ TfLiteStatus EvalSumHelper(TfLiteContext* context, TfLiteNode* node, void ReduceResolveAxis(const int* axis_data, int axis_count, MeanParams* op_params); -TfLiteRegistration Register_MEAN(); -TfLiteRegistration Register_REDUCE_MAX(); -TfLiteRegistration Register_SUM(); +TfLiteRegistration_V1 Register_MEAN(); +TfLiteRegistration_V1 Register_REDUCE_MAX(); +TfLiteRegistration_V1 Register_SUM(); } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/reduce_common.cpp b/src/tensorflow/lite/micro/kernels/reduce_common.cpp index b2fceeb8..0dab49c2 100644 --- a/src/tensorflow/lite/micro/kernels/reduce_common.cpp +++ b/src/tensorflow/lite/micro/kernels/reduce_common.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
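The reduce_common.cpp hunk below is a simplification, not just a rename: EvalIntegerMean used to take a requantization-free fast path through a templated Mean<> when input and output shared zero point and scale, and fall back to QuantizedMeanOrSum<> otherwise. After the sync it always calls QuantizedMeanOrSum<>, which handles the matching-parameters case as well. The removed branch, with the template arguments the extraction dropped restored, looked like this:

if (op_data->input_zp == op_data->output_zp &&
    op_data->input_scale == op_data->output_scale) {
  Mean<integer_type, int32_t>(context, node, op_data, temp_index,
                              resolved_axis, temp_sum);  // removed fast path
} else {
  QuantizedMeanOrSum<integer_type, int32_t>(
      context, node, temp_index, resolved_axis, temp_sum, op_data,
      /*compute_sum=*/false);  // now the unconditional path
}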
@@ -160,26 +160,6 @@ TfLiteStatus QuantizedMeanOrSum(TfLiteContext* context, TfLiteNode* node, return kTfLiteOk; } -template <typename T, typename U> -TfLiteStatus Mean(TfLiteContext* context, TfLiteNode* node, - OpDataReduce* op_data, int* temp_index, int* resolved_axis, - U* temp_sum) { - const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); - const TfLiteEvalTensor* axis = tflite::micro::GetEvalInput(context, node, 1); - TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); - TfLiteReducerParams* params = - static_cast<TfLiteReducerParams*>(node->builtin_data); - - reference_ops::Mean( - tflite::micro::GetTensorData<T>(input), &input->dims->data[0], - input->dims->size, tflite::micro::GetTensorData<T>(output), - &output->dims->data[0], output->dims->size, - tflite::micro::GetTensorData<int>(axis), op_data->num_axis, - params->keep_dims, temp_index, resolved_axis, temp_sum); - - return kTfLiteOk; -} - template <typename integer_type> TfLiteStatus EvalIntegerMean(TfLiteContext* context, TfLiteNode* node, int num_axis, OpDataReduce* op_data, @@ -187,14 +167,9 @@ TfLiteStatus EvalIntegerMean(TfLiteContext* context, TfLiteNode* node, int32_t* temp_sum = static_cast<int32_t*>( context->GetScratchBuffer(context, op_data->temp_buffer_idx)); - if (op_data->input_zp == op_data->output_zp && - op_data->input_scale == op_data->output_scale) { - Mean<integer_type, int32_t>(context, node, op_data, temp_index, - resolved_axis, temp_sum); - } else { - QuantizedMeanOrSum<integer_type, int32_t>(context, node, temp_index, resolved_axis, - temp_sum, op_data, /*compute_sum=*/false); - } + QuantizedMeanOrSum<integer_type, int32_t>(context, node, temp_index, resolved_axis, + temp_sum, op_data, /*compute_sum=*/false); + return kTfLiteOk; } diff --git a/src/tensorflow/lite/micro/kernels/reshape.cpp b/src/tensorflow/lite/micro/kernels/reshape.cpp index 0c6806d1..7c8549a3 100644 --- a/src/tensorflow/lite/micro/kernels/reshape.cpp +++ b/src/tensorflow/lite/micro/kernels/reshape.cpp @@ -114,7 +114,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace reshape -TfLiteRegistration Register_RESHAPE() { +TfLiteRegistration_V1 Register_RESHAPE() { return tflite::micro::RegisterOp(nullptr, reshape::Prepare, reshape::Eval); } diff --git a/src/tensorflow/lite/micro/kernels/resize_bilinear.cpp b/src/tensorflow/lite/micro/kernels/resize_bilinear.cpp index 56432e1b..48f3b9d6 100644 --- a/src/tensorflow/lite/micro/kernels/resize_bilinear.cpp +++ b/src/tensorflow/lite/micro/kernels/resize_bilinear.cpp @@ -109,7 +109,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_RESIZE_BILINEAR() { +TfLiteRegistration_V1 Register_RESIZE_BILINEAR() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cpp b/src/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cpp index 4ed09d00..c6c8f6ff 100644 --- a/src/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cpp +++ b/src/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cpp @@ -116,7 +116,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_RESIZE_NEAREST_NEIGHBOR() { +TfLiteRegistration_V1 Register_RESIZE_NEAREST_NEIGHBOR() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/round.cpp b/src/tensorflow/lite/micro/kernels/round.cpp index 0bda8783..8db5fa2e 100644 --- a/src/tensorflow/lite/micro/kernels/round.cpp +++ b/src/tensorflow/lite/micro/kernels/round.cpp @@ -67,7 +67,7 @@ TfLiteStatus Eval(TfLiteContext* context,
TfLiteNode* node) { } } // namespace round -TfLiteRegistration Register_ROUND() { +TfLiteRegistration_V1 Register_ROUND() { return tflite::micro::RegisterOp(nullptr, round::Prepare, round::Eval); } diff --git a/src/tensorflow/lite/micro/kernels/select.cpp b/src/tensorflow/lite/micro/kernels/select.cpp index 1b05bd2f..d467c07f 100644 --- a/src/tensorflow/lite/micro/kernels/select.cpp +++ b/src/tensorflow/lite/micro/kernels/select.cpp @@ -189,7 +189,7 @@ TfLiteStatus SelectEval(TfLiteContext* context, TfLiteNode* node) { // // 1. Either the same shape (in which case the select is elementwise), or // 2. Broadcastable shapes between 'condition', 'x' and 'y'. -TfLiteRegistration Register_SELECT_V2() { +TfLiteRegistration_V1 Register_SELECT_V2() { return tflite::micro::RegisterOp(tflite::SelectInit, tflite::SelectPrepare, tflite::SelectEval); } diff --git a/src/tensorflow/lite/micro/kernels/shape.cpp b/src/tensorflow/lite/micro/kernels/shape.cpp index e85bb81f..3ced3209 100644 --- a/src/tensorflow/lite/micro/kernels/shape.cpp +++ b/src/tensorflow/lite/micro/kernels/shape.cpp @@ -60,7 +60,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SHAPE() { +TfLiteRegistration_V1 Register_SHAPE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/slice.cpp b/src/tensorflow/lite/micro/kernels/slice.cpp index cc3cd5b4..90e977a0 100644 --- a/src/tensorflow/lite/micro/kernels/slice.cpp +++ b/src/tensorflow/lite/micro/kernels/slice.cpp @@ -140,6 +140,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; + case kTfLiteBool: + reference_ops::Slice(op_params, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; default: MicroPrintf("Input tensor type %s (%d) not supported.", TfLiteTypeGetName(input->type), input->type); @@ -150,7 +157,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SLICE() { +TfLiteRegistration_V1 Register_SLICE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/softmax.h b/src/tensorflow/lite/micro/kernels/softmax.h index 0b498bd8..c9c18ca2 100644 --- a/src/tensorflow/lite/micro/kernels/softmax.h +++ b/src/tensorflow/lite/micro/kernels/softmax.h @@ -32,34 +32,36 @@ TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context, TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_SOFTMAX(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_SOFTMAX(); #if defined(XTENSA) || defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 input and int16 output. 
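// Aside: the guarded registrations above let applications bind reduced-type
// softmax kernels explicitly, while the inline fallbacks alias the generic
// kernel on other targets. A hedged usage sketch (the resolver object is
// hypothetical; the registration-taking AddSoftmax overload appears further
// down in this same diff):
tflite::MicroMutableOpResolver<1> op_resolver;  // capacity for one builtin
op_resolver.AddSoftmax(tflite::Register_SOFTMAX_INT8_INT16());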
-TfLiteRegistration Register_SOFTMAX_INT8_INT16(); +TfLiteRegistration_V1 Register_SOFTMAX_INT8_INT16(); #else -inline TfLiteRegistration Register_SOFTMAX_INT8_INT16() { +inline TfLiteRegistration_V1 Register_SOFTMAX_INT8_INT16() { return Register_SOFTMAX(); } #endif #if defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 input/output and uses the latency optimized implementations. -TfLiteRegistration Register_SOFTMAX_INT8(); +TfLiteRegistration_V1 Register_SOFTMAX_INT8(); -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int16 input/output and uses the latency optimized implementations. -TfLiteRegistration Register_SOFTMAX_INT16(); +TfLiteRegistration_V1 Register_SOFTMAX_INT16(); #else -inline TfLiteRegistration Register_SOFTMAX_INT8() { return Register_SOFTMAX(); } +inline TfLiteRegistration_V1 Register_SOFTMAX_INT8() { + return Register_SOFTMAX(); +} -inline TfLiteRegistration Register_SOFTMAX_INT16() { +inline TfLiteRegistration_V1 Register_SOFTMAX_INT16() { return Register_SOFTMAX(); } #endif diff --git a/src/tensorflow/lite/micro/kernels/space_to_batch_nd.cpp b/src/tensorflow/lite/micro/kernels/space_to_batch_nd.cpp index 11b32c3f..a4dab2af 100644 --- a/src/tensorflow/lite/micro/kernels/space_to_batch_nd.cpp +++ b/src/tensorflow/lite/micro/kernels/space_to_batch_nd.cpp @@ -114,7 +114,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. -TfLiteRegistration Register_SPACE_TO_BATCH_ND() { +TfLiteRegistration_V1 Register_SPACE_TO_BATCH_ND() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/space_to_depth.cpp b/src/tensorflow/lite/micro/kernels/space_to_depth.cpp index 3640e2cd..99837ee0 100644 --- a/src/tensorflow/lite/micro/kernels/space_to_depth.cpp +++ b/src/tensorflow/lite/micro/kernels/space_to_depth.cpp @@ -120,7 +120,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SPACE_TO_DEPTH() { +TfLiteRegistration_V1 Register_SPACE_TO_DEPTH() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/split.cpp b/src/tensorflow/lite/micro/kernels/split.cpp index 226e4bf7..97d9a2d1 100644 --- a/src/tensorflow/lite/micro/kernels/split.cpp +++ b/src/tensorflow/lite/micro/kernels/split.cpp @@ -118,7 +118,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SPLIT() { +TfLiteRegistration_V1 Register_SPLIT() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/split_v.cpp b/src/tensorflow/lite/micro/kernels/split_v.cpp index 1d2fb559..ef5594eb 100644 --- a/src/tensorflow/lite/micro/kernels/split_v.cpp +++ b/src/tensorflow/lite/micro/kernels/split_v.cpp @@ -120,7 +120,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SPLIT_V() { +TfLiteRegistration_V1 Register_SPLIT_V() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/squared_difference.cpp b/src/tensorflow/lite/micro/kernels/squared_difference.cpp index 8786a871..b0cd389c 100644 --- a/src/tensorflow/lite/micro/kernels/squared_difference.cpp +++ 
b/src/tensorflow/lite/micro/kernels/squared_difference.cpp
@@ -44,6 +44,44 @@ void* SquaredDifferenceInit(TfLiteContext* context, const char* buffer,
   return context->AllocatePersistentBuffer(context, sizeof(OpData));
 }
 
+void PrepareQuantized(
+    const TfLiteQuantizationParams& input1_quantization_params,
+    const TfLiteQuantizationParams& input2_quantization_params,
+    const TfLiteQuantizationParams& output_quantization_params,
+    const int left_shift, const int32_t quantized_activation_min,
+    const int32_t quantized_activation_max, OpData* data) {
+  data->arithmetic_params.input1_offset =
+      -input1_quantization_params.zero_point;
+  data->arithmetic_params.input2_offset =
+      -input2_quantization_params.zero_point;
+  data->arithmetic_params.output_offset = output_quantization_params.zero_point;
+  data->arithmetic_params.left_shift = left_shift;
+  const double twice_max_input_scale =
+      2.0 * static_cast<double>(std::max(input1_quantization_params.scale,
+                                         input2_quantization_params.scale));
+  const double real_input1_multiplier =
+      static_cast<double>(input1_quantization_params.scale) /
+      twice_max_input_scale;
+  double real_input2_multiplier =
+      static_cast<double>(input2_quantization_params.scale) /
+      twice_max_input_scale;
+  const double real_output_multiplier =
+      (twice_max_input_scale * twice_max_input_scale) /
+      static_cast<double>((1 << data->arithmetic_params.left_shift * 2) *
+                          output_quantization_params.scale);
+  QuantizeMultiplierSmallerThanOneExp(
+      real_input1_multiplier, &data->arithmetic_params.input1_multiplier,
+      &data->arithmetic_params.input1_shift);
+  QuantizeMultiplierSmallerThanOneExp(
+      real_input2_multiplier, &data->arithmetic_params.input2_multiplier,
+      &data->arithmetic_params.input2_shift);
+  QuantizeMultiplier(real_output_multiplier,
+                     &data->arithmetic_params.output_multiplier,
+                     &data->arithmetic_params.output_shift);
+  data->arithmetic_params.quantized_activation_min = quantized_activation_min;
+  data->arithmetic_params.quantized_activation_max = quantized_activation_max;
+}
+
 TfLiteStatus SquaredDifferencePrepare(TfLiteContext* context,
                                       TfLiteNode* node) {
   TFLITE_DCHECK(node->user_data != nullptr);
@@ -68,11 +106,10 @@ TfLiteStatus SquaredDifferencePrepare(TfLiteContext* context,
   TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
   output->type = input2->type;
 
-  // Ensure the quantization parameters are equivalent.
+  const TfLiteQuantizationParams& input1_quantization_params = input1->params;
+  const TfLiteQuantizationParams& input2_quantization_params = input2->params;
+  const TfLiteQuantizationParams& output_quantization_params = output->params;
   if (input1->type == kTfLiteInt8) {
-    const auto& input1_quantization_params = input1->params;
-    const auto& input2_quantization_params = input2->params;
-    const auto& output_quantization_params = output->params;
     const int32_t integer_type_min = std::numeric_limits<int8_t>::min();
     const int32_t integer_type_max = std::numeric_limits<int8_t>::max();
     TF_LITE_ENSURE(context,
@@ -87,43 +124,25 @@ TfLiteStatus SquaredDifferencePrepare(TfLiteContext* context,
                    output_quantization_params.zero_point >= integer_type_min);
     TF_LITE_ENSURE(context,
                    output_quantization_params.zero_point <= integer_type_max);
-    data->arithmetic_params.input1_offset =
-        -input1_quantization_params.zero_point;
-    data->arithmetic_params.input2_offset =
-        -input2_quantization_params.zero_point;
-    data->arithmetic_params.output_offset =
-        output_quantization_params.zero_point;
-
-    // shift to make integer for scales.
-    // 7 is selected so that maximum shifted result 255^2 * (1 << (7 * 2 ))
-    // does not overflow signed 32-bit integer
-    data->arithmetic_params.left_shift = 7;
-    const double twice_max_input_scale =
-        2.0 * static_cast<double>(std::max(input1_quantization_params.scale,
-                                           input2_quantization_params.scale));
-    const double real_input1_multiplier =
-        static_cast<double>(input1_quantization_params.scale) /
-        twice_max_input_scale;
-    double real_input2_multiplier =
-        static_cast<double>(input2_quantization_params.scale) /
-        twice_max_input_scale;
-    const double real_output_multiplier =
-        (twice_max_input_scale * twice_max_input_scale) /
-        static_cast<double>((1 << data->arithmetic_params.left_shift * 2) *
-                            output_quantization_params.scale);
-    QuantizeMultiplierSmallerThanOneExp(
-        real_input1_multiplier, &data->arithmetic_params.input1_multiplier,
-        &data->arithmetic_params.input1_shift);
-    QuantizeMultiplierSmallerThanOneExp(
-        real_input2_multiplier, &data->arithmetic_params.input2_multiplier,
-        &data->arithmetic_params.input2_shift);
-    QuantizeMultiplierSmallerThanOneExp(
-        real_output_multiplier, &data->arithmetic_params.output_multiplier,
-        &data->arithmetic_params.output_shift);
-    data->arithmetic_params.quantized_activation_min =
-        std::numeric_limits<int8_t>::min();
-    data->arithmetic_params.quantized_activation_max =
-        std::numeric_limits<int8_t>::max();
+    // left_shift = 7 is selected so that the maximum shifted result
+    // 255^2 * (1 << (7 * 2)) does not overflow a signed 32-bit integer.
+    PrepareQuantized(input1_quantization_params, input2_quantization_params,
+                     output_quantization_params, /*left_shift=*/7,
+                     /*quantized_activation_min=*/integer_type_min,
+                     /*quantized_activation_max=*/integer_type_max, data);
+  } else if (input1->type == kTfLiteInt16) {
+    const int32_t integer_type_min = std::numeric_limits<int16_t>::min();
+    const int32_t integer_type_max = std::numeric_limits<int16_t>::max();
+    TF_LITE_ENSURE(context, input1_quantization_params.zero_point == 0);
+    TF_LITE_ENSURE(context, input2_quantization_params.zero_point == 0);
+    TF_LITE_ENSURE(context, output_quantization_params.zero_point == 0);
+
+    // left_shift = 0 because the values are already 16-bit, so the maximum
+    // shifted result 32767^2 * (1 << (0 * 2)) still fits in a signed
+    // 32-bit integer.
+    PrepareQuantized(input1_quantization_params, input2_quantization_params,
+                     output_quantization_params, /*left_shift=*/0,
+                     /*quantized_activation_min=*/integer_type_min,
+                     /*quantized_activation_max=*/integer_type_max, data);
   }
 
   data->requires_broadcast = !HaveSameShapes(input1, input2);
@@ -134,8 +153,8 @@ TfLiteStatus SquaredDifferencePrepare(TfLiteContext* context,
   return kTfLiteOk;
 }
 
-inline int8_t SquaredDifference(int8_t x, int8_t y,
-                                const ArithmeticParams& params) {
+template <typename T>
+T SquaredDifference(T x, T y, const ArithmeticParams& params) {
   const int32_t input1_val = params.input1_offset + x;
   const int32_t input2_val = params.input2_offset + y;
   const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
@@ -148,16 +167,16 @@ inline int8_t SquaredDifference(int8_t x, int8_t y,
       shifted_input2_val, params.input2_multiplier, params.input2_shift);
 
   const int32_t raw_diff = scaled_input1_val - scaled_input2_val;
-  // Max of this is 255^2 * (1 << 14), so won't overflow 32 bits.
+  // Max of this is 32767^2 * (1 << 0), so won't overflow 32 bits.
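  // Aside (hedged check of the two left_shift choices above): for int8,
  // |raw_diff| can reach 255 << 7, so squared_raw_diff can reach
  // 255^2 * 2^14 = 65025 * 16384 = 1,065,369,600; for int16, left_shift is 0
  // and squared_raw_diff can reach 32767^2 = 1,073,676,289. Both stay below
  // INT32_MAX = 2,147,483,647, while any nonzero shift on the int16 path
  // would overflow, which is why it uses left_shift = 0.
  static_assert(65025LL * 16384LL < 2147483647LL, "int8 path fits in int32_t");
  static_assert(32767LL * 32767LL < 2147483647LL, "int16 path fits in int32_t");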
   const int32_t squared_raw_diff = raw_diff * raw_diff;
   const int32_t raw_output =
-      MultiplyByQuantizedMultiplierSmallerThanOneExp(
-          squared_raw_diff, params.output_multiplier, params.output_shift) +
+      MultiplyByQuantizedMultiplier(squared_raw_diff, params.output_multiplier,
+                                    params.output_shift) +
       params.output_offset;
   const int32_t clamped_output =
       std::min(params.quantized_activation_max,
                std::max(params.quantized_activation_min, raw_output));
-  return static_cast<int8_t>(clamped_output);
+  return static_cast<T>(clamped_output);
 }
 
 template <typename T>
@@ -180,9 +199,9 @@ void EvalQuantizedSquaredDifference(TfLiteContext* context, TfLiteNode* node,
     const int flat_size = tflite::micro::GetTensorShape(input1).FlatSize();
     reference_integer_ops::ElementWise(
         flat_size, op_data->arithmetic_params,
-        tflite::micro::GetTensorData<int8_t>(input1),
-        tflite::micro::GetTensorData<int8_t>(input2),
-        tflite::micro::GetTensorData<int8_t>(output),
+        tflite::micro::GetTensorData<T>(input1),
+        tflite::micro::GetTensorData<T>(input2),
+        tflite::micro::GetTensorData<T>(output),
         reference_integer_ops::CheckArithmeticParams, SquaredDifference);
   }
 }
@@ -228,9 +247,13 @@ TfLiteStatus SquaredDifferenceEval(TfLiteContext* context, TfLiteNode* node) {
   } else if (output->type == kTfLiteInt8) {
     EvalQuantizedSquaredDifference<int8_t>(context, node, data, input1, input2,
                                            output);
+  } else if (output->type == kTfLiteInt16) {
+    EvalQuantizedSquaredDifference<int16_t>(context, node, data, input1, input2,
+                                            output);
   } else {
     MicroPrintf(
-        "SquaredDifference only supports FLOAT32, INT32 and INT8 now, got %d.",
+        "SquaredDifference only supports FLOAT32, INT32, INT16 and INT8 now, "
+        "got %d.",
         output->type);
     return kTfLiteError;
   }
@@ -239,7 +262,7 @@ TfLiteStatus SquaredDifferenceEval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace
 
-TfLiteRegistration Register_SQUARED_DIFFERENCE() {
+TfLiteRegistration_V1 Register_SQUARED_DIFFERENCE() {
   return tflite::micro::RegisterOp(
       SquaredDifferenceInit, SquaredDifferencePrepare, SquaredDifferenceEval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/squeeze.cpp b/src/tensorflow/lite/micro/kernels/squeeze.cpp
index 01753849..3ebf448d 100644
--- a/src/tensorflow/lite/micro/kernels/squeeze.cpp
+++ b/src/tensorflow/lite/micro/kernels/squeeze.cpp
@@ -111,7 +111,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace
 
-TfLiteRegistration Register_SQUEEZE() {
+TfLiteRegistration_V1 Register_SQUEEZE() {
   return tflite::micro::RegisterOp(nullptr, Prepare, Eval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/strided_slice.cpp b/src/tensorflow/lite/micro/kernels/strided_slice.cpp
index fede9548..e31f32c6 100644
--- a/src/tensorflow/lite/micro/kernels/strided_slice.cpp
+++ b/src/tensorflow/lite/micro/kernels/strided_slice.cpp
@@ -200,7 +200,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace
 
-TfLiteRegistration Register_STRIDED_SLICE() {
+TfLiteRegistration_V1 Register_STRIDED_SLICE() {
   return tflite::micro::RegisterOp(Init, Prepare, Eval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/sub.cpp b/src/tensorflow/lite/micro/kernels/sub.cpp
index a54c488f..38df0bb2 100644
--- a/src/tensorflow/lite/micro/kernels/sub.cpp
+++ b/src/tensorflow/lite/micro/kernels/sub.cpp
@@ -161,7 +161,7 @@ TfLiteStatus SubEval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-TfLiteRegistration Register_SUB() {
+TfLiteRegistration_V1 Register_SUB() {
   return tflite::micro::RegisterOp(SubInit, SubPrepare, SubEval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/svdf.h
b/src/tensorflow/lite/micro/kernels/svdf.h index c081cf2f..33390854 100644 --- a/src/tensorflow/lite/micro/kernels/svdf.h +++ b/src/tensorflow/lite/micro/kernels/svdf.h @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -77,13 +77,14 @@ void EvalFloatSvdfReference( TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_SVDF(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_SVDF(); -#if defined(HEXAGON) || defined(ARDUINO) -TfLiteRegistration Register_SVDF_INT8(); +#if defined(HEXAGON) || defined(ARDUINO) || defined(XTENSA) + +TfLiteRegistration_V1 Register_SVDF_INT8(); #else // Note that while this block gets used for both reference and optimized kernels @@ -91,7 +92,7 @@ TfLiteRegistration Register_SVDF_INT8(); // define fallback implementation that allow reference kernels to still be used // from applications that call a more specific kernel variant. -inline TfLiteRegistration Register_SVDF_INT8() { return Register_SVDF(); } +inline TfLiteRegistration_V1 Register_SVDF_INT8() { return Register_SVDF(); } #endif } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/svdf_common.cpp b/src/tensorflow/lite/micro/kernels/svdf_common.cpp index fb92b4fd..d7dd963f 100644 --- a/src/tensorflow/lite/micro/kernels/svdf_common.cpp +++ b/src/tensorflow/lite/micro/kernels/svdf_common.cpp @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -303,6 +303,7 @@ void EvalFloatSvdfReference( tflite::micro::GetTensorData(weights_feature); const float* weights_time_ptr = tflite::micro::GetTensorData(weights_time); + // TODO(#1751): account for optional bias tensor const float* bias_ptr = tflite::micro::GetTensorData(bias); const float* input_ptr = tflite::micro::GetTensorData(input); @@ -459,6 +460,7 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { weights_time->params.scale / output->params.scale); // TODO(b/162018098): Use TF_LITE_ENSURE_NEAR when it is ready. 
+ // TODO(#1751): account for optional bias tensor TF_LITE_ENSURE( context, std::abs(static_cast(bias->params.scale) - @@ -507,6 +509,7 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { micro_context->DeallocateTempTfLiteTensor(weights_time); micro_context->DeallocateTempTfLiteTensor(activation_state); micro_context->DeallocateTempTfLiteTensor(output); + // TODO(#1751): account for optional bias tensor micro_context->DeallocateTempTfLiteTensor(bias); return kTfLiteOk; } diff --git a/src/tensorflow/lite/micro/kernels/tanh.cpp b/src/tensorflow/lite/micro/kernels/tanh.cpp index 33ea8d2b..060cb38c 100644 --- a/src/tensorflow/lite/micro/kernels/tanh.cpp +++ b/src/tensorflow/lite/micro/kernels/tanh.cpp @@ -192,7 +192,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_TANH() { +TfLiteRegistration_V1 Register_TANH() { return tflite::micro::RegisterOp(TanhInit, TanhPrepare, TanhEval); } diff --git a/src/tensorflow/lite/micro/kernels/transpose.cpp b/src/tensorflow/lite/micro/kernels/transpose.cpp index daa75f17..00e907e5 100644 --- a/src/tensorflow/lite/micro/kernels/transpose.cpp +++ b/src/tensorflow/lite/micro/kernels/transpose.cpp @@ -116,7 +116,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_TRANSPOSE() { +TfLiteRegistration_V1 Register_TRANSPOSE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/transpose_conv.cpp b/src/tensorflow/lite/micro/kernels/transpose_conv.cpp index 9ea31454..dc0ee171 100644 --- a/src/tensorflow/lite/micro/kernels/transpose_conv.cpp +++ b/src/tensorflow/lite/micro/kernels/transpose_conv.cpp @@ -166,6 +166,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { micro_context->AllocateTempInputTensor(node, kFilterTensor); TF_LITE_ENSURE(context, filter != nullptr); + TF_LITE_ENSURE_MSG( + context, + input->type == filter->type || + (input->type == kTfLiteInt16 && filter->type == kTfLiteInt8), + "Hybrid models are not supported on TFLite Micro."); + // Get height and width of the output. const int width = SizeOfDimension(output, 2); const int height = SizeOfDimension(output, 1); @@ -253,11 +259,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const OpData& data = *(static_cast(node->user_data)); TF_LITE_ENSURE_EQ(context, input->type, output->type); - TF_LITE_ENSURE_MSG( - context, - input->type == filter->type || - (input->type == kTfLiteInt16 && filter->type == kTfLiteInt8), - "Hybrid models are not supported on TFLite Micro."); switch (input->type) { // Already know in/out types are same. case kTfLiteFloat32: { @@ -344,7 +345,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_TRANSPOSE_CONV() { +TfLiteRegistration_V1 Register_TRANSPOSE_CONV() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h b/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h new file mode 100644 index 00000000..a6071663 --- /dev/null +++ b/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h @@ -0,0 +1,47 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_H_ + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { + +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +// TODO(b/230666079): resolve conflict with xtensa implementation +TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); + +#if defined(ARDUINO) +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports +// int8 activations and int8 weights and uses the latency optimized +// implementations. +TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM_INT8(); + +#else +inline TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM_INT8() { + return Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); +} +#endif + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_H_ diff --git a/src/tensorflow/lite/micro/kernels/unpack.cpp b/src/tensorflow/lite/micro/kernels/unpack.cpp index 4ade8f3f..d6fcf62c 100644 --- a/src/tensorflow/lite/micro/kernels/unpack.cpp +++ b/src/tensorflow/lite/micro/kernels/unpack.cpp @@ -101,7 +101,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_UNPACK() { +TfLiteRegistration_V1 Register_UNPACK() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/var_handle.cpp b/src/tensorflow/lite/micro/kernels/var_handle.cpp index cbd2485c..5ddf90f2 100644 --- a/src/tensorflow/lite/micro/kernels/var_handle.cpp +++ b/src/tensorflow/lite/micro/kernels/var_handle.cpp @@ -86,7 +86,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. -TfLiteRegistration Register_VAR_HANDLE() { +TfLiteRegistration_V1 Register_VAR_HANDLE() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/while.cpp b/src/tensorflow/lite/micro/kernels/while.cpp index 811c9eae..65c5ac8a 100644 --- a/src/tensorflow/lite/micro/kernels/while.cpp +++ b/src/tensorflow/lite/micro/kernels/while.cpp @@ -126,7 +126,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
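// Aside: the new unidirectional_sequence_lstm.h header follows the same
// variant pattern as softmax.h and svdf.h above. A hedged sketch of binding
// the int8-specialized LSTM kernel (the resolver object is hypothetical;
// the registration-taking AddUnidirectionalSequenceLSTM overload is added
// later in this diff, and on non-ARDUINO targets the inline fallback makes
// this identical to the generic registration):
tflite::MicroMutableOpResolver<1> op_resolver;
op_resolver.AddUnidirectionalSequenceLSTM(
    tflite::Register_UNIDIRECTIONAL_SEQUENCE_LSTM_INT8());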
-TfLiteRegistration Register_WHILE() { +TfLiteRegistration_V1 Register_WHILE() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/zeros_like.cpp b/src/tensorflow/lite/micro/kernels/zeros_like.cpp index bb0c3147..5c702abd 100644 --- a/src/tensorflow/lite/micro/kernels/zeros_like.cpp +++ b/src/tensorflow/lite/micro/kernels/zeros_like.cpp @@ -81,7 +81,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_ZEROS_LIKE() { +TfLiteRegistration_V1 Register_ZEROS_LIKE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/memory_helpers.cpp b/src/tensorflow/lite/micro/memory_helpers.cpp index dbc5e014..b306811f 100644 --- a/src/tensorflow/lite/micro/memory_helpers.cpp +++ b/src/tensorflow/lite/micro/memory_helpers.cpp @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -104,7 +104,7 @@ TfLiteStatus BytesRequiredForTensor(const tflite::Tensor& flatbuffer_tensor, // If flatbuffer_tensor.shape == nullptr, then flatbuffer_tensor is a scalar // so has 1 element. if (flatbuffer_tensor.shape() != nullptr) { - for (size_t n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) { + for (size_t n = 0; n < flatbuffer_tensor.shape()->size(); ++n) { element_count *= flatbuffer_tensor.shape()->Get(n); } } diff --git a/src/tensorflow/lite/micro/micro_allocation_info.cpp b/src/tensorflow/lite/micro/micro_allocation_info.cpp index 0160cb14..a89a5e6c 100644 --- a/src/tensorflow/lite/micro/micro_allocation_info.cpp +++ b/src/tensorflow/lite/micro/micro_allocation_info.cpp @@ -179,6 +179,7 @@ TfLiteStatus AllocationInfoBuilder::InitializeAllocationInfo( const int32_t* offline_offsets, SubgraphAllocations* allocations) { AllocationInfo* allocation_info = info_.allocation_info; // Initialize allocation info for every tensor in every subgraph. + int offline_index = 0; for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size(); subgraph_idx++) { const SubGraph* subgraph = model_->subgraphs()->Get(subgraph_idx); @@ -203,7 +204,7 @@ TfLiteStatus AllocationInfoBuilder::InitializeAllocationInfo( (!subgraph->tensors()->Get(i)->is_variable()) && (current->bytes != 0); if (offline_offsets) { - current->offline_offset = offline_offsets[i]; + current->offline_offset = offline_offsets[offline_index++]; // Mark offline planned variable tensors so they can get an offline // offset and be handled offline. 
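// Aside: the offline_index change above matters for multi-subgraph models.
// Offline planned offsets form one flat array spanning every subgraph, so
// reusing the per-subgraph tensor index i would re-read subgraph 0's
// entries. A schematic of the intended traversal (num_subgraphs and
// num_tensors_in are hypothetical helpers for illustration only):
int offline_index = 0;  // advances monotonically across all subgraphs
for (size_t s = 0; s < num_subgraphs; ++s) {
  for (size_t i = 0; i < num_tensors_in(s); ++i) {
    info[s][i].offline_offset = offline_offsets[offline_index++];
  }
}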
@@ -211,7 +212,6 @@ TfLiteStatus AllocationInfoBuilder::InitializeAllocationInfo( current->offline_offset != kOnlinePlannedBuffer) { current->needs_allocating = true; } - } else { current->offline_offset = kOnlinePlannedBuffer; } diff --git a/src/tensorflow/lite/micro/micro_allocator.cpp b/src/tensorflow/lite/micro/micro_allocator.cpp index 4585079c..0b199519 100644 --- a/src/tensorflow/lite/micro/micro_allocator.cpp +++ b/src/tensorflow/lite/micro/micro_allocator.cpp @@ -703,6 +703,14 @@ TfLiteTensor* MicroAllocator::AllocateTempTfLiteTensor( return tensor; } +uint8_t* MicroAllocator::AllocateTempBuffer(size_t size, size_t alignment) { + return non_persistent_buffer_allocator_->AllocateTemp(size, alignment); +} + +void MicroAllocator::DeallocateTempBuffer(uint8_t* buffer) { + non_persistent_buffer_allocator_->DeallocateTemp(buffer); +} + TfLiteStatus MicroAllocator::ResetTempAllocations() { return non_persistent_buffer_allocator_->ResetTempAllocations(); } diff --git a/src/tensorflow/lite/micro/micro_allocator.h b/src/tensorflow/lite/micro/micro_allocator.h index c68c7135..05dbf892 100644 --- a/src/tensorflow/lite/micro/micro_allocator.h +++ b/src/tensorflow/lite/micro/micro_allocator.h @@ -68,7 +68,7 @@ struct ScratchBufferRequest { struct NodeAndRegistration { TfLiteNode node; - const TfLiteRegistration* registration; + const TfLiteRegistration_V1* registration; }; // Holds a pointer to a buffer for a scratch buffer requested by a kernel during @@ -195,6 +195,13 @@ class MicroAllocator { virtual void DeallocateTempTfLiteTensor(TfLiteTensor*); + // Returns a pointer to a buffer from the temporary arena memory and is only + // guaranteed until a call is made to ResetTempAllocations(). + virtual uint8_t* AllocateTempBuffer(size_t size, size_t alignment); + + // Signals that the temporary buffer no longer needed. + virtual void DeallocateTempBuffer(uint8_t* buffer); + // Resets all temporary allocations. This method should be called after a // chain of temp allocations (e.g. chain of TfLiteTensor objects via // AllocateTfLiteTensor()). diff --git a/src/tensorflow/lite/micro/micro_context.cpp b/src/tensorflow/lite/micro/micro_context.cpp index bb78fe70..b06252ac 100644 --- a/src/tensorflow/lite/micro/micro_context.cpp +++ b/src/tensorflow/lite/micro/micro_context.cpp @@ -19,26 +19,34 @@ limitations under the License. 
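// Aside: the AllocateTempBuffer/DeallocateTempBuffer pair added above gives
// kernels arena-backed scratch memory whose lifetime is confined to Prepare.
// A hedged sketch of the intended call pattern from a kernel's Prepare
// (buffer size and use are hypothetical; GetMicroContext is assumed to be
// the existing TFLM accessor for the MicroContext behind a TfLiteContext):
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
  uint8_t* scratch =
      micro_context->AllocateTempBuffer(/*size=*/256, alignof(int32_t));
  if (scratch == nullptr) return kTfLiteError;
  // ... one-off Prepare-time work using scratch ...
  micro_context->DeallocateTempBuffer(scratch);  // release before returning
  return kTfLiteOk;
}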
#include #include +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/micro_log.h" namespace tflite { MicroContext::MicroContext(MicroAllocator* allocator, const Model* model, MicroGraph* graph) - : allocator_(*allocator), graph_(*graph), model_(model) {} + : allocator_(*allocator), + graph_(*graph), + model_(model), + state_(InterpreterState::kInit) {} MicroContext::~MicroContext() {} void* MicroContext::AllocatePersistentBuffer(size_t bytes) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare || + state_ == InterpreterState::kInit); return allocator_.AllocatePersistentBuffer(bytes); } TfLiteStatus MicroContext::RequestScratchBufferInArena(size_t bytes, int* buffer_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare); return allocator_.RequestScratchBufferInArena( bytes, graph_.GetCurrentSubgraphIndex(), buffer_idx); } void* MicroContext::GetScratchBuffer(int buffer_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kInvoke); ScratchBufferHandle* handle = scratch_buffer_handles_ + buffer_idx; return handle->data; } @@ -94,6 +102,16 @@ void MicroContext::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) { return allocator_.DeallocateTempTfLiteTensor(tensor); } +uint8_t* MicroContext::AllocateTempBuffer(size_t size, size_t alignment) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare); + return allocator_.AllocateTempBuffer(size, alignment); +} + +void MicroContext::DeallocateTempBuffer(uint8_t* buffer) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare); + allocator_.DeallocateTempBuffer(buffer); +} + TfLiteEvalTensor* MicroContext::GetEvalTensor(int tensor_idx) { return &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()] .tensors[tensor_idx]; @@ -106,6 +124,8 @@ void MicroContext::SetScratchBufferHandles( TfLiteStatus MicroContext::set_external_context( void* external_context_payload) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare || + state_ == InterpreterState::kInvoke); if (external_context_payload == nullptr || external_context_payload_ != nullptr) { MicroPrintf( @@ -126,4 +146,12 @@ void MicroContextReportOpError(struct TfLiteContext* context, va_end(args); } +void MicroContext::SetInterpreterState(MicroContext::InterpreterState state) { + state_ = state; +} + +MicroContext::InterpreterState MicroContext::GetInterpreterState() const { + return state_; +} + } // namespace tflite diff --git a/src/tensorflow/lite/micro/micro_context.h b/src/tensorflow/lite/micro/micro_context.h index e7be6544..63b4b7d5 100644 --- a/src/tensorflow/lite/micro/micro_context.h +++ b/src/tensorflow/lite/micro/micro_context.h @@ -29,6 +29,15 @@ namespace tflite { // micro_context-> class MicroContext { public: + // Enum that allows MicroContext to keep track of the stages different memory + // planning APIs are available to kernels. + enum class InterpreterState { + kInit, + kPrepare, + kMemoryPlanning, + kInvoke, + }; + // Does not take any ownership, and all pointers must refer to valid objects // that outlive the one constructed. explicit MicroContext(MicroAllocator* allocator, const Model* model, @@ -84,10 +93,26 @@ class MicroContext { // Virtual so that it can be faked for kernel tests. virtual void DeallocateTempTfLiteTensor(TfLiteTensor* tensor); + // Returns a pointer to a temporary buffer (from the arena). + // This API is only valid from the kernel's Prepare function and + // the buffer's lifetime is also that of the Prepare function. + // Virtual so that it can be faked for kernel tests. 
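// Aside: a hedged summary of the stage gating introduced above, read
// directly from the TFLITE_DCHECKs in micro_context.cpp (not an official
// table):
//   kInit           -> AllocatePersistentBuffer
//   kPrepare        -> AllocatePersistentBuffer, RequestScratchBufferInArena,
//                      AllocateTempBuffer / DeallocateTempBuffer,
//                      set_external_context
//   kMemoryPlanning -> no kernel-facing allocation calls
//   kInvoke         -> GetScratchBuffer, set_external_context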
+ virtual uint8_t* AllocateTempBuffer(size_t size, size_t alignment); + + // Signals that the temporary buffer is no longer needed. + // Virtual so that it can be faked for kernel tests. + virtual void DeallocateTempBuffer(uint8_t* buffer); + // Returns a TfLiteEvalTensor struct for a given index. // Virtual so that it can be faked for kernel tests. virtual TfLiteEvalTensor* GetEvalTensor(int tensor_idx); + // Sets the State of MemoryPlanning MicroContext + void SetInterpreterState(MicroContext::InterpreterState state); + + // Sets the State of MemoryPlanning MicroContext + MicroContext::InterpreterState GetInterpreterState() const; + // Does not take ownership of the pointer and the pointer must refer to valid // an object that outlive this class instance. // This can only be called once to set one external context. @@ -110,6 +135,7 @@ class MicroContext { MicroAllocator& allocator_; MicroGraph& graph_; const Model* model_; + InterpreterState state_; ScratchBufferHandle* scratch_buffer_handles_ = nullptr; void* external_context_payload_ = nullptr; diff --git a/src/tensorflow/lite/micro/micro_graph.cpp b/src/tensorflow/lite/micro/micro_graph.cpp index 6007e2d3..4d412e73 100644 --- a/src/tensorflow/lite/micro/micro_graph.cpp +++ b/src/tensorflow/lite/micro/micro_graph.cpp @@ -27,7 +27,7 @@ limitations under the License. namespace tflite { namespace { -const char* OpNameFromRegistration(const TfLiteRegistration* registration) { +const char* OpNameFromRegistration(const TfLiteRegistration_V1* registration) { if (registration->builtin_code == BuiltinOperator_CUSTOM) { return registration->custom_name; } else { @@ -62,7 +62,7 @@ TfLiteStatus MicroGraph::InitSubgraphs() { for (size_t i = 0; i < operators_size; ++i) { TfLiteNode* node = &(subgraph_allocations_[subgraph_idx].node_and_registrations[i].node); - const TfLiteRegistration* registration = + const TfLiteRegistration_V1* registration = subgraph_allocations_[subgraph_idx] .node_and_registrations[i] .registration; @@ -96,7 +96,7 @@ TfLiteStatus MicroGraph::PrepareSubgraphs() { for (size_t i = 0; i < operators_size; ++i) { TfLiteNode* node = &(subgraph_allocations_[subgraph_idx].node_and_registrations[i].node); - const TfLiteRegistration* registration = + const TfLiteRegistration_V1* registration = subgraph_allocations_[subgraph_idx] .node_and_registrations[i] .registration; @@ -126,7 +126,7 @@ TfLiteStatus MicroGraph::FreeSubgraphs() { for (size_t i = 0; i < operators_size; ++i) { TfLiteNode* node = &(subgraph_allocations_[subgraph_idx].node_and_registrations[i].node); - const TfLiteRegistration* registration = + const TfLiteRegistration_V1* registration = subgraph_allocations_[subgraph_idx] .node_and_registrations[i] .registration; @@ -155,9 +155,10 @@ TfLiteStatus MicroGraph::InvokeSubgraph(int subgraph_idx) { for (size_t i = 0; i < operators_size; ++i) { TfLiteNode* node = &(subgraph_allocations_[subgraph_idx].node_and_registrations[i].node); - const TfLiteRegistration* registration = subgraph_allocations_[subgraph_idx] - .node_and_registrations[i] - .registration; + const TfLiteRegistration_V1* registration = + subgraph_allocations_[subgraph_idx] + .node_and_registrations[i] + .registration; // This ifdef is needed (even though ScopedMicroProfiler itself is a no-op with // -DTF_LITE_STRIP_ERROR_STRINGS) because the function OpNameFromRegistration is diff --git a/src/tensorflow/lite/micro/micro_graph.h b/src/tensorflow/lite/micro/micro_graph.h index 942082ac..ce93d339 100644 --- a/src/tensorflow/lite/micro/micro_graph.h +++ 
b/src/tensorflow/lite/micro/micro_graph.h @@ -38,20 +38,20 @@ class MicroGraph { MicroResourceVariables* resource_variables); virtual ~MicroGraph(); - // Sets up builtin data and calls TfLiteRegistration->Init for every operator - // in every subgraph in the model. + // Sets up builtin data and calls TfLiteRegistration_V1->Init for every + // operator in every subgraph in the model. virtual TfLiteStatus InitSubgraphs(); - // Calls TfLiteRegistration->Prepare for every operator in every subgraph in - // the model. + // Calls TfLiteRegistration_V1->Prepare for every operator in every subgraph + // in the model. virtual TfLiteStatus PrepareSubgraphs(); - // Calls TfLiteRegistration->Free for every operator in every subgraph in the - // model. + // Calls TfLiteRegistration_V1->Free for every operator in every subgraph in + // the model. virtual TfLiteStatus FreeSubgraphs(); - // Calls TfLiteRegistration->Invoke for every operator in a single subgraph in - // the model. + // Calls TfLiteRegistration_V1->Invoke for every operator in a single subgraph + // in the model. virtual TfLiteStatus InvokeSubgraph(int subgraph_idx); // Zeros out all variable tensors in all subgraphs in the model. diff --git a/src/tensorflow/lite/micro/micro_interpreter.cpp b/src/tensorflow/lite/micro/micro_interpreter.cpp index 91c7481d..75c3f628 100644 --- a/src/tensorflow/lite/micro/micro_interpreter.cpp +++ b/src/tensorflow/lite/micro/micro_interpreter.cpp @@ -24,11 +24,11 @@ limitations under the License. #include "tensorflow/lite/micro/flatbuffer_utils.h" #include "tensorflow/lite/micro/memory_helpers.h" #include "tensorflow/lite/micro/micro_allocator.h" +#include "tensorflow/lite/micro/micro_context.h" #include "tensorflow/lite/micro/micro_log.h" #include "tensorflow/lite/micro/micro_op_resolver.h" #include "tensorflow/lite/micro/micro_profiler_interface.h" #include "tensorflow/lite/micro/tflite_bridge/flatbuffer_conversions_bridge.h" -#include "tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/schema/schema_utils.h" @@ -77,11 +77,17 @@ MicroInterpreter::~MicroInterpreter() { } void MicroInterpreter::Init(MicroProfilerInterface* profiler) { + micro_context_.SetInterpreterState(MicroContext::InterpreterState::kInit); context_.impl_ = static_cast(µ_context_); context_.ReportError = MicroContextReportOpError; context_.GetTensor = MicroContextGetTensor; context_.GetEvalTensor = MicroContextGetEvalTensor; context_.profiler = profiler; + context_.RequestScratchBufferInArena = + MicroContextRequestScratchBufferInArena; + context_.GetExternalContext = MicroContextGetExternalContext; + context_.AllocatePersistentBuffer = MicroContextAllocatePersistentBuffer; + context_.GetScratchBuffer = MicroContextGetScratchBuffer; initialization_status_ = kTfLiteOk; } @@ -192,27 +198,15 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { TF_LITE_ENSURE_STATUS(PrepareNodeAndRegistrationDataFromFlatbuffer()); - // Only allow AllocatePersistentBuffer in Init stage. - context_.AllocatePersistentBuffer = MicroContextAllocatePersistentBuffer; - context_.RequestScratchBufferInArena = nullptr; - context_.GetScratchBuffer = nullptr; - context_.GetExternalContext = nullptr; + micro_context_.SetInterpreterState(MicroContext::InterpreterState::kInit); TF_LITE_ENSURE_STATUS(graph_.InitSubgraphs()); - // Both AllocatePersistentBuffer and RequestScratchBufferInArena is - // available in Prepare stage. 
- context_.RequestScratchBufferInArena = - MicroContextRequestScratchBufferInArena; - // external_context become available in Prepare stage. - context_.GetExternalContext = MicroContextGetExternalContext; + micro_context_.SetInterpreterState(MicroContext::InterpreterState::kPrepare); TF_LITE_ENSURE_STATUS(graph_.PrepareSubgraphs()); - // Prepare is done, we're ready for Invoke. Memory allocation is no longer - // allowed. Kernels can only fetch scratch buffers via GetScratchBuffer. - context_.AllocatePersistentBuffer = nullptr; - context_.RequestScratchBufferInArena = nullptr; - context_.GetScratchBuffer = MicroContextGetScratchBuffer; + micro_context_.SetInterpreterState( + MicroContext::InterpreterState::kMemoryPlanning); TF_LITE_ENSURE_OK(&context_, allocator_.FinishModelAllocation( model_, graph_.GetAllocations(), @@ -267,6 +261,7 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { TF_LITE_ENSURE_STATUS(Reset()); tensors_allocated_ = true; + micro_context_.SetInterpreterState(MicroContext::InterpreterState::kInvoke); return kTfLiteOk; } diff --git a/src/tensorflow/lite/micro/micro_mutable_op_resolver.h b/src/tensorflow/lite/micro/micro_mutable_op_resolver.h index 2898d193..c9a2c8fd 100644 --- a/src/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/src/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -37,7 +37,7 @@ limitations under the License. #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { -TfLiteRegistration* Register_DETECTION_POSTPROCESS(); +TfLiteRegistration_V1* Register_DETECTION_POSTPROCESS(); template class MicroMutableOpResolver : public MicroOpResolver { @@ -46,11 +46,12 @@ class MicroMutableOpResolver : public MicroOpResolver { explicit MicroMutableOpResolver() {} - const TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override { + const TfLiteRegistration_V1* FindOp( + tflite::BuiltinOperator op) const override { if (op == BuiltinOperator_CUSTOM) return nullptr; for (unsigned int i = 0; i < registrations_len_; ++i) { - const TfLiteRegistration& registration = registrations_[i]; + const TfLiteRegistration_V1& registration = registrations_[i]; if (registration.builtin_code == op) { return ®istration; } @@ -58,9 +59,9 @@ class MicroMutableOpResolver : public MicroOpResolver { return nullptr; } - const TfLiteRegistration* FindOp(const char* op) const override { + const TfLiteRegistration_V1* FindOp(const char* op) const override { for (unsigned int i = 0; i < registrations_len_; ++i) { - const TfLiteRegistration& registration = registrations_[i]; + const TfLiteRegistration_V1& registration = registrations_[i]; if ((registration.builtin_code == BuiltinOperator_CUSTOM) && (strcmp(registration.custom_name, op) == 0)) { return ®istration; @@ -84,7 +85,8 @@ class MicroMutableOpResolver : public MicroOpResolver { // function is called again for a previously added Custom Operator, the // MicroOpResolver will be unchanged and this function will return // kTfLiteError. 
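// Aside: with these changes the resolver stores and hands out
// TfLiteRegistration_V1 throughout. A hedged end-to-end sketch of typical
// application wiring (the op selection and the MyPackerOp helper are
// hypothetical; compare the PackerOp test helper further down in this diff):
tflite::MicroMutableOpResolver<3> op_resolver;
op_resolver.AddReshape();
op_resolver.AddSoftmax();  // defaults to the generic Register_SOFTMAX()
op_resolver.AddCustom("my-packer-op", MyPackerOp::GetMutableRegistration());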
- TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration) { + TfLiteStatus AddCustom(const char* name, + TfLiteRegistration_V1* registration) { if (registrations_len_ >= tOpCount) { MicroPrintf( "Couldn't register custom op '%s', resolver size is too" @@ -99,7 +101,8 @@ class MicroMutableOpResolver : public MicroOpResolver { return kTfLiteError; } - TfLiteRegistration* new_registration = ®istrations_[registrations_len_]; + TfLiteRegistration_V1* new_registration = + ®istrations_[registrations_len_]; registrations_len_ += 1; *new_registration = *registration; @@ -112,11 +115,11 @@ class MicroMutableOpResolver : public MicroOpResolver { // MicroMutableOpResolver object. TfLiteStatus AddAbs() { - return AddBuiltin(BuiltinOperator_ABS, tflite::ops::micro::Register_ABS(), - ParseAbs); + return AddBuiltin(BuiltinOperator_ABS, Register_ABS(), ParseAbs); } - TfLiteStatus AddAdd(const TfLiteRegistration& registration = Register_ADD()) { + TfLiteStatus AddAdd( + const TfLiteRegistration_V1& registration = Register_ADD()) { return AddBuiltin(BuiltinOperator_ADD, registration, ParseAdd); } @@ -139,7 +142,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddAveragePool2D( - const TfLiteRegistration& registration = Register_AVERAGE_POOL_2D()) { + const TfLiteRegistration_V1& registration = Register_AVERAGE_POOL_2D()) { return AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, registration, ParsePool); } @@ -181,13 +184,12 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddConv2D( - const TfLiteRegistration& registration = Register_CONV_2D()) { + const TfLiteRegistration_V1& registration = Register_CONV_2D()) { return AddBuiltin(BuiltinOperator_CONV_2D, registration, ParseConv2D); } TfLiteStatus AddCos() { - return AddBuiltin(BuiltinOperator_COS, tflite::ops::micro::Register_COS(), - ParseCos); + return AddBuiltin(BuiltinOperator_COS, tflite::Register_COS(), ParseCos); } TfLiteStatus AddCumSum() { @@ -200,8 +202,8 @@ class MicroMutableOpResolver : public MicroOpResolver { tflite::Register_DEPTH_TO_SPACE(), ParseDepthToSpace); } - TfLiteStatus AddDepthwiseConv2D( - const TfLiteRegistration& registration = Register_DEPTHWISE_CONV_2D()) { + TfLiteStatus AddDepthwiseConv2D(const TfLiteRegistration_V1& registration = + Register_DEPTHWISE_CONV_2D()) { return AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, registration, ParseDepthwiseConv2D); } @@ -229,7 +231,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddEthosU() { - TfLiteRegistration* registration = tflite::Register_ETHOSU(); + TfLiteRegistration_V1* registration = tflite::Register_ETHOSU(); if (registration) { return AddCustom(tflite::GetString_ETHOSU(), registration); } @@ -264,7 +266,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddFullyConnected( - const TfLiteRegistration& registration = Register_FULLY_CONNECTED()) { + const TfLiteRegistration_V1& registration = Register_FULLY_CONNECTED()) { return AddBuiltin(BuiltinOperator_FULLY_CONNECTED, registration, ParseFullyConnected); } @@ -323,8 +325,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddLog() { - return AddBuiltin(BuiltinOperator_LOG, tflite::ops::micro::Register_LOG(), - ParseLog); + return AddBuiltin(BuiltinOperator_LOG, Register_LOG(), ParseLog); } TfLiteStatus AddLogicalAnd() { @@ -333,8 +334,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddLogicalNot() { - return AddBuiltin(BuiltinOperator_LOGICAL_NOT, - 
tflite::ops::micro::Register_LOGICAL_NOT(), + return AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT(), ParseLogicalNot); } @@ -359,7 +359,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddMaxPool2D( - const TfLiteRegistration& registration = Register_MAX_POOL_2D()) { + const TfLiteRegistration_V1& registration = Register_MAX_POOL_2D()) { return AddBuiltin(BuiltinOperator_MAX_POOL_2D, registration, ParsePool); } @@ -377,7 +377,8 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseMinimum); } - TfLiteStatus AddMul(const TfLiteRegistration& registration = Register_MUL()) { + TfLiteStatus AddMul( + const TfLiteRegistration_V1& registration = Register_MUL()) { return AddBuiltin(BuiltinOperator_MUL, registration, ParseMul); } @@ -394,7 +395,8 @@ class MicroMutableOpResolver : public MicroOpResolver { return AddBuiltin(BuiltinOperator_PACK, Register_PACK(), ParsePack); } - TfLiteStatus AddPad(const TfLiteRegistration& registration = Register_PAD()) { + TfLiteStatus AddPad( + const TfLiteRegistration_V1& registration = Register_PAD()) { return AddBuiltin(BuiltinOperator_PAD, registration, ParsePad); } @@ -453,8 +455,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddRsqrt() { - return AddBuiltin(BuiltinOperator_RSQRT, - tflite::ops::micro::Register_RSQRT(), ParseRsqrt); + return AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT(), ParseRsqrt); } TfLiteStatus AddSelectV2() { @@ -467,8 +468,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddSin() { - return AddBuiltin(BuiltinOperator_SIN, tflite::ops::micro::Register_SIN(), - ParseSin); + return AddBuiltin(BuiltinOperator_SIN, Register_SIN(), ParseSin); } TfLiteStatus AddSlice() { @@ -476,7 +476,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddSoftmax( - const TfLiteRegistration& registration = Register_SOFTMAX()) { + const TfLiteRegistration_V1& registration = Register_SOFTMAX()) { return AddBuiltin(BuiltinOperator_SOFTMAX, registration, ParseSoftmax); } @@ -504,13 +504,11 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddSqrt() { - return AddBuiltin(BuiltinOperator_SQRT, tflite::ops::micro::Register_SQRT(), - ParseSqrt); + return AddBuiltin(BuiltinOperator_SQRT, Register_SQRT(), ParseSqrt); } TfLiteStatus AddSquare() { - return AddBuiltin(BuiltinOperator_SQUARE, - tflite::ops::micro::Register_SQUARE(), ParseSquare); + return AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE(), ParseSquare); } TfLiteStatus AddSquaredDifference() { @@ -533,7 +531,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddSvdf( - const TfLiteRegistration& registration = Register_SVDF()) { + const TfLiteRegistration_V1& registration = Register_SVDF()) { return AddBuiltin(BuiltinOperator_SVDF, registration, ParseSvdf); } @@ -555,10 +553,11 @@ class MicroMutableOpResolver : public MicroOpResolver { return AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK(), ParseUnpack); } - TfLiteStatus AddUnidirectionalSequenceLSTM() { + TfLiteStatus AddUnidirectionalSequenceLSTM( + const TfLiteRegistration_V1& registration = + Register_UNIDIRECTIONAL_SEQUENCE_LSTM()) { return AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, - Register_UNIDIRECTIONAL_SEQUENCE_LSTM(), - ParseUnidirectionalSequenceLSTM); + registration, ParseUnidirectionalSequenceLSTM); } TfLiteStatus AddVarHandle() { @@ -579,7 +578,7 @@ class MicroMutableOpResolver : public MicroOpResolver { private: TfLiteStatus 
AddBuiltin(tflite::BuiltinOperator op, - const TfLiteRegistration& registration, + const TfLiteRegistration_V1& registration, TfLiteBridgeBuiltinParseFunction parser) { if (op == BuiltinOperator_CUSTOM) { MicroPrintf("Invalid parameter BuiltinOperator_CUSTOM to the "); @@ -612,7 +611,7 @@ class MicroMutableOpResolver : public MicroOpResolver { return kTfLiteOk; } - TfLiteRegistration registrations_[tOpCount]; + TfLiteRegistration_V1 registrations_[tOpCount]; unsigned int registrations_len_ = 0; // Arrays (and counter) to store the builtin codes and their corresponding diff --git a/src/tensorflow/lite/micro/micro_op_resolver.cpp b/src/tensorflow/lite/micro/micro_op_resolver.cpp new file mode 100644 index 00000000..7463e5af --- /dev/null +++ b/src/tensorflow/lite/micro/micro_op_resolver.cpp @@ -0,0 +1,55 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/micro_op_resolver.h" + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/schema/schema_utils.h" + +namespace tflite { + +TfLiteStatus GetRegistrationFromOpCode( + const OperatorCode* opcode, const MicroOpResolver& op_resolver, + const TfLiteRegistration_V1** registration) { + TfLiteStatus status = kTfLiteOk; + *registration = nullptr; + auto builtin_code = GetBuiltinCode(opcode); + + if (builtin_code > BuiltinOperator_MAX) { + MicroPrintf("Op builtin_code out of range: %d.", builtin_code); + status = kTfLiteError; + } else if (builtin_code != BuiltinOperator_CUSTOM) { + *registration = op_resolver.FindOp(builtin_code); + if (*registration == nullptr) { + MicroPrintf("Didn't find op for builtin opcode '%s'", + EnumNameBuiltinOperator(builtin_code)); + status = kTfLiteError; + } + } else if (!opcode->custom_code()) { + MicroPrintf("Operator with CUSTOM builtin_code has no custom_code.\n"); + status = kTfLiteError; + } else { + const char* name = opcode->custom_code()->c_str(); + *registration = op_resolver.FindOp(name); + if (*registration == nullptr) { + // Do not report error for unresolved custom op, we do the final check + // while preparing ops. + status = kTfLiteError; + } + } + return status; +} +} // namespace tflite diff --git a/src/tensorflow/lite/micro/micro_op_resolver.h b/src/tensorflow/lite/micro/micro_op_resolver.h index 02b07313..ed8b10e1 100644 --- a/src/tensorflow/lite/micro/micro_op_resolver.h +++ b/src/tensorflow/lite/micro/micro_op_resolver.h @@ -17,7 +17,6 @@ limitations under the License. 
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/micro/tflite_bridge/flatbuffer_conversions_bridge.h" -#include "tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h" #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { @@ -31,38 +30,32 @@ namespace tflite { // We need an interface class instead of directly using MicroMutableOpResolver // because MicroMutableOpResolver is a class template with the number of // registered Ops as the template parameter. -class MicroOpResolver : public TfLiteBridgeOpResolver { +class MicroOpResolver { public: // Returns the Op registration struct corresponding to the enum code from the // flatbuffer schema. Returns nullptr if the op is not found or if op == // BuiltinOperator_CUSTOM. - virtual const TfLiteRegistration* FindOp(BuiltinOperator op) const = 0; + virtual const TfLiteRegistration_V1* FindOp(BuiltinOperator op) const = 0; // Returns the Op registration struct corresponding to the custom operator by // name. - virtual const TfLiteRegistration* FindOp(const char* op) const = 0; - - // This implementation exists for compatibility with the OpResolver base class - // and disregards the version parameter. - const TfLiteRegistration* FindOp(BuiltinOperator op, - int version) const final { - return FindOp(op); - } - - // This implementation exists for compatibility with the OpResolver base class - // and disregards the version parameter. - const TfLiteRegistration* FindOp(const char* op, int version) const final { - return FindOp(op); - } + virtual const TfLiteRegistration_V1* FindOp(const char* op) const = 0; // Returns the operator specific parsing function for the OpData for a // BuiltinOperator (if registered), else nullptr. virtual TfLiteBridgeBuiltinParseFunction GetOpDataParser( BuiltinOperator op) const = 0; - ~MicroOpResolver() override {} + virtual ~MicroOpResolver() {} }; +// Handles the logic for converting between an OperatorCode structure extracted +// from a flatbuffer and information about a registered operator +// implementation. 
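// Aside: a hedged sketch of how a caller consumes the helper declared just
// below (ResolveOp is a hypothetical wrapper; the opcode would come from the
// model's operator_codes() table):
TfLiteStatus ResolveOp(const tflite::OperatorCode* opcode,
                       const tflite::MicroOpResolver& op_resolver) {
  const TfLiteRegistration_V1* registration = nullptr;
  TF_LITE_ENSURE_STATUS(
      tflite::GetRegistrationFromOpCode(opcode, op_resolver, &registration));
  // On success, registration points at the resolver-owned entry for the op.
  return kTfLiteOk;
}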
+TfLiteStatus GetRegistrationFromOpCode( + const OperatorCode* opcode, const MicroOpResolver& op_resolver, + const TfLiteRegistration_V1** registration); + } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_MICRO_OP_RESOLVER_H_ diff --git a/src/tensorflow/lite/micro/micro_profiler.cpp b/src/tensorflow/lite/micro/micro_profiler.cpp index e9eb5e54..c3f0f4f1 100644 --- a/src/tensorflow/lite/micro/micro_profiler.cpp +++ b/src/tensorflow/lite/micro/micro_profiler.cpp @@ -26,7 +26,11 @@ namespace tflite { uint32_t MicroProfiler::BeginEvent(const char* tag) { if (num_events_ == kMaxEvents) { - num_events_ = 0; + MicroPrintf( + "MicroProfiler errored out because total number of events exceeded the " + "maximum of %d.", + kMaxEvents); + TFLITE_ASSERT_FALSE; } tags_[num_events_] = tag; @@ -52,8 +56,7 @@ void MicroProfiler::Log() const { #if !defined(TF_LITE_STRIP_ERROR_STRINGS) for (int i = 0; i < num_events_; ++i) { uint32_t ticks = end_ticks_[i] - start_ticks_[i]; - MicroPrintf("%s took %" PRIu32 " ticks (%d ms).", tags_[i], ticks, - TicksToMs(ticks)); + MicroPrintf("%s took %u ticks (%d ms).", tags_[i], ticks, TicksToMs(ticks)); } #endif } diff --git a/src/tensorflow/lite/micro/micro_profiler.h b/src/tensorflow/lite/micro/micro_profiler.h index d1136474..1c39ea1c 100644 --- a/src/tensorflow/lite/micro/micro_profiler.h +++ b/src/tensorflow/lite/micro/micro_profiler.h @@ -69,7 +69,7 @@ class MicroProfiler : public MicroProfilerInterface { // Maximum number of events that this class can keep track of. If we call // AddEvent more than kMaxEvents number of times, then the oldest event's // profiling information will be overwritten. - static constexpr int kMaxEvents = 1024; + static constexpr int kMaxEvents = 4096; const char* tags_[kMaxEvents]; uint32_t start_ticks_[kMaxEvents]; diff --git a/src/tensorflow/lite/micro/test_helper_custom_ops.cpp b/src/tensorflow/lite/micro/test_helper_custom_ops.cpp index b87cb5ae..15c450a6 100644 --- a/src/tensorflow/lite/micro/test_helper_custom_ops.cpp +++ b/src/tensorflow/lite/micro/test_helper_custom_ops.cpp @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -36,12 +35,12 @@ limitations under the License. namespace tflite { namespace testing { -const TfLiteRegistration* PackerOp::getRegistration() { +const TfLiteRegistration_V1* PackerOp::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* PackerOp::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* PackerOp::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = Prepare; r.invoke = Invoke; diff --git a/src/tensorflow/lite/micro/test_helper_custom_ops.h b/src/tensorflow/lite/micro/test_helper_custom_ops.h index 9c950fc9..1ae95e4a 100644 --- a/src/tensorflow/lite/micro/test_helper_custom_ops.h +++ b/src/tensorflow/lite/micro/test_helper_custom_ops.h @@ -23,7 +23,6 @@ limitations under the License. 
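A usage note on the MicroProfiler change above: BeginEvent() used to wrap num_events_ back to zero and silently overwrite the oldest entries, and the unchanged header comment still describes that old behavior; with this patch, overflowing the event table is a hard error. A minimal sketch, assuming the upstream BeginEvent/EndEvent/Log API; the function and tag names are illustrative:

#include "tensorflow/lite/micro/micro_profiler.h"

void ProfileOneInvoke(tflite::MicroProfiler& profiler) {
  // Exceeding kMaxEvents (now 4096) triggers MicroPrintf + TFLITE_ASSERT_FALSE
  // instead of a silent wrap-around.
  uint32_t handle = profiler.BeginEvent("invoke");
  // ... timed work goes here ...
  profiler.EndEvent(handle);
  profiler.Log();  // one line per event; ticks now printed with a plain %u
}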
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/portable_type_to_tflitetype.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -33,8 +32,8 @@ namespace testing { class PackerOp { public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static void Free(TfLiteContext* context, void* buffer); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); diff --git a/src/tensorflow/lite/micro/test_helpers.cpp b/src/tensorflow/lite/micro/test_helpers.cpp index bf81f140..d69a8e23 100644 --- a/src/tensorflow/lite/micro/test_helpers.cpp +++ b/src/tensorflow/lite/micro/test_helpers.cpp @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/memory_helpers.h" #include "tensorflow/lite/micro/micro_arena_constants.h" @@ -1431,12 +1430,12 @@ const Model* BuildSimpleMockModelWithNullInputsOutputs() { } // namespace -const TfLiteRegistration* SimpleStatefulOp::getRegistration() { +const TfLiteRegistration_V1* SimpleStatefulOp::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* SimpleStatefulOp::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* SimpleStatefulOp::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = Prepare; r.invoke = Invoke; @@ -1445,10 +1444,6 @@ TfLiteRegistration* SimpleStatefulOp::GetMutableRegistration() { void* SimpleStatefulOp::Init(TfLiteContext* context, const char* buffer, size_t length) { - TFLITE_DCHECK(context->AllocateBufferForEval == nullptr); - TFLITE_DCHECK(context->GetScratchBuffer == nullptr); - TFLITE_DCHECK(context->RequestScratchBufferInArena == nullptr); - void* raw = context->AllocatePersistentBuffer(context, sizeof(OpData)); OpData* data = reinterpret_cast(raw); *data = {}; @@ -1521,12 +1516,12 @@ TfLiteStatus SimpleStatefulOp::Invoke(TfLiteContext* context, return kTfLiteOk; } -const TfLiteRegistration* MockCustom::getRegistration() { +const TfLiteRegistration_V1* MockCustom::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* MockCustom::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* MockCustom::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = Prepare; r.invoke = Invoke; @@ -1569,12 +1564,12 @@ TfLiteStatus MockCustom::Invoke(TfLiteContext* context, TfLiteNode* node) { bool MockCustom::freed_ = false; -const TfLiteRegistration* MultipleInputs::getRegistration() { +const TfLiteRegistration_V1* MultipleInputs::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* MultipleInputs::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* MultipleInputs::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = 
Prepare; r.invoke = Invoke; @@ -1624,12 +1619,12 @@ TfLiteStatus MultipleInputs::Invoke(TfLiteContext* context, TfLiteNode* node) { bool MultipleInputs::freed_ = false; -const TfLiteRegistration* NoOp::getRegistration() { +const TfLiteRegistration_V1* NoOp::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* NoOp::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* NoOp::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = Prepare; r.invoke = Invoke; @@ -1658,16 +1653,20 @@ TfLiteStatus NoOp::Invoke(TfLiteContext* context, TfLiteNode* node) { bool NoOp::freed_ = false; -AllOpsResolver GetOpResolver() { - AllOpsResolver op_resolver; - op_resolver.AddCustom("mock_custom", MockCustom::GetMutableRegistration()); - op_resolver.AddCustom("simple_stateful_op", - SimpleStatefulOp::GetMutableRegistration()); - op_resolver.AddCustom("multiple_inputs_op", - MultipleInputs::GetMutableRegistration()); - op_resolver.AddCustom("no_op", NoOp::GetMutableRegistration()); - op_resolver.AddCustom("custom_packer_op", PackerOp::GetMutableRegistration()); - return op_resolver; +TfLiteStatus GetTestingOpResolver( + tflite::testing::TestingOpResolver& op_resolver) { + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "mock_custom", MockCustom::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "simple_stateful_op", SimpleStatefulOp::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "multiple_inputs_op", MultipleInputs::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS( + op_resolver.AddCustom("no_op", NoOp::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "custom_packer_op", PackerOp::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddIf()); + return kTfLiteOk; } const Model* GetModelWithUnusedInputs() { diff --git a/src/tensorflow/lite/micro/test_helpers.h b/src/tensorflow/lite/micro/test_helpers.h index 728a0bbf..20dbc090 100644 --- a/src/tensorflow/lite/micro/test_helpers.h +++ b/src/tensorflow/lite/micro/test_helpers.h @@ -23,7 +23,7 @@ limitations under the License. 
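To show the shape of the new test setup (a sketch, not from the patch; InitTestResolver is an illustrative name): instead of receiving an AllOpsResolver by value, tests now populate a caller-owned, fixed-capacity resolver and must check the returned status.

#include "tensorflow/lite/micro/test_helpers.h"

TfLiteStatus InitTestResolver(tflite::testing::TestingOpResolver& resolver) {
  // Registers the five custom test ops plus the builtin IF kernel; a failed
  // AddCustom/AddIf (e.g. resolver capacity exhausted) propagates as a
  // status instead of being silently ignored as in the old GetOpResolver().
  return tflite::testing::GetTestingOpResolver(resolver);
}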
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/micro/all_ops_resolver.h" +#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/portable_type_to_tflitetype.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -32,6 +32,7 @@ namespace tflite { namespace testing { constexpr int kOfflinePlannerHeaderSize = 3; +using TestingOpResolver = tflite::MicroMutableOpResolver<10>; struct NodeConnection_ { std::initializer_list input; @@ -55,8 +56,8 @@ class SimpleStatefulOp { }; public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); static TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node); @@ -64,8 +65,8 @@ class SimpleStatefulOp { class MockCustom { public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static void Free(TfLiteContext* context, void* buffer); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); @@ -78,8 +79,8 @@ class MockCustom { // the sum of the inputs. class MultipleInputs { public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static void Free(TfLiteContext* context, void* buffer); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); @@ -91,8 +92,8 @@ class MultipleInputs { // A simple no-op operator. class NoOp { public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static void Free(TfLiteContext* context, void* buffer); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); @@ -102,7 +103,7 @@ class NoOp { }; // Returns an Op Resolver that can be used in the testing code. -AllOpsResolver GetOpResolver(); +TfLiteStatus GetTestingOpResolver(TestingOpResolver& op_resolver); // Returns a simple example flatbuffer TensorFlow Lite model. Contains 1 input, // 1 layer of weights, 1 output Tensor, and 1 operator. 
@@ -216,7 +217,6 @@ TfLiteTensor CreateTensor(const T* data, TfLiteIntArray* dims, result.is_variable = is_variable; result.allocation_type = kTfLiteMemNone; result.data.data = const_cast<T*>(data); - result.quantization = {kTfLiteAffineQuantization, nullptr}; result.bytes = ElementCount(*dims) * sizeof(T); result.data.data = const_cast<T*>(data); diff --git a/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.cpp b/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.cpp deleted file mode 100644 index b8f3eb08..00000000 --- a/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h" - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/op_resolver.h" -#include "tensorflow/lite/micro/tflite_bridge/micro_error_reporter.h" -#include "tensorflow/lite/schema/schema_utils.h" - -namespace tflite { - -TfLiteStatus GetRegistrationFromOpCode( - const OperatorCode* opcode, const OpResolver& op_resolver, - const TfLiteRegistration** registration) { - return GetRegistrationFromOpCode( - opcode, op_resolver, tflite::GetMicroErrorReporter(), registration); -} -} // namespace tflite diff --git a/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h b/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h deleted file mode 100644 index 252df6e8..00000000 --- a/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_MICRO_TFLITE_BRIDGE_OP_RESOLVER_BRIDGE_H_ -#define TENSORFLOW_LITE_MICRO_TFLITE_BRIDGE_OP_RESOLVER_BRIDGE_H_ - -#include "tensorflow/lite/c/c_api_types.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/op_resolver.h" - -namespace tflite { - -// Forward declaration of the classes and structs used here. -struct OperatorCode; - -using TfLiteBridgeOpResolver = OpResolver; - -// Handles the logic for converting between an OperatorCode structure extracted -// from a flatbuffer and information about a registered operator -// implementation.
-TfLiteStatus GetRegistrationFromOpCode(const OperatorCode* opcode, - const OpResolver& op_resolver, - const TfLiteRegistration** registration); - -} // namespace tflite - -#endif // TENSORFLOW_LITE_MICRO_TFLITE_BRIDGE_OP_RESOLVER_BRIDGE_H_ diff --git a/src/tensorflow/lite/schema/schema_generated.h b/src/tensorflow/lite/schema/schema_generated.h index a475fcd2..856e15dd 100644 --- a/src/tensorflow/lite/schema/schema_generated.h +++ b/src/tensorflow/lite/schema/schema_generated.h @@ -543,6 +543,18 @@ struct SignOptions; struct SignOptionsBuilder; struct SignOptionsT; +struct BitcastOptions; +struct BitcastOptionsBuilder; +struct BitcastOptionsT; + +struct BitwiseXorOptions; +struct BitwiseXorOptionsBuilder; +struct BitwiseXorOptionsT; + +struct RightShiftOptions; +struct RightShiftOptionsBuilder; +struct RightShiftOptionsT; + struct OperatorCode; struct OperatorCodeBuilder; struct OperatorCodeT; @@ -1059,11 +1071,14 @@ enum BuiltinOperator : int32_t { BuiltinOperator_ATAN2 = 156, BuiltinOperator_UNSORTED_SEGMENT_MIN = 157, BuiltinOperator_SIGN = 158, + BuiltinOperator_BITCAST = 159, + BuiltinOperator_BITWISE_XOR = 160, + BuiltinOperator_RIGHT_SHIFT = 161, BuiltinOperator_MIN = BuiltinOperator_ADD, - BuiltinOperator_MAX = BuiltinOperator_SIGN + BuiltinOperator_MAX = BuiltinOperator_RIGHT_SHIFT }; -inline const BuiltinOperator (&EnumValuesBuiltinOperator())[159] { +inline const BuiltinOperator (&EnumValuesBuiltinOperator())[162] { static const BuiltinOperator values[] = { BuiltinOperator_ADD, BuiltinOperator_AVERAGE_POOL_2D, @@ -1223,13 +1238,16 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[159] { BuiltinOperator_UNSORTED_SEGMENT_SUM, BuiltinOperator_ATAN2, BuiltinOperator_UNSORTED_SEGMENT_MIN, - BuiltinOperator_SIGN + BuiltinOperator_SIGN, + BuiltinOperator_BITCAST, + BuiltinOperator_BITWISE_XOR, + BuiltinOperator_RIGHT_SHIFT }; return values; } inline const char * const *EnumNamesBuiltinOperator() { - static const char * const names[160] = { + static const char * const names[163] = { "ADD", "AVERAGE_POOL_2D", "CONCATENATION", @@ -1389,13 +1407,16 @@ inline const char * const *EnumNamesBuiltinOperator() { "ATAN2", "UNSORTED_SEGMENT_MIN", "SIGN", + "BITCAST", + "BITWISE_XOR", + "RIGHT_SHIFT", nullptr }; return names; } inline const char *EnumNameBuiltinOperator(BuiltinOperator e) { - if (flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_SIGN)) return ""; + if (flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_RIGHT_SHIFT)) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOperator()[index]; } @@ -1525,11 +1546,14 @@ enum BuiltinOptions : uint8_t { BuiltinOptions_UnsortedSegmentSumOptions = 121, BuiltinOptions_ATan2Options = 122, BuiltinOptions_SignOptions = 123, + BuiltinOptions_BitcastOptions = 124, + BuiltinOptions_BitwiseXorOptions = 125, + BuiltinOptions_RightShiftOptions = 126, BuiltinOptions_MIN = BuiltinOptions_NONE, - BuiltinOptions_MAX = BuiltinOptions_SignOptions + BuiltinOptions_MAX = BuiltinOptions_RightShiftOptions }; -inline const BuiltinOptions (&EnumValuesBuiltinOptions())[124] { +inline const BuiltinOptions (&EnumValuesBuiltinOptions())[127] { static const BuiltinOptions values[] = { BuiltinOptions_NONE, BuiltinOptions_Conv2DOptions, @@ -1654,13 +1678,16 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[124] { BuiltinOptions_UnsortedSegmentMinOptions, BuiltinOptions_UnsortedSegmentSumOptions, BuiltinOptions_ATan2Options, - BuiltinOptions_SignOptions + BuiltinOptions_SignOptions, + 
BuiltinOptions_BitcastOptions, + BuiltinOptions_BitwiseXorOptions, + BuiltinOptions_RightShiftOptions }; return values; } inline const char * const *EnumNamesBuiltinOptions() { - static const char * const names[125] = { + static const char * const names[128] = { "NONE", "Conv2DOptions", "DepthwiseConv2DOptions", @@ -1785,13 +1812,16 @@ inline const char * const *EnumNamesBuiltinOptions() { "UnsortedSegmentSumOptions", "ATan2Options", "SignOptions", + "BitcastOptions", + "BitwiseXorOptions", + "RightShiftOptions", nullptr }; return names; } inline const char *EnumNameBuiltinOptions(BuiltinOptions e) { - if (flatbuffers::IsOutRange(e, BuiltinOptions_NONE, BuiltinOptions_SignOptions)) return ""; + if (flatbuffers::IsOutRange(e, BuiltinOptions_NONE, BuiltinOptions_RightShiftOptions)) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOptions()[index]; } @@ -2292,6 +2322,18 @@ template<> struct BuiltinOptionsTraits { static const BuiltinOptions enum_value = BuiltinOptions_SignOptions; }; +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitcastOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitwiseXorOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RightShiftOptions; +}; + template struct BuiltinOptionsUnionTraits { static const BuiltinOptions enum_value = BuiltinOptions_NONE; }; @@ -2788,6 +2830,18 @@ template<> struct BuiltinOptionsUnionTraits { static const BuiltinOptions enum_value = BuiltinOptions_SignOptions; }; +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitcastOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitwiseXorOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RightShiftOptions; +}; + struct BuiltinOptionsUnion { BuiltinOptions type; void *value; @@ -3802,6 +3856,30 @@ struct BuiltinOptionsUnion { return type == BuiltinOptions_SignOptions ? reinterpret_cast(value) : nullptr; } + tflite::BitcastOptionsT *AsBitcastOptions() { + return type == BuiltinOptions_BitcastOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BitcastOptionsT *AsBitcastOptions() const { + return type == BuiltinOptions_BitcastOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BitwiseXorOptionsT *AsBitwiseXorOptions() { + return type == BuiltinOptions_BitwiseXorOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BitwiseXorOptionsT *AsBitwiseXorOptions() const { + return type == BuiltinOptions_BitwiseXorOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::RightShiftOptionsT *AsRightShiftOptions() { + return type == BuiltinOptions_RightShiftOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::RightShiftOptionsT *AsRightShiftOptions() const { + return type == BuiltinOptions_RightShiftOptions ? 
+ reinterpret_cast(value) : nullptr; + } }; bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type); @@ -11556,6 +11634,123 @@ inline flatbuffers::Offset CreateSignOptions( flatbuffers::Offset CreateSignOptions(flatbuffers::FlatBufferBuilder &_fbb, const SignOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct BitcastOptionsT : public flatbuffers::NativeTable { + typedef BitcastOptions TableType; +}; + +struct BitcastOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef BitcastOptionsT NativeTableType; + typedef BitcastOptionsBuilder Builder; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + BitcastOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BitcastOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BitcastOptionsBuilder { + typedef BitcastOptions Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit BitcastOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateBitcastOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + BitcastOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateBitcastOptions(flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BitwiseXorOptionsT : public flatbuffers::NativeTable { + typedef BitwiseXorOptions TableType; +}; + +struct BitwiseXorOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef BitwiseXorOptionsT NativeTableType; + typedef BitwiseXorOptionsBuilder Builder; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + BitwiseXorOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BitwiseXorOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BitwiseXorOptionsBuilder { + typedef BitwiseXorOptions Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit BitwiseXorOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateBitwiseXorOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + BitwiseXorOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateBitwiseXorOptions(flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct RightShiftOptionsT : public flatbuffers::NativeTable { + typedef RightShiftOptions TableType; +}; + +struct RightShiftOptions FLATBUFFERS_FINAL_CLASS : private 
flatbuffers::Table { + typedef RightShiftOptionsT NativeTableType; + typedef RightShiftOptionsBuilder Builder; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + RightShiftOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(RightShiftOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct RightShiftOptionsBuilder { + typedef RightShiftOptions Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit RightShiftOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateRightShiftOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + RightShiftOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateRightShiftOptions(flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct OperatorCodeT : public flatbuffers::NativeTable { typedef OperatorCode TableType; int8_t deprecated_builtin_code = 0; @@ -12068,6 +12263,15 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const tflite::SignOptions *builtin_options_as_SignOptions() const { return builtin_options_type() == tflite::BuiltinOptions_SignOptions ? static_cast(builtin_options()) : nullptr; } + const tflite::BitcastOptions *builtin_options_as_BitcastOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BitcastOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::BitwiseXorOptions *builtin_options_as_BitwiseXorOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BitwiseXorOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::RightShiftOptions *builtin_options_as_RightShiftOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_RightShiftOptions ? 
static_cast(builtin_options()) : nullptr; + } const flatbuffers::Vector *custom_options() const { return GetPointer *>(VT_CUSTOM_OPTIONS); } @@ -12596,6 +12800,18 @@ template<> inline const tflite::SignOptions *Operator::builtin_options_as inline const tflite::BitcastOptions *Operator::builtin_options_as() const { + return builtin_options_as_BitcastOptions(); +} + +template<> inline const tflite::BitwiseXorOptions *Operator::builtin_options_as() const { + return builtin_options_as_BitwiseXorOptions(); +} + +template<> inline const tflite::RightShiftOptions *Operator::builtin_options_as() const { + return builtin_options_as_RightShiftOptions(); +} + struct OperatorBuilder { typedef Operator Table; flatbuffers::FlatBufferBuilder &fbb_; @@ -16931,6 +17147,75 @@ inline flatbuffers::Offset CreateSignOptions(flatbuffers::FlatBuffe _fbb); } +inline BitcastOptionsT *BitcastOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BitcastOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BitcastOptions::UnPackTo(BitcastOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset BitcastOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateBitcastOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateBitcastOptions(flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BitcastOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateBitcastOptions( + _fbb); +} + +inline BitwiseXorOptionsT *BitwiseXorOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BitwiseXorOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BitwiseXorOptions::UnPackTo(BitwiseXorOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset BitwiseXorOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateBitwiseXorOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateBitwiseXorOptions(flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BitwiseXorOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateBitwiseXorOptions( + _fbb); +} + +inline RightShiftOptionsT *RightShiftOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new RightShiftOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void RightShiftOptions::UnPackTo(RightShiftOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset RightShiftOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateRightShiftOptions(_fbb, 
_o, _rehasher); +} + +inline flatbuffers::Offset CreateRightShiftOptions(flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const RightShiftOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateRightShiftOptions( + _fbb); +} + inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = std::unique_ptr(new OperatorCodeT()); UnPackTo(_o.get(), _resolver); @@ -17966,6 +18251,18 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } default: return true; } } @@ -18477,6 +18774,18 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c auto ptr = reinterpret_cast(obj); return ptr->UnPack(resolver); } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } default: return nullptr; } } @@ -18976,6 +19285,18 @@ inline flatbuffers::Offset BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff auto ptr = reinterpret_cast(value); return CreateSignOptions(_fbb, ptr, _rehasher).Union(); } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(value); + return CreateBitcastOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(value); + return CreateBitwiseXorOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(value); + return CreateRightShiftOptions(_fbb, ptr, _rehasher).Union(); + } default: return 0; } } @@ -19474,6 +19795,18 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) : value = new tflite::SignOptionsT(*reinterpret_cast(u.value)); break; } + case BuiltinOptions_BitcastOptions: { + value = new tflite::BitcastOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BitwiseXorOptions: { + value = new tflite::BitwiseXorOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_RightShiftOptions: { + value = new tflite::RightShiftOptionsT(*reinterpret_cast(u.value)); + break; + } default: break; } @@ -20096,6 +20429,21 @@ inline void BuiltinOptionsUnion::Reset() { delete ptr; break; } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } default: break; } value = nullptr; diff --git a/src/third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h b/src/third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h 
new file mode 100644 index 00000000..e4472639 --- /dev/null +++ b/src/third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h @@ -0,0 +1,308 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_compiler.h + * Description: Generic compiler header + * + * $Date: 31 January 2023 + * $Revision: V.1.1.0 + * + * Target : Arm(R) M-Profile Architecture + * -------------------------------------------------------------------- */ + +#ifndef ARM_NN_COMPILER_H +#define ARM_NN_COMPILER_H + +/** + * + * @brief Arm C-Language Extension(ACLE) Includes + * + */ + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + + #ifndef __ASM + #define __ASM __asm + #endif + #ifndef __INLINE + #define __INLINE __inline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static __inline + #endif + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline + #endif + #ifndef __RESTRICT + #define __RESTRICT __restrict + #endif + +#elif defined(__ICCARM__) + + #warning IAR support is not tested + #ifndef __ASM + #define __ASM __asm + #endif + #ifndef __INLINE + #define __INLINE inline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline + #endif + #ifndef __FORCEINLINE + #define __FORCEINLINE _Pragma("inline=forced") + #endif + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __FORCEINLINE __STATIC_INLINE + #endif + +#elif defined(_MSC_VER) + + // Build for non Arm Cortex-M processors is not tested or supported. + // Use this section to stub any macros or intrinsics + #warning Unsupported compiler + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE static __forceinline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static __inline + #endif + #ifndef __ALIGNED + #define __ALIGNED(x) __declspec(align(x)) + #endif + +#elif defined(__GNUC__) + + #ifndef __ASM + #define __ASM __asm + #endif + #ifndef __INLINE + #define __INLINE inline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline + #endif + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline + #endif + #ifndef __RESTRICT + #define __RESTRICT __restrict + #endif + +#else + + #error Unsupported compiler. Add support as needed + +#endif + +/** + * + * @brief Compiler specific diagnostic adjustment / fixes if applicable + * + */ + +// Note: __ARM_ARCH is used with M-profile architecture as the target here. 
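To illustrate what the per-compiler sections above buy you, a small sketch that assumes only the macros this header defines; clamp_q7 is not a CMSIS-NN function.

#include <stdint.h>
#include "Internal/arm_nn_compiler.h"

// The same source expands to a forced-inline static function under AC6, IAR
// and GCC, and to the stubbed __forceinline fallback under MSVC.
__STATIC_FORCEINLINE int8_t clamp_q7(int32_t v)
{
    if (v > 127) { return 127; }
    if (v < -128) { return -128; }
    return static_cast<int8_t>(v);
}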
+#if defined(__GNUC__) + #if (__GNUC__ == 12 && (__GNUC_MINOR__ <= 2)) && defined(__ARM_ARCH) + // Workaround for 'Internal Compiler Error' on Arm GNU Toolchain rel 12.2.x + // https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607963.html + #define ARM_GCC_12_2_ICE + #endif +#endif + +#if ((__ARM_FEATURE_MVE & 3) == 3) || (__ARM_FEATURE_MVE & 1) + #include <arm_mve.h> +#endif + +#if defined(__ARM_ARCH) || defined(__ARM_ACLE) + #include <arm_acle.h> +#endif + +/** + * + * @brief ACLE and Intrinsics + * + */ + +// Note: Keep the __GNUC__ checks, used to detect GCC, at the end, +// as __GNUC__ is defined by non-GCC compilers as well + +/* Common intrinsics for all architectures */ +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__) + #define CLZ __clz +#elif defined(__GNUC__) +/** + \brief Count leading zeros + \details Counts the number of leading zeros of a data value. + \param [in] value Value to count the leading zeros + \return number of leading zeros in value + */ +__STATIC_FORCEINLINE uint8_t CLZ(uint32_t value) +{ + /* Even though __builtin_clz produces a CLZ instruction on ARM, formally + __builtin_clz(0) is undefined behaviour, so handle this case specially. + This guarantees Arm-compatible results if compiling on a non-Arm + target, and ensures the compiler doesn't decide to activate any + optimisations using the logic "value was passed to __builtin_clz, so it + is non-zero". + ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a + single CLZ instruction. + */ + if (value == 0U) + { + return 32U; + } + return __builtin_clz(value); +} +#endif + +// ACLE intrinsics under groups __ARM_FEATURE_QBIT, __ARM_FEATURE_DSP, __ARM_FEATURE_SAT, __ARM_FEATURE_SIMD32 + +// Note: Just __ARM_FEATURE_DSP is checked to collect all intrinsics from the above mentioned groups + +#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) + + // Common intrinsics + #define SMLABB __smlabb + #define SMLATT __smlatt + #define QADD __qadd + #define QSUB8 __qsub8 + #define QSUB16 __qsub16 + #define SADD16 __sadd16 + + // Compiler specific variants of intrinsics.
Create a new section or file for IAR if needed + #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__) + + #define SMULBB __smulbb + #define SMULTT __smultt + #define ROR __ror + #define SXTB16 __sxtb16 + #define SXTAB16 __sxtab16 + #define SXTB16_RORn(ARG1, ARG2) SXTB16(ROR(ARG1, ARG2)) + #define SXTAB16_RORn(ARG1, ARG2, ARG3) SXTAB16(ARG1, ROR(ARG2, ARG3)) + #define SMLAD __smlad + // PKH translates into pkh on AC6 + #define PKHBT(ARG1, ARG2, ARG3) \ + (((((uint32_t)(ARG1))) & 0x0000FFFFUL) | ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)) + #define PKHTB(ARG1, ARG2, ARG3) \ + (((((uint32_t)(ARG1))) & 0xFFFF0000UL) | ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)) + + #elif defined(__GNUC__) + + #define PKHBT(ARG1, ARG2, ARG3) \ + __extension__({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + __ASM("pkhbt %0, %1, %2, lsl %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \ + __RES; \ + }) + #define PKHTB(ARG1, ARG2, ARG3) \ + __extension__({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + if (ARG3 == 0) \ + __ASM("pkhtb %0, %1, %2" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2)); \ + else \ + __ASM("pkhtb %0, %1, %2, asr %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \ + __RES; \ + }) + +__STATIC_FORCEINLINE uint32_t SXTAB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM("sxtab16 %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2)); + return (result); +} + +__STATIC_FORCEINLINE uint32_t SXTB16(uint32_t op1) +{ + uint32_t result; + + __ASM("sxtb16 %0, %1" : "=r"(result) : "r"(op1)); + return (result); +} + +// __smlad is defined by GCC, but results in a performance drop(Tested on Arm GNU Toolchain version 11.x and 12.x) +__STATIC_FORCEINLINE uint32_t SMLAD(uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM volatile("smlad %0, %1, %2, %3" : "=r"(result) : "r"(op1), "r"(op2), "r"(op3)); + return (result); +} + +__STATIC_FORCEINLINE uint32_t ROR(uint32_t op1, uint32_t op2) +{ + op2 %= 32U; + if (op2 == 0U) + { + return op1; + } + return (op1 >> op2) | (op1 << (32U - op2)); +} + +__STATIC_FORCEINLINE uint32_t SXTB16_RORn(uint32_t op1, uint32_t rotate) +{ + uint32_t result; + if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U))) + { + __ASM volatile("sxtb16 %0, %1, ROR %2" : "=r"(result) : "r"(op1), "i"(rotate)); + } + else + { + result = SXTB16(ROR(op1, rotate)); + } + return result; +} + +__STATIC_FORCEINLINE uint32_t SXTAB16_RORn(uint32_t op1, uint32_t op2, uint32_t rotate) +{ + uint32_t result; + if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U))) + { + __ASM volatile("sxtab16 %0, %1, %2, ROR %3" : "=r"(result) : "r"(op1), "r"(op2), "i"(rotate)); + } + else + { + result = SXTAB16(op1, ROR(op2, rotate)); + } + return result; +} + +// Inline assembly routines for ACLE intrinsics that are not defined by GCC toolchain +__STATIC_FORCEINLINE uint32_t SMULBB(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile("smulbb %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2)); + return (result); +} + +__STATIC_FORCEINLINE uint32_t SMULTT(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile("smultt %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2)); + return (result); +} + #endif + +#endif + +#endif /* #ifndef ARM_NN_COMPILER_H */ \ No newline at end of file diff --git a/src/third_party/cmsis_nn/Include/arm_nn_math_types.h b/src/third_party/cmsis_nn/Include/arm_nn_math_types.h index 3c6c437f..a4c95587 
100644 --- a/src/third_party/cmsis_nn/Include/arm_nn_math_types.h +++ b/src/third_party/cmsis_nn/Include/arm_nn_math_types.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -16,157 +16,63 @@ * limitations under the License. */ -/****************************************************************************** - * @file arm_nn_math_types.h - * @brief Compiler include and basic types - * @version V1.2.0 - * @date 20 June 2022 - * Target Processor: Cortex-M ******************************************************************************/ - -/** Copied from CMSIS/DSP/arm_math_types.h and modified -*/ -#ifndef _ARM_NN_MATH_TYPES_H_ +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_math_types.h + * Description: Compiler include and basic types + * + * $Date: 4 January 2023 + * $Revision: V.1.3.2 + * + * Target : Arm(R) M-Profile Architecture + * -------------------------------------------------------------------- */ -#define _ARM_NN_MATH_TYPES_H_ +#ifndef ARM_NN_MATH_TYPES_H -#ifdef __cplusplus -extern "C" { -#endif +#define ARM_NN_MATH_TYPES_H -#include <float.h> #include <limits.h> -#include <math.h> #include <stddef.h> #include <stdint.h> -/* Integer aliases */ -typedef int8_t q7_t; -typedef int16_t q15_t; -typedef int32_t q31_t; -typedef int64_t q63_t; - -/* Compiler specific diagnostic adjustment */ -#if defined(__CC_ARM) - -#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) - -#elif defined(__GNUC__) - -#elif defined(__ICCARM__) - -#elif defined(__TI_ARM__) - -#elif defined(__CSMC__) - -#elif defined(__TASKING__) - -#elif defined(_MSC_VER) - -#else -#error Unknown compiler -#endif - -/* Included for instrinsics definitions */ -#if defined(_MSC_VER) -#ifndef __STATIC_FORCEINLINE -#define __STATIC_FORCEINLINE static __forceinline -#endif -#ifndef __STATIC_INLINE -#define __STATIC_INLINE static __inline -#endif -#ifndef __ALIGNED -#define __ALIGNED(x) __declspec(align(x)) -#endif - -#elif defined(__GNUC_PYTHON__) -#ifndef __ALIGNED -#define __ALIGNED(x) __attribute__((aligned(x))) -#endif -#ifndef __STATIC_FORCEINLINE -#define __STATIC_FORCEINLINE static inline __attribute__((always_inline)) -#endif -#ifndef __STATIC_INLINE -#define __STATIC_INLINE static inline -#endif - -#else -#include "third_party/cmsis/CMSIS/Core/Include/cmsis_compiler.h" -#endif - -/* evaluate ARM DSP feature */ -#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) -#ifndef ARM_MATH_DSP -#define ARM_MATH_DSP 1 -#endif -#endif - -#if __ARM_FEATURE_MVE -#ifndef ARM_MATH_MVEI -#define ARM_MATH_MVEI -#endif -#endif - -/* Compiler specific diagnostic adjustment */ -#if defined(__CC_ARM) - -#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) - -#elif defined(__GNUC__) -// #pragma GCC diagnostic pop - -#elif defined(__ICCARM__) - -#elif defined(__TI_ARM__) - -#elif defined(__CSMC__) - -#elif defined(__TASKING__) - -#elif defined(_MSC_VER) - -#else -#error Unknown compiler -#endif - -#ifdef __cplusplus -} -#endif - -#if __ARM_FEATURE_MVE -#include <arm_mve.h> -#endif - #ifdef __cplusplus extern "C" { #endif /** - * @brief Add necessary typedefs + * + * @brief Translate architecture feature flags to CMSIS-NN defines + * */ -#define NN_Q31_MAX ((q31_t)(0x7FFFFFFFL)) -#define NN_Q15_MAX ((q15_t)(0x7FFF)) -#define NN_Q7_MAX ((q7_t)(0x7F)) -#define NN_Q31_MIN ((q31_t)(0x80000000L)) -#define NN_Q15_MIN
((q15_t)(0x8000)) -#define NN_Q7_MIN ((q7_t)(0x80)) +// CMSIS-NN uses the same macro names as CMSIS-DSP +#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) + #ifndef ARM_MATH_DSP + #define ARM_MATH_DSP 1 + #endif +#endif + +#if defined(__ARM_FEATURE_MVE) + #ifndef ARM_MATH_MVEI + #define ARM_MATH_MVEI 1 + #endif +#endif /** - * @brief Error status returned by some functions in the library. + * + * @brief Limits macros + * */ -typedef enum -{ - ARM_CMSIS_NN_SUCCESS = 0, /**< No error */ - ARM_CMSIS_NN_ARG_ERROR = -1, /**< One or more arguments are incorrect */ - ARM_CMSIS_NN_NO_IMPL_ERROR = -2, /**< No implementation available */ -} arm_cmsis_nn_status; +#define NN_Q31_MAX ((int32_t)(0x7FFFFFFFL)) +#define NN_Q15_MAX ((int16_t)(0x7FFF)) +#define NN_Q7_MAX ((int8_t)(0x7F)) +#define NN_Q31_MIN ((int32_t)(0x80000000L)) +#define NN_Q15_MIN ((int16_t)(0x8000)) +#define NN_Q7_MIN ((int8_t)(0x80)) #ifdef __cplusplus } #endif -#endif /*ifndef _ARM_NN_MATH_TYPES_H_ */ +#endif /*ifndef ARM_NN_MATH_TYPES_H */ diff --git a/src/third_party/cmsis_nn/Include/arm_nn_tables.h b/src/third_party/cmsis_nn/Include/arm_nn_tables.h index c9090e31..c52b7ed8 100644 --- a/src/third_party/cmsis_nn/Include/arm_nn_tables.h +++ b/src/third_party/cmsis_nn/Include/arm_nn_tables.h @@ -21,8 +21,8 @@ * Title: arm_nn_tables.h * Description: Extern declaration for NN tables * - * $Date: 30. September 2022 - * $Revision: V.2.0.0 + * $Date: 28 October 2022 + * $Revision: V.2.1.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ @@ -37,5 +37,6 @@ * */ +extern const uint16_t sigmoid_table_uint16[256]; #endif /* ARM_NN_TABLES_H */ \ No newline at end of file diff --git a/src/third_party/cmsis_nn/Include/arm_nn_types.h b/src/third_party/cmsis_nn/Include/arm_nn_types.h index 64182bec..915fbec9 100644 --- a/src/third_party/cmsis_nn/Include/arm_nn_types.h +++ b/src/third_party/cmsis_nn/Include/arm_nn_types.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: Public header file to contain the CMSIS-NN structs for the * TensorFlowLite micro compliant functions * - * $Date: 22. 
Februari 2022 - * $Revision: V.2.1.0 + * $Date: 8 February 2023 + * $Revision: V.2.4.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #ifndef _ARM_NN_TYPES_H @@ -33,6 +33,21 @@ #include <stdint.h> +/** Enum for specifying activation function types */ +typedef enum +{ + ARM_SIGMOID = 0, /**< Sigmoid activation function */ + ARM_TANH = 1, /**< Tanh activation function */ +} arm_nn_activation_type; + +/** Function return codes */ +typedef enum +{ + ARM_CMSIS_NN_SUCCESS = 0, /**< No error */ + ARM_CMSIS_NN_ARG_ERROR = -1, /**< One or more arguments are incorrect */ + ARM_CMSIS_NN_NO_IMPL_ERROR = -2, /**< No implementation available */ +} arm_cmsis_nn_status; + /** CMSIS-NN object to contain the width and height of a tile */ typedef struct { @@ -57,6 +72,15 @@ typedef struct int32_t c; /**< Input channels */ } cmsis_nn_dims; +/** CMSIS-NN object to contain LSTM specific input parameters related to dimensions */ +typedef struct +{ + int32_t max_time; + int32_t num_inputs; + int32_t num_batches; + int32_t num_outputs; +} cmsis_nn_lstm_dims; + /** CMSIS-NN object for the per-channel quantization parameters */ typedef struct { @@ -134,4 +158,100 @@ typedef struct const int16_t *one_by_one_lut; } cmsis_nn_softmax_lut_s16; +/** LSTM guard parameters */ +typedef struct +{ + int32_t input_variance; + int32_t forget_variance; + int32_t cell_variance; + int32_t output_variance; +} cmsis_nn_lstm_guard_params; + +/** LSTM scratch buffer container */ +typedef struct +{ + int16_t *input_gate; + int16_t *forget_gate; + int16_t *cell_gate; + int16_t *output_gate; +} cmsis_nn_lstm_context; + +/** Quantized clip value for cell and projection of LSTM input. Zero value means no clipping.
*/ +typedef struct +{ + int16_t cell; + int8_t projection; +} cmsis_nn_lstm_clip_params; + +/** CMSIS-NN object for quantization parameters */ +typedef struct +{ + int32_t multiplier; /**< Multiplier value */ + int32_t shift; /**< Shift value */ +} cmsis_nn_scaling; + +/** CMSIS-NN norm layer coefficients */ +typedef struct +{ + int16_t *input_weight; + int16_t *forget_weight; + int16_t *cell_weight; + int16_t *output_weight; +} cmsis_nn_layer_norm; + +/** Parameters for integer LSTM, as defined in TFLM */ +typedef struct +{ + int32_t time_major; /**< Nonzero (true) if first row of data is timestamps for input */ + cmsis_nn_scaling input_to_input_scaling; + cmsis_nn_scaling input_to_forget_scaling; + cmsis_nn_scaling input_to_cell_scaling; + cmsis_nn_scaling input_to_output_scaling; + cmsis_nn_scaling recurrent_to_input_scaling; + cmsis_nn_scaling recurrent_to_forget_scaling; + cmsis_nn_scaling recurrent_to_cell_scaling; + cmsis_nn_scaling recurrent_to_output_scaling; + cmsis_nn_scaling cell_to_input_scaling; + cmsis_nn_scaling cell_to_forget_scaling; + cmsis_nn_scaling cell_to_output_scaling; + cmsis_nn_scaling projection_scaling; + cmsis_nn_scaling hidden_scaling; + cmsis_nn_scaling layer_norm_input_scaling; /**< layer normalization for input layer */ + cmsis_nn_scaling layer_norm_forget_scaling; /**< layer normalization for forget gate */ + cmsis_nn_scaling layer_norm_cell_scaling; /**< layer normalization for cell */ + cmsis_nn_scaling layer_norm_output_scaling; /**< layer normalization for output layer */ + + int32_t cell_state_shift; + int32_t hidden_offset; + int32_t output_state_offset; + + cmsis_nn_lstm_clip_params clip; + cmsis_nn_lstm_guard_params guard; + cmsis_nn_layer_norm layer_norm; + + /* Effective bias is precalculated as bias + zero_point * weight.
+ Only applicable to when input/output are s8 and weights are s16 */ + const int32_t *i2i_effective_bias; /**< input to input effective bias */ + const int32_t *i2f_effective_bias; /**< input to forget gate effective bias */ + const int32_t *i2c_effective_bias; /**< input to cell effective bias */ + const int32_t *i2o_effective_bias; /**< input to output effective bias */ + + const int32_t *r2i_effective_bias; /**< recurrent gate to input effective bias */ + const int32_t *r2f_effective_bias; /**< recurrent gate to forget gate effective bias */ + const int32_t *r2c_effective_bias; /**< recurrent gate to cell effective bias */ + const int32_t *r2o_effective_bias; /**< recurrent gate to output effective bias */ + + const int32_t *projection_effective_bias; + + /* Not precalculated bias */ + const int32_t *input_gate_bias; + const int32_t *forget_gate_bias; + const int32_t *cell_gate_bias; + const int32_t *output_gate_bias; + + /* Activation min and max */ + cmsis_nn_activation activation; + +} cmsis_nn_lstm_params; + #endif // _ARM_NN_TYPES_H diff --git a/src/third_party/cmsis_nn/Include/arm_nnfunctions.h b/src/third_party/cmsis_nn/Include/arm_nnfunctions.h index 1712da41..f338ca60 100644 --- a/src/third_party/cmsis_nn/Include/arm_nnfunctions.h +++ b/src/third_party/cmsis_nn/Include/arm_nnfunctions.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nnfunctions.h * Description: Public header file for CMSIS NN Library * - * $Date: 30 September 2022 - * $Revision: V.11.0.0 + * $Date: 13 January 2023 + * $Revision: V.11.3.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ /** @@ -32,24 +32,24 @@ * * \tableofcontents * \section Introduction - * + * * * This user manual describes the CMSIS NN software library, * a collection of efficient neural network kernels developed to maximize the * performance and minimize the memory footprint of neural networks on Arm Cortex-M processors. * * The library is divided into a number of functions each covering a specific category: - * - \ref NNConv Convolution Functions - * - \ref Acti "Activation Functions" - * - \ref FC Fully-connected Layer Functions - * - \ref SVDF Layer Functions - * - \ref Pooling Functions - * - \ref Softmax Functions - * - \ref groupElementwise Basic math Functions - * + * - \ref NNConv + * - \ref Acti + * - \ref FC + * - \ref SVDF + * - \ref Pooling + * - \ref Softmax + * - \ref groupElementwise + * - \ref LSTM * * \section Processors Supported Processors - * + * * CMSIS-NN targets Cortex-M processors with typically three different implementations for each function. Each * targets a different group of processors. * - Processors without Single Instruction Multiple Data(SIMD) capability (e.g, Cortex-M0) @@ -59,40 +59,45 @@ * * \section Framework Quantization Specification * The library follows the [int8](https://www.tensorflow.org/lite/performance/quantization_spec) and int16 - * quantization specification of TensorFlow Lite for Microcontrollers. + * quantization specification of TensorFlow Lite for Microcontrollers. 
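The effective-bias comment in cmsis_nn_lstm_params above compresses a useful identity: for a row-major s8 weight matrix W and input zero point z, sum_j W[i][j]*(x[j] + z) = sum_j W[i][j]*x[j] + z*sum_j W[i][j], so the z-dependent term can be folded into the bias offline. A worked sketch under that layout assumption (names are illustrative; CMSIS-NN only consumes the precomputed result):

#include <cstdint>

// effective_bias[i] = bias[i] + zero_point * (sum over row i of weights)
void ComputeEffectiveBias(const int32_t* bias, const int8_t* weights,
                          int32_t zero_point, int n_output, int n_input,
                          int32_t* effective_bias) {
  for (int i = 0; i < n_output; ++i) {
    int32_t acc = (bias != nullptr) ? bias[i] : 0;
    for (int j = 0; j < n_input; ++j) {
      acc += zero_point * weights[i * n_input + j];
    }
    effective_bias[i] = acc;
  }
}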
* \section Overview Block Diagram - * + * * \image html CMSIS-NN-OVERVIEW.PNG * * \section Examples - * + * * * An example image recognition application using TensorFlow Flow Lite for Microcontrollers as an inference engine * and CMSIS-NN as the optimized library can be found in the Examples directory. * * \section Macros Pre-processor Macros - * + * * \subsection Feature Feature flag based * The macros below are defined in a build system based on feature flags for a chosen processor or architecture * input to a compiler. - * These tie in to the classification in \ref Macros. - * - * For a CMSIS-NN file compiled as *armclang -mcpu=cortex-m4 --target=arm-arm-none-eabi -I + * These tie in to the classification in \ref Macros. + * + * For a CMSIS-NN file compiled as *armclang -mcpu=cortex-m4 --target=arm-arm-none-eabi -I * -Ofast -O file.c* , ARM_MATH_DSP is enabled as Cortex-M4 has the DSP extension as a feature. - * + * * - `ARM_MATH_DSP` - Selects code for processors with DSP extension. * * - `ARM_MATH_MVEI` - Selects code for processors which supports MVE instructions. * - * \subsection MiscFlags User Set + * \subsection MiscFlags User Set * - `ARM_MATH_AUTOVECTORIZE` - * Applicable when ARM_MATH_MVEI is active to let the compiler auto vectorize functions, if available, that uses inline + * Applicable when ARM_MATH_MVEI is active to let the compiler auto vectorize functions, if available, that use + inline * assembly. This has to be explicitly set at compile time. * + * \section Inclusive Inclusive Language + * This product conforms to Arm’s inclusive language policy and, to the best of our knowledge, + * does not contain any non-inclusive language. If you find something that concerns you, email terms@arm.com. + * * \section Copyright Copyright Notice - * * - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * */ @@ -115,18 +120,6 @@ extern "C" { #endif -/** - * @brief Struct for specifying activation function types - * - */ -typedef enum -{ - ARM_SIGMOID = 0, - /**< Sigmoid activation function */ - ARM_TANH = 1, - /**< Tanh activation function */ -} arm_nn_activation_type; - /** * @defgroup NNConv Convolution Functions * @@ -175,13 +168,13 @@ arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data); + int8_t *output_data); /** * @brief Get the required buffer size for arm_convolve_wrapper_s8 @@ -194,7 +187,7 @@ arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, * filter dimensions * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] * - * @return The function returns required buffer size(bytes) + * @return The function returns required buffer size(bytes) * */ int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, @@ -202,6 +195,32 @@ int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims); +/** + * @brief Get the required buffer size for arm_convolve_wrapper_s8 for Arm(R) Helium Architecture case.
+ * Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_convolve_wrapper_s8_get_buffer_size().
+ *
+ */
+int32_t arm_convolve_wrapper_s8_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
+/**
+ * @brief Get the required buffer size for arm_convolve_wrapper_s8 for processors with DSP extension.
+ * Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_convolve_wrapper_s8_get_buffer_size().
+ *
+ */
+int32_t arm_convolve_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
 /**
 * @brief s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
 * cmsis-nn to perform the convolution.
@@ -233,16 +252,16 @@ arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_conv_params *conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
- * @brief Get the required buffer size for arm_convolve_wrapper_s16
+ * @brief Get the required buffer size for arm_convolve_wrapper_s16.
 *
 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
 * conv_params->input_offset : Not used
@@ -252,7 +271,7 @@ arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
 * filter dimensions
 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
 *
- * @return The function returns required buffer size(bytes)
+ * @return The function returns required buffer size(bytes)
 *
 */
int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
@@ -260,6 +279,32 @@ int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *con
 const cmsis_nn_dims *filter_dims,
 const cmsis_nn_dims *output_dims);

+/**
+ * @brief Get the required buffer size for arm_convolve_wrapper_s16 for processors with DSP extension.
+ * Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_convolve_wrapper_s16_get_buffer_size().
+ *
+ */
+int32_t arm_convolve_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
+/**
+ * @brief Get the required buffer size for arm_convolve_wrapper_s16 for Arm(R) Helium Architecture case.
+ * Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_convolve_wrapper_s16_get_buffer_size().
+ * + */ +int32_t arm_convolve_wrapper_s16_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims); + /** * @brief Basic s8 convolution function * @param[in, out] ctx Function context that contains the additional buffer if required by the function. @@ -284,21 +329,20 @@ int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *con * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. - * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details. + * 2. Additional memory is required for optimization. Refer to argument 'ctx' for details. * */ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data); + int8_t *output_data); /** * @brief Get the required buffer size for s8 convolution function @@ -306,7 +350,7 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK * are the spatial filter dimensions - * @return The function returns required buffer size(bytes) + * @return The function returns required buffer size(bytes) * */ int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); @@ -335,21 +379,20 @@ int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const c * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7/q15 is used as data type eventhough it is s8/s16 data. It is done so to be consistent with existing APIs. - * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details. + * 2. Additional memory is required for optimization. Refer to argument 'ctx' for details. * */ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data); + int16_t *output_data); /** * @brief Optimized s16 convolution function * @param[in, out] ctx Function context that contains the additional buffer if required by the function. @@ -375,9 +418,8 @@ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7/q15 is used as data type eventhough it is s8/s16 data. It is done so to be consistent with existing APIs. - * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details. - * 4. Implementation supports kernel volumes (filter width * filter height * input channels) < 512. + * 2. Additional memory is required for optimization. 
Refer to argument 'ctx' for details.
 + * 3. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.
 *
 */

@@ -385,13 +427,13 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_conv_params *conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
 * @brief Get the required buffer size for s16 convolution function
 *
@@ -399,7 +441,7 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
 * are the spatial filter dimensions
- * @return The function returns required buffer size(bytes)
+ * @return The function returns required buffer size(bytes)
 *
 */
int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
@@ -442,7 +484,6 @@ int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, c
 * @details
 * - Supported framework : TensorFlow Lite Micro
 * - The following constraints on the arguments apply
- * -# input_dims->c is a multiple of 4
 * -# conv_params->padding.w = conv_params->padding.h = 0
 * -# conv_params->stride.w = conv_params->stride.h = 1
 *
@@ -451,13 +492,13 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
 const cmsis_nn_conv_params *conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
@@ -468,6 +509,46 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
 */
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);

+/**
+ * @brief s8 version for 1x1 convolution with support for non-unity stride values
+ *
+ * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
+ * None is required by this function.
+ * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
+ * Range of conv_params->input_offset : [-127, 128]
+ * Range of conv_params->output_offset : [-128, 127]
+ * @param[in] quant_params Per-channel quantization info.
+ * It contains the multiplier and shift values to be applied to each output channel
+ * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in] input_data Input (activation) data pointer. Data type: int8
+ * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
+ * @param[in] filter_data Filter data pointer. Data type: int8
+ * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
+ * @param[in] bias_data Optional bias data pointer. Data type: int32
+ * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[out] output_data Output data pointer.
Data type: int8
+ *
+ * @return The function returns either
+ * ARM_CMSIS_NN_ARG_ERROR if argument constraints fail, or
+ * ARM_CMSIS_NN_SUCCESS on successful completion.
+ * @details
+ * - Supported framework : TensorFlow Lite Micro
+ * - The following constraints on the arguments apply
+ * -# conv_params->padding.w = conv_params->padding.h = 0
+ *
+ */
+arm_cmsis_nn_status arm_convolve_1x1_s8(const cmsis_nn_context *ctx,
+ const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_per_channel_quant_params *quant_params,
+ const cmsis_nn_dims *input_dims,
+ const int8_t *input_data,
+ const cmsis_nn_dims *filter_dims,
+ const int8_t *filter_data,
+ const cmsis_nn_dims *bias_dims,
+ const int32_t *bias_data,
+ const cmsis_nn_dims *output_dims,
+ int8_t *output_data);
+
 /**
 * @brief 1xn convolution
 *
@@ -509,13 +590,13 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_conv_params *conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get the required additional buffer size for 1xn convolution
 *
@@ -523,7 +604,7 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
 * horizontal spatial filter dimension
- * @return The function returns required buffer size(bytes)
+ * @return The function returns required buffer size(bytes)
 *
 */
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
@@ -561,7 +642,6 @@ int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, c
 * -# arm_depthwise_conv_s8()
 * -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
 * -# arm_depthwise_conv_s8_opt()
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 * - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
 * boundary.
 */
@@ -569,13 +649,13 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
@@ -595,6 +675,32 @@ int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_par
 const cmsis_nn_dims *filter_dims,
 const cmsis_nn_dims *output_dims);

+/**
+ * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for processors with DSP extension.
+ * Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_depthwise_conv_wrapper_s8_get_buffer_size().
+ *
+ */
+int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
+/**
+ * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for Arm(R) Helium Architecture case.
+ * Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_depthwise_conv_wrapper_s8_get_buffer_size().
+ *
+ */
+int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
 /**
 * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
 *
@@ -623,19 +729,18 @@ int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_par
 *
 * @details
 * - Supported framework: TensorFlow Lite
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 */
arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
@@ -665,19 +770,18 @@ arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
 *
 * @details
 * - Supported framework: TensorFlow Lite
- * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs.
 */
arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
 * @brief Wrapper function to pick the right optimized s16 depthwise convolution function
@@ -711,19 +815,18 @@ arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
 * - Picks one of the following functions
 * -# arm_depthwise_conv_s16()
 * -# arm_depthwise_conv_fast_s16() - Cortex-M CPUs with DSP extension only
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
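 * - A minimal invocation sketch (dims, params and data buffers are placeholders assumed to be prepared by the
 *   caller):
 * <pre>
 *     cmsis_nn_context ctx;
 *     ctx.size = arm_depthwise_conv_wrapper_s16_get_buffer_size(&dw_conv_params, &input_dims, &filter_dims,
 *                                                               &output_dims);
 *     ctx.buf = malloc(ctx.size);
 *     arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s16(&ctx, &dw_conv_params, &quant_params, &input_dims,
 *                                                                 input_data, &filter_dims, filter_data, &bias_dims,
 *                                                                 bias_data, &output_dims, output_data);
 *     free(ctx.buf);
 * </pre>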
 */
arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16()
@@ -743,6 +846,32 @@ int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_pa
 const cmsis_nn_dims *filter_dims,
 const cmsis_nn_dims *output_dims);

+/**
+ * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for processors with DSP extension.
+ * Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_depthwise_conv_wrapper_s16_get_buffer_size().
+ *
+ */
+int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
+/**
+ * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for Arm(R) Helium Architecture
+ * case. Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_depthwise_conv_wrapper_s16_get_buffer_size().
+ *
+ */
+int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
 /**
 * @brief Optimized s16 depthwise convolution function with constraint that in_channel equals out_channel.
 * Refer to arm_depthwise_conv_s16() for function argument details.
 *
@@ -759,7 +888,6 @@ int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_pa
 * - Supported framework: TensorFlow Lite
 * - The following constraints on the arguments apply
 * -# Number of input channels equals number of output channels or ch_mult equals 1
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 * - Recommended when number of channels is 4 or greater.
 *
 */
arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
 * @brief Get the required buffer size for optimized s16 depthwise convolution
 *
@@ -781,7 +909,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
 * Batch argument N is not used.
 * @param[in] filter_dims Filter tensor dimensions.
Format: [1, H, W, C_OUT]
- * @return The function returns required buffer size in bytes
+ * @return The function returns required buffer size in bytes
 *
 */
int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
@@ -808,13 +936,13 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel.
@@ -835,7 +963,6 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
 * - Supported framework: TensorFlow Lite
 * - The following constraints on the arguments apply
 * -# Number of input channels equals number of output channels or ch_mult equals 1
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 * - Recommended when number of channels is 4 or greater.
 *
 */
arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get the required buffer size for optimized s8 depthwise convolution
 *
@@ -857,7 +984,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
 * Batch argument N is not used.
 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
- * @return The function returns required buffer size in bytes
+ * @return The function returns required buffer size in bytes
 *
 */
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
@@ -909,29 +1036,47 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dim
 *
 * @details
 * - Supported framework: TensorFlow Lite
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 */
arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_fc_params *fc_params,
 const cmsis_nn_per_tensor_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
- * @brief Get the required buffer size for S8 basic fully-connected and
- * matrix multiplication layer function for TF Lite
+ * @brief Get size of additional buffer required by arm_fully_connected_s8().
* @param[in] filter_dims dimension of filter * @return The function returns required buffer size in bytes * */ int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); +/** + * @brief Get size of additional buffer required by arm_fully_connected_s8() for processors with DSP extension. + * Refer to arm_fully_connected_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_fully_connected_s8_get_buffer_size(). + * + */ +int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); + +/** + * @brief Get size of additional buffer required by arm_fully_connected_s8() for Arm(R) Helium Architecture case. + * Refer to arm_fully_connected_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_fully_connected_s8_get_buffer_size(). + * + */ +int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims); + /** * @brief Basic s16 Fully Connected function. * @@ -966,23 +1111,21 @@ int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) * * @details * - Supported framework: TensorFlow Lite - * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs. */ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, const cmsis_nn_fc_params *fc_params, const cmsis_nn_per_tensor_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data); + int16_t *output_data); /** - * @brief Get the required buffer size for S16 basic fully-connected and - * matrix multiplication layer function for TF Lite + * @brief Get size of additional buffer required by arm_fully_connected_s16(). * @param[in] filter_dims dimension of filter * @return The function returns required buffer size in bytes * @@ -990,20 +1133,24 @@ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims); /** - * @brief Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @param[in,out] vec_buffer pointer to buffer space for input - * @return The function returns ARM_CMSIS_NN_SUCCESS + * @brief Get size of additional buffer required by arm_fully_connected_s16() for processors with DSP extension. + * Refer to arm_fully_connected_s16_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_fully_connected_s16_get_buffer_size(). * */ +int32_t arm_fully_connected_s16_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); +/** + * @brief Get size of additional buffer required by arm_fully_connected_s16() for Arm(R) Helium Architecture case. + * Refer to arm_fully_connected_s16_get_buffer_size() for function argument details. 
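+ *
+ * A host-side sizing sketch (buffer planning for a Helium target from non-target code; filter_dims assumed
+ * populated by the caller):
+ * <pre>
+ *     int32_t buf_size = arm_fully_connected_s16_get_buffer_size_mve(&filter_dims);
+ * </pre>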
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_fully_connected_s16_get_buffer_size().
+ *
+ */
+int32_t arm_fully_connected_s16_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);

/**
 * @defgroup groupElementwise Elementwise Functions
 *
@@ -1157,24 +1304,46 @@ arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
 * @param[in,out] data pointer to input
 * @param[in] size number of elements
 */
-
-void arm_relu_q7(q7_t *data, uint16_t size);
+void arm_relu_q7(int8_t *data, uint16_t size);

/**
 * @brief s8 ReLU6 function
 * @param[in,out] data pointer to input
 * @param[in] size number of elements
 */
-
-void arm_relu6_s8(q7_t *data, uint16_t size);
+void arm_relu6_s8(int8_t *data, uint16_t size);

/**
 * @brief Q15 RELU function
 * @param[in,out] data pointer to input
 * @param[in] size number of elements
 */
+void arm_relu_q15(int16_t *data, uint16_t size);

-void arm_relu_q15(q15_t *data, uint16_t size);
+/**
+ * @brief s16 neural network activation function using direct table look-up
+ * @param[in] input pointer to input data
+ * @param[out] output pointer to output
+ * @param[in] size number of elements
+ * @param[in] left_shift bit-width of the integer part, assumed to be smaller than 3
+ * @param[in] type type of activation functions
+ *
+ * @details Supported framework: TensorFlow Lite for Microcontrollers.
+ * This activation function must be bit-precise congruent with the corresponding TFLM tanh and sigmoid activation
+ * functions
+ */
+void arm_nn_activation_s16(const int16_t *input,
+ int16_t *output,
+ const uint16_t size,
+ const uint16_t left_shift,
+ const arm_nn_activation_type type);
+
+/**
+ * @defgroup Pooling Pooling Functions
+ *
+ * Perform max and average pooling operations
+ *
+ */

/**
 * @brief s8 average pooling function.
@@ -1204,20 +1373,40 @@ void arm_relu_q15(q15_t *data, uint16_t size);
arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_pool_params *pool_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get the required buffer size for S8 average pooling function
 * @param[in] dim_dst_width output tensor dimension
 * @param[in] ch_src number of input tensor channels
- * @return The function returns required buffer size in bytes
+ * @return The function returns required buffer size in bytes
 *
 */
int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);

+/**
+ * @brief Get the required buffer size for S8 average pooling function for processors with DSP extension.
+ * Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_avgpool_s8_get_buffer_size().
+ *
+ */
+int32_t arm_avgpool_s8_get_buffer_size_dsp(const int dim_dst_width, const int ch_src);
+
+/**
+ * @brief Get the required buffer size for S8 average pooling function for Arm(R) Helium Architecture case.
+ * Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_avgpool_s8_get_buffer_size().
+ *
+ */
+int32_t arm_avgpool_s8_get_buffer_size_mve(const int dim_dst_width, const int ch_src);
+
 /**
 * @brief s16 average pooling function.
* @@ -1256,11 +1445,31 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, * @brief Get the required buffer size for S16 average pooling function * @param[in] dim_dst_width output tensor dimension * @param[in] ch_src number of input tensor channels - * @return The function returns required buffer size in bytes + * @return The function returns required buffer size in bytes * */ int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src); +/** + * @brief Get the required buffer size for S16 average pooling function for processors with DSP extension. + * Refer to arm_avgpool_s16_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_avgpool_s16_get_buffer_size(). + * + */ +int32_t arm_avgpool_s16_get_buffer_size_dsp(const int dim_dst_width, const int ch_src); + +/** + * @brief Get the required buffer size for S16 average pooling function for Arm(R) Helium Architecture case. + * Refer to arm_avgpool_s16_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_avgpool_s16_get_buffer_size(). + * + */ +int32_t arm_avgpool_s16_get_buffer_size_mve(const int dim_dst_width, const int ch_src); + /** * @brief s8 max pooling function. * @@ -1290,10 +1499,10 @@ int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_sr arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q7_t *output_data); + int8_t *output_data); /** * @brief s16 max pooling function. @@ -1431,7 +1640,6 @@ void arm_softmax_u8(const uint8_t *input, const int32_t diff_min, uint8_t *output); - /** * @defgroup Reshape Reshape Functions * @@ -1669,8 +1877,6 @@ void arm_concatenation_s8_w(const int8_t *input, * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. - * */ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, const cmsis_nn_context *output_ctx, @@ -1678,17 +1884,17 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, const cmsis_nn_per_tensor_quant_params *input_quant_params, const cmsis_nn_per_tensor_quant_params *output_quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *state_dims, - q7_t *state_data, + int8_t *state_data, const cmsis_nn_dims *weights_feature_dims, - const q7_t *weights_feature_data, + const int8_t *weights_feature_data, const cmsis_nn_dims *weights_time_dims, - const q7_t *weights_time_data, + const int8_t *weights_time_data, const cmsis_nn_dims *bias_dims, - const q31_t *bias_data, + const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data); + int8_t *output_data); /** * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights @@ -1719,8 +1925,6 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. 
- *
 */
arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
 const cmsis_nn_context *output_ctx,
@@ -1728,17 +1932,85 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
 const cmsis_nn_per_tensor_quant_params *input_quant_params,
 const cmsis_nn_per_tensor_quant_params *output_quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *state_dims,
- q15_t *state_data,
+ int16_t *state_data,
 const cmsis_nn_dims *weights_feature_dims,
- const q7_t *weights_feature_data,
+ const int8_t *weights_feature_data,
 const cmsis_nn_dims *weights_time_dims,
- const q15_t *weights_time_data,
+ const int16_t *weights_time_data,
 const cmsis_nn_dims *bias_dims,
- const q31_t *bias_data,
+ const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);
+
+/**
+ * @defgroup LSTM LSTM Layer Functions
+ *
+ */
+
+/**
+ * @brief LSTM unidirectional function with 8 bit input and output and 16 bit gate output
+ * Peephole connections, projection, clipping, combined input/forget gate and layer normalization are not supported.
+ *
+ * @param[in] scratch_buffers Struct containing scratch buffers
+ * Expected size for each scratch buffer is
+ * lstm_dims->num_batches * lstm_dims->num_outputs.
+ * @param[in] input_data Pointer to input data
+ * @param[in] lstm_dims LSTM input parameters related to dimensions
+ * @param[in] input_to_input_weights Input to input weights
+ * @param[in] input_to_forget_weights Input to forget weights
+ * @param[in] input_to_cell_weights Input to cell weights
+ * @param[in] input_to_output_weights Input to output weights
+ * @param[in] recurrent_to_input_weights Recurrent to input weights
+ * @param[in] recurrent_to_forget_weights Recurrent to forget weights
+ * @param[in] recurrent_to_cell_weights Recurrent to cell weights
+ * @param[in] recurrent_to_output_weights Recurrent to output weights
+ * @param[in] cell_to_input_weights Cell to input weights. Not used.
+ * @param[in] cell_to_forget_weights Cell to forget weights. Not used.
+ * @param[in] cell_to_output_weights Cell to output weights. Not used.
+ * @param[in] projection_weights Projection weights. Not used.
+ * @param[in] lstm LSTM parameters. See struct declaration
+ * @param[in] output_state Pointer to (recurrent) output state
+ * @param[in] cell_state Pointer to cell state
+ * @param[in] output_data Pointer to output data
+ *
+ * @note The following assumptions are made based on LSTM functionality as supported by
+ * Keras version 2.9.0 at the time of development. As stated here,
+ * https://github.com/tensorflow/community/blob/master/rfcs/20180920-unify-rnn-interface.md
+ * Keras's LSTMCell is equivalent to TensorFlow's BasicLSTMCell,
+ * which does not support peephole, clipping or projection.
+ * Layer normalization and combined input/forget gate are not supported either.
+ *
+ * 1 Input to input weight cannot be nullptr. Otherwise nullptr for combined input/forget gate.
+ * 2 Cell weights are not used and should be nullptr. Otherwise needed for peephole connections.
+ * 3 Projection weight is not used and should be nullptr. Otherwise needed for projection.
+ *
+ * @return The function returns ARM_CMSIS_NN_SUCCESS
+ *
+ * @details
+ * 1.
Supported framework: TensorFlow Lite micro
+ *
+ */
+arm_cmsis_nn_status arm_lstm_unidirectional_s16_s8(cmsis_nn_lstm_context *scratch_buffers,
+ const int8_t *input_data,
+ const cmsis_nn_lstm_dims *lstm_dims,
+ const int8_t *input_to_input_weights,
+ const int8_t *input_to_forget_weights,
+ const int8_t *input_to_cell_weights,
+ const int8_t *input_to_output_weights,
+ const int8_t *recurrent_to_input_weights,
+ const int8_t *recurrent_to_forget_weights,
+ const int8_t *recurrent_to_cell_weights,
+ const int8_t *recurrent_to_output_weights,
+ const int16_t *cell_to_input_weights,
+ const int16_t *cell_to_forget_weights,
+ const int16_t *cell_to_output_weights,
+ const int8_t *projection_weights,
+ const cmsis_nn_lstm_params *lstm,
+ int8_t *output_state,
+ int16_t *cell_state,
+ int8_t *output_data);

#ifdef __cplusplus
}
diff --git a/src/third_party/cmsis_nn/Include/arm_nnsupportfunctions.h b/src/third_party/cmsis_nn/Include/arm_nnsupportfunctions.h
index 8860f299..b1deaba7 100644
--- a/src/third_party/cmsis_nn/Include/arm_nnsupportfunctions.h
+++ b/src/third_party/cmsis_nn/Include/arm_nnsupportfunctions.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@@ -21,15 +21,16 @@
 * Title: arm_nnsupportfunctions.h
 * Description: Public header file of support functions for CMSIS NN Library
 *
- * $Date: 30 September 2022
- * $Revision: V.11.0.0
+ * $Date: 13 February 2023
+ * $Revision: V.15.0.0
 *
- * Target Processor: Cortex-M CPUs
+ * Target : Arm(R) M-Profile Architecture
 * -------------------------------------------------------------------- */

#ifndef _ARM_NNSUPPORTFUNCTIONS_H_
#define _ARM_NNSUPPORTFUNCTIONS_H_

+#include "third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h"
#include "third_party/cmsis_nn/Include/arm_nn_math_types.h"
#include "third_party/cmsis_nn/Include/arm_nn_types.h"

@@ -39,6 +40,10 @@
extern "C" {
#endif

+#define USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims) \
+ (dw_conv_params->ch_mult == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1 && \
+ filter_dims->w * filter_dims->h < 512)
+
 #define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0)
 #define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift)
 #define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0
@@ -61,7 +66,7 @@ extern "C" {

/**
 * @brief definition to pack four 8 bit values.
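 * For example, PACK_S8x4_32x1(1, 2, 3, 4) evaluates to 0x04030201.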
*/ -#define PACK_Q7x4_32x1(v0, v1, v2, v3) \ +#define PACK_S8x4_32x1(v0, v1, v2, v3) \ ((((int32_t)(v0) << 0) & (int32_t)0x000000FF) | (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \ (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | (((int32_t)(v3) << 24) & (int32_t)0xFF000000)) @@ -71,16 +76,16 @@ extern "C" { #define PACK_Q15x2_32x1(v0, v1) (((int32_t)v0 & (int32_t)0xFFFF) | ((int32_t)v1 << 16)) /** - * @brief Union for SIMD access of q31/q15/q7 types + * @brief Union for SIMD access of q31/s16/s8 types */ union arm_nnword { - q31_t word; + int32_t word; /**< q31 type */ - q15_t half_words[2]; - /**< q15 type */ - q7_t bytes[4]; - /**< q7 type */ + int16_t half_words[2]; + /**< s16 type */ + int8_t bytes[4]; + /**< s8 type */ }; /** @@ -105,7 +110,6 @@ union arm_nn_long_long * */ - /** * @defgroup supportConversion Data Conversion * @@ -114,22 +118,22 @@ union arm_nn_long_long */ /** - * @brief Converts the elements from a q7 vector to a q15 vector with an added offset - * @param[in] src pointer to the q7 input vector - * @param[out] dst pointer to the q15 output vector + * @brief Converts the elements from a s8 vector to a s16 vector with an added offset + * @param[in] src pointer to the s8 input vector + * @param[out] dst pointer to the s16 output vector * @param[in] block_size length of the input vector - * @param[in] offset q7 offset to be added to each input vector element. + * @param[in] offset s8 offset to be added to each input vector element. * * \par Description: * * The equation used for the conversion process is: * *
- *  dst[n] = (q15_t) src[n] + offset;   0 <= n < block_size.
+ *  dst[n] = (int16_t) src[n] + offset;   0 <= n < block_size.
  * 
* */ -void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset); +void arm_q7_to_q15_with_offset(const int8_t *src, int16_t *dst, uint32_t block_size, int16_t offset); /** * @brief Depthwise conv on an im2col buffer where the input channel equals output channel. @@ -150,17 +154,17 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, * * @details Supported framework: TensorFlow Lite micro. */ -q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, - const q15_t *col, - const uint16_t num_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int32_t out_offset, - const int32_t activation_min, - const int32_t activation_max, - const uint16_t kernel_size, - const int32_t *const output_bias, - q7_t *out); +int8_t *arm_nn_depthwise_conv_s8_core(const int8_t *row, + const int16_t *col, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t kernel_size, + const int32_t *const output_bias, + int8_t *out); /** * @brief General Matrix-multiplication function with per-channel requantization. @@ -184,20 +188,20 @@ q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, * * @details Supported framework: TensorFlow Lite */ -q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, - const q7_t *input_col, - const uint16_t output_ch, - const uint16_t col_batches, - const int32_t *output_shift, - const int32_t *output_mult, - const int32_t out_offset, - const int32_t col_offset, - const int32_t row_offset, - const int16_t out_activation_min, - const int16_t out_activation_max, - const uint16_t row_len, - const int32_t *const bias, - q7_t *out); +int8_t *arm_nn_mat_mult_s8(const int8_t *input_row, + const int8_t *input_col, + const uint16_t output_ch, + const uint16_t col_batches, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t out_offset, + const int32_t col_offset, + const int32_t row_offset, + const int16_t out_activation_min, + const int16_t out_activation_max, + const uint16_t row_len, + const int32_t *const bias, + int8_t *out); /** * @brief Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution. * @param[in] input_a pointer to operand A @@ -219,16 +223,16 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, * clamped in the range provided by activation min and max. * Supported framework: TensorFlow Lite micro. */ -q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, - const q15_t *input_b, - const int32_t output_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int16_t activation_min, - const int16_t activation_max, - const int32_t num_col_a, - const int64_t *const output_bias, - q15_t *out_0); +int16_t *arm_nn_mat_mult_kernel_s16(const int8_t *input_a, + const int16_t *input_b, + const int32_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int16_t activation_min, + const int16_t activation_max, + const int32_t num_col_a, + const int64_t *const output_bias, + int16_t *out_0); /** * @brief General Vector by Matrix multiplication with requantization and storage of result. @@ -319,14 +323,16 @@ int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements, * @param[in] dst_offset Offset to be applied the output result * @param[in] activation_min Minimum value to clamp down the output. Range : int8 * @param[in] activation_max Maximum value to clamp up the output. 
Range : int8 + * @param[in] rhs_cols_offset Offset between input columns. Used to handle non-unity strides + * Expected value : x * rhs_cols, where x >= 1 * * @return The function returns ARM_CMSIS_NN_SUCCESS * */ -arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, - const q7_t *rhs, - const q31_t *bias, - q7_t *dst, +arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const int8_t *lhs, + const int8_t *rhs, + const int32_t *bias, + int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, @@ -335,7 +341,8 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, - const int32_t activation_max); + const int32_t activation_max, + const int32_t rhs_cols_offset); /** * @brief s8 Vector by Matrix (transposed) multiplication @@ -346,7 +353,6 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, * @param[out] dst Output vector * @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector. * Range: -127 to 128 - * @param[in] rhs_offset Not used * @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128 * @param[in] dst_multiplier Output multiplier * @param[in] dst_shift Output shift @@ -360,12 +366,11 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, * @return The function returns ARM_CMSIS_NN_SUCCESS * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, - const q7_t *rhs, - const q31_t *bias, - q7_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, + const int8_t *rhs, + const int32_t *bias, + int8_t *dst, const int32_t lhs_offset, - const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, @@ -392,10 +397,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, * @return The function returns ARM_CMSIS_NN_SUCCESS * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, - const q7_t *rhs, - const q63_t *bias, - q15_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const int16_t *lhs, + const int8_t *rhs, + const int64_t *bias, + int16_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, @@ -411,7 +416,6 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, * @param[out] dst Output vector * @param[in] lhs_offset Offset to be added to the input values of the left-hand side * vector. Range: -127 to 128 - * @param[in] rhs_offset Not used * @param[in] scatter_offset Address offset for dst. First output is stored at 'dst', the * second at 'dst + scatter_offset' and so on. 
* @param[in] dst_multiplier Output multiplier @@ -424,11 +428,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, * @return The function returns ARM_CMSIS_NN_SUCCESS * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, - const q7_t *rhs, - q15_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const int8_t *lhs, + const int8_t *rhs, + int16_t *dst, const int32_t lhs_offset, - const int32_t rhs_offset, const int32_t scatter_offset, const int32_t dst_multiplier, const int32_t dst_shift, @@ -466,8 +469,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, * - Output bias * - rhs */ -arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, - const q7_t *rhs, +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const int8_t *lhs, + const int8_t *rhs, const int32_t lhs_offset, const int32_t active_ch, const int32_t total_ch, @@ -478,7 +481,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, - q7_t *out); + int8_t *out); /** * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. @@ -509,8 +512,8 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, * - Output bias * - rhs */ -arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, - const q7_t *rhs, +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const int8_t *lhs, + const int8_t *rhs, const int32_t lhs_offset, const int32_t active_ch, const int32_t total_ch, @@ -521,7 +524,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, - q7_t *out); + int8_t *out); /** * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. @@ -550,7 +553,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, * - rhs */ int16_t *arm_nn_depthwise_conv_nt_t_s16(const int16_t *lhs, - const q7_t *rhs, + const int8_t *rhs, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, @@ -561,36 +564,13 @@ int16_t *arm_nn_depthwise_conv_nt_t_s16(const int16_t *lhs, int16_t *out); /** - *@brief Matrix-multiplication function for convolution with reordered columns - *@param[in] pA pointer to operand A - *@param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - *@param[in] ch_im_out numRow of A - *@param[in] numCol_A numCol of A - *@param[in] bias_shift amount of left-shift for bias - *@param[in] out_shift amount of right-shift for output - *@param[in] bias the bias - *@param[in,out] pOut pointer to output - *@return The function returns the incremented output pointer - * - *@details This function assumes that data in pInBuffer are reordered - */ -q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, - const q15_t *pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t *bias, - q7_t *pOut); - -/** - @brief Read 2 q15 elements and post increment pointer. + @brief Read 2 s16 elements and post increment pointer. @param[in] in_q15 Pointer to pointer that holds address of input. 
@return q31 value */ -__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15) +__STATIC_FORCEINLINE int32_t arm_nn_read_q15x2_ia(const int16_t **in_q15) { - q31_t val; + int32_t val; memcpy(&val, *in_q15, 4); *in_q15 += 2; @@ -599,51 +579,51 @@ __STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15) } /** - @brief Read 4 q7 from q7 pointer and post increment pointer. - @param[in] in_q7 Pointer to pointer that holds address of input. + @brief Read 4 s8 from s8 pointer and post increment pointer. + @param[in] in_s8 Pointer to pointer that holds address of input. @return q31 value */ -__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7) +__STATIC_FORCEINLINE int32_t arm_nn_read_s8x4_ia(const int8_t **in_s8) { - q31_t val; - memcpy(&val, *in_q7, 4); - *in_q7 += 4; + int32_t val; + memcpy(&val, *in_s8, 4); + *in_s8 += 4; return (val); } /** - @brief Read 2 q15 from q15 pointer. - @param[in] in_q15 pointer to address of input. - @return q31 value + @brief Read 2 int16 values from int16 pointer. + @param[in] in pointer to address of input. + @return s32 value */ -__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15) +__STATIC_FORCEINLINE int32_t arm_nn_read_s16x2(const int16_t *in) { - q31_t val; - memcpy(&val, in_q15, 4); + int32_t val; + memcpy(&val, in, 4); return (val); } /** - @brief Read 4 q7 values. - @param[in] in_q7 pointer to address of input. - @return q31 value + @brief Read 4 s8 values. + @param[in] in_s8 pointer to address of input. + @return s32 value */ -__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7) +__STATIC_FORCEINLINE int32_t arm_nn_read_s8x4(const int8_t *in_s8) { - q31_t val; - memcpy(&val, in_q7, 4); + int32_t val; + memcpy(&val, in_s8, 4); return (val); } /** - @brief Write four q7 to q7 pointer and increment pointer afterwards. + @brief Write four s8 to s8 pointer and increment pointer afterwards. @param[in] in Double pointer to input value @param[in] value Four bytes to copy */ -__STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value) +__STATIC_FORCEINLINE void arm_nn_write_s8x4_ia(int8_t **in, int32_t value) { memcpy(*in, &value, 4); *in += 4; @@ -656,7 +636,7 @@ __STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value) * @param[in] block_size Number of bytes to copy. 
* */ -__STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t block_size) +__STATIC_FORCEINLINE void arm_memset_s8(int8_t *dst, const int8_t val, uint32_t block_size) { #if defined(ARM_MATH_MVEI) __asm volatile(" vdup.8 q0, %[set_val] \n" @@ -676,61 +656,40 @@ __STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t bloc #if defined(ARM_MATH_DSP) /** - * @brief read and expand one q7 word into two q15 words + * @brief read and expand one s8 word into two s16 words */ -__STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t *out1, q31_t *out2) +__STATIC_FORCEINLINE const int8_t *read_and_pad(const int8_t *source, int32_t *out1, int32_t *out2) { - q31_t inA = arm_nn_read_q7x4_ia(&source); - q31_t inAbuf1 = __SXTB16_RORn((uint32_t)inA, 8); - q31_t inAbuf2 = __SXTB16(inA); - -#ifndef ARM_MATH_BIG_ENDIAN - *out2 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16)); - *out1 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16)); -#else - *out1 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16)); - *out2 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16)); -#endif + int32_t inA = arm_nn_read_s8x4_ia(&source); + int32_t inAbuf1 = SXTB16_RORn((uint32_t)inA, 8); + int32_t inAbuf2 = SXTB16(inA); + + #ifndef ARM_MATH_BIG_ENDIAN + *out2 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16)); + *out1 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16)); + #else + *out1 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16)); + *out2 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16)); + #endif return source; } /** - * @brief read and expand one q7 word into two q15 words with reordering + * @brief read and expand one s8 word into two s16 words with reordering */ -__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t *out1, q31_t *out2) +__STATIC_FORCEINLINE const int8_t *read_and_pad_reordered(const int8_t *source, int32_t *out1, int32_t *out2) { - q31_t inA = arm_nn_read_q7x4_ia(&source); -#ifndef ARM_MATH_BIG_ENDIAN - *out2 = __SXTB16(__ROR((uint32_t)inA, 8)); - *out1 = __SXTB16(inA); -#else - *out1 = __SXTB16(__ROR((uint32_t)inA, 8)); - *out2 = __SXTB16(inA); -#endif - - return source; -} - -/** - * @brief read and expand one q7 word into two q15 words with reordering and add an offset - */ -__STATIC_FORCEINLINE const q7_t * -read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2, q31_t offset) -{ - q31_t inA = arm_nn_read_q7x4_ia(&source); - -#ifndef ARM_MATH_BIG_ENDIAN - *out2 = __SXTB16(__ROR((uint32_t)inA, 8)); - *out1 = __SXTB16(inA); -#else - *out1 = __SXTB16(__ROR((uint32_t)inA, 8)); - *out2 = __SXTB16(inA); -#endif - *out1 = __QADD16(*out1, offset); - *out2 = __QADD16(*out2, offset); + int32_t inA = arm_nn_read_s8x4_ia(&source); + #ifndef ARM_MATH_BIG_ENDIAN + *out2 = SXTB16(ROR((uint32_t)inA, 8)); + *out1 = SXTB16(inA); + #else + *out1 = SXTB16(ROR((uint32_t)inA, 8)); + *out2 = SXTB16(inA); + #endif return source; } @@ -759,17 +718,17 @@ read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2, * clamped in the range provided by activation min and max. * Supported framework: TensorFlow Lite micro. 
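 *
 * A sketch of the per-channel requantization each output accumulator passes through
 * (clamping shown with the MAX/MIN helper macros; acc and ch are illustrative names):
 * <pre>
 *     acc += output_bias[ch];
 *     acc = arm_nn_requantize(acc, out_mult[ch], out_shift[ch]) + out_offset;
 *     acc = MAX(activation_min, MIN(acc, activation_max));
 * </pre>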
*/ -q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, - const q15_t *input_b, - const uint16_t output_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int32_t out_offset, - const int16_t activation_min, - const int16_t activation_max, - const uint16_t num_col_a, - const int32_t *const output_bias, - q7_t *out_0); +int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t *input_a, + const int16_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + int8_t *out_0); /** * @brief Common softmax function for s8 input and s8 or s16 output @@ -799,9 +758,9 @@ void arm_nn_softmax_common_s8(const int8_t *input, * @brief macro for adding rounding offset */ #ifndef ARM_NN_TRUNCATE -#define NN_ROUND(out_shift) ((0x1 << out_shift) >> 1) + #define NN_ROUND(out_shift) ((0x1 << out_shift) >> 1) #else -#define NN_ROUND(out_shift) 0 + #define NN_ROUND(out_shift) 0 #endif // Macros for shortening quantization functions' names and avoid long lines @@ -823,18 +782,18 @@ void arm_nn_softmax_common_s8(const int8_t *input, * @return Result of multiplication. * */ -__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t m2) +__STATIC_FORCEINLINE int32_t arm_nn_doubling_high_mult(const int32_t m1, const int32_t m2) { - q31_t result = 0; + int32_t result = 0; // Rounding offset to add for a right shift of 31 - q63_t mult = 1 << 30; + int64_t mult = 1 << 30; if ((m1 < 0) ^ (m2 < 0)) { mult = 1 - mult; } // Gets resolved as a SMLAL instruction - mult = mult + (q63_t)m1 * m2; + mult = mult + (int64_t)m1 * m2; // Utilize all of the upper 32 bits. This is the doubling step // as well. @@ -861,9 +820,9 @@ __STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t * this function. * */ -__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, const q31_t m2) +__STATIC_FORCEINLINE int32_t arm_nn_doubling_high_mult_no_sat(const int32_t m1, const int32_t m2) { - q31_t result = 0; + int32_t result = 0; union arm_nn_long_long mult; // Rounding offset to add for a right shift of 31 @@ -871,7 +830,7 @@ __STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, cons mult.word.high = 0; // Gets resolved as a SMLAL instruction - mult.long_long = mult.long_long + (q63_t)m1 * m2; + mult.long_long = mult.long_long + (int64_t)m1 * m2; // Utilize all of the upper 32 bits. This is the doubling step // as well. @@ -888,17 +847,17 @@ __STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, cons * @return Rounded result of division. Midpoint is rounded away from zero. 
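 *
 * Worked examples of the rounding behaviour (derived from the implementation below):
 * <pre>
 *     arm_nn_divide_by_power_of_two(5, 1)  ==  3   //  2.5 rounds away from zero
 *     arm_nn_divide_by_power_of_two(-5, 1) == -3   // -2.5 rounds away from zero
 * </pre>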
* */ -__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent) +__STATIC_FORCEINLINE int32_t arm_nn_divide_by_power_of_two(const int32_t dividend, const int32_t exponent) { - q31_t result = 0; - const q31_t remainder_mask = (1 << exponent) - 1; + int32_t result = 0; + const int32_t remainder_mask = (1 << exponent) - 1; int32_t remainder = remainder_mask & dividend; // Basic division result = dividend >> exponent; // Adjust 'result' for rounding (mid point away from zero) - q31_t threshold = remainder_mask >> 1; + int32_t threshold = remainder_mask >> 1; if (result < 0) { threshold++; @@ -920,7 +879,7 @@ __STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, c * @return Returns (val * multiplier)/(2 ^ shift) * */ -__STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift) +__STATIC_FORCEINLINE int32_t arm_nn_requantize(const int32_t val, const int32_t multiplier, const int32_t shift) { #ifdef CMSIS_NN_USE_SINGLE_ROUNDING const int64_t total_shift = 31 - shift; @@ -946,12 +905,14 @@ __STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multip * @return Returns (val * multiplier)/(2 ^ shift) * */ -__STATIC_FORCEINLINE q31_t arm_nn_requantize_s64(const q63_t val, const q31_t reduced_multiplier, const q31_t shift) +__STATIC_FORCEINLINE int32_t arm_nn_requantize_s64(const int64_t val, + const int32_t reduced_multiplier, + const int32_t shift) { - const q63_t new_val = val * reduced_multiplier; + const int64_t new_val = val * reduced_multiplier; - q31_t result = new_val >> (14 - shift); // 64->32 bit reduction - result = (result + 1) >> 1; // Last shift position and insert round + int32_t result = new_val >> (14 - shift); // 64->32 bit reduction + result = (result + 1) >> 1; // Last shift position and insert round return result; } @@ -963,7 +924,7 @@ __STATIC_FORCEINLINE q31_t arm_nn_requantize_s64(const q63_t val, const q31_t re * @param[in] block_size Number of bytes to copy. * */ -__STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__RESTRICT src, uint32_t block_size) +__STATIC_FORCEINLINE void arm_memcpy_s8(int8_t *__RESTRICT dst, const int8_t *__RESTRICT src, uint32_t block_size) { #if defined(ARM_MATH_MVEI) __asm volatile(" wlstp.8 lr, %[cnt], 1f \n" @@ -987,7 +948,7 @@ __STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__REST * @param[in] block_size Number of bytes to copy. * */ -__STATIC_FORCEINLINE void arm_memcpy_q15(q15_t *__RESTRICT dst, const q15_t *__RESTRICT src, uint32_t block_size) +__STATIC_FORCEINLINE void arm_memcpy_q15(int16_t *__RESTRICT dst, const int16_t *__RESTRICT src, uint32_t block_size) { memcpy(dst, src, block_size); } @@ -1000,7 +961,7 @@ __STATIC_FORCEINLINE void arm_memcpy_q15(q15_t *__RESTRICT dst, const q15_t *__R * @return Result of multiplication. * */ -__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const q31_t m2) +__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const int32_t m2) { return vqrdmulhq_n_s32(m1, m2); } @@ -1013,7 +974,7 @@ __STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, co * @return Rounded result of division. Midpoint is rounded away from zero. 
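/*
 * Editor's note (not part of the patch): a minimal sketch of how
 * arm_nn_requantize() is typically used to fold a MAC accumulator back into
 * the s8 output scale. The helper name and the offset/clamp parameters are
 * illustrative assumptions; in practice they come from the per-channel
 * quantization data. Following the TFLM convention, the result approximates
 * acc * (multiplier / 2^31) * 2^shift.
 */
static int8_t requantize_to_s8(int32_t acc, int32_t multiplier, int32_t shift,
                               int32_t out_offset, int32_t act_min, int32_t act_max)
{
    int32_t out = arm_nn_requantize(acc, multiplier, shift);
    out += out_offset;       /* add the output zero point       */
    out = MAX(out, act_min); /* clamp to the activation range   */
    out = MIN(out, act_max);
    return (int8_t)out;
}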
* */ -__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const q31_t exponent) +__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const int32_t exponent) { const int32x4_t shift = vdupq_n_s32(-exponent); const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31); @@ -1030,9 +991,9 @@ __STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t di * @return Returns (val * multiplier)/(2 ^ shift) * */ -__STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31_t multiplier, const q31_t shift) +__STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const int32_t multiplier, const int32_t shift) { -#ifdef CMSIS_NN_USE_SINGLE_ROUNDING + #ifdef CMSIS_NN_USE_SINGLE_ROUNDING const int right_shift = MIN(-1, shift); const int left_shift = shift - right_shift; @@ -1043,10 +1004,10 @@ __STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31 result = vrshlq_s32(result, right_shift_dup); return result; -#else + #else return arm_divide_by_power_of_two_mve( arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier), RIGHT_SHIFT(shift)); -#endif + #endif } __STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve_32x4(const int32x4_t m1, const int32x4_t m2) @@ -1066,7 +1027,7 @@ __STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, const int32x4_t multiplier, const int32x4_t shift) { -#ifdef CMSIS_NN_USE_SINGLE_ROUNDING + #ifdef CMSIS_NN_USE_SINGLE_ROUNDING const int32x4_t right_shift = vminq_s32(vdupq_n_s32(-1), shift); const int32x4_t left_shift = vqsubq_s32(shift, right_shift); @@ -1074,7 +1035,7 @@ __STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, result = vrshlq_s32(result, right_shift); return result; -#else + #else const int32x4_t zz = vdupq_n_s32(0); const mve_pred16_t p = vcmpgtq_n_s32(shift, 0); @@ -1083,7 +1044,7 @@ __STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier), right_shift); -#endif + #endif } #endif @@ -1122,7 +1083,7 @@ __STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values(int32_t val) return SELECT_USING_MASK(mask, NN_Q31_MAX, result); } -__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp) +__STATIC_FORCEINLINE int32_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp) { const int32_t thresh = ((1 << (31 - exp)) - 1); int32_t result = val << exp; @@ -1146,18 +1107,191 @@ __STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val } /** - @brief Write 2 q15 elements and post increment pointer. + @brief Write 2 s16 elements and post increment pointer. @param[in] dest_q15 Pointer to pointer that holds address of destination. @param[in] src_q31 Input value to be written. */ -__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31) +__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(int16_t **dest_q15, int32_t src_q31) { - q31_t val = src_q31; + int32_t val = src_q31; memcpy(*dest_q15, &val, 4); *dest_q15 += 2; } +/** + @brief Write 2 s8 elements and post increment pointer. + @param[in] dst Pointer to pointer that holds address of destination. + @param[in] src Input value to be written. 
+ */
+__STATIC_FORCEINLINE void arm_nn_write_s8x2_ia(int8_t **dst, int16_t src)
+{
+    memcpy(*dst, &src, 2);
+    *dst += 2;
+}
+
+// Support functions for LSTM
+/**
+ * @brief Updates the LSTM state for one iteration step
+ *
+ * param[in]    input                      Input data
+ * param[in]    input_to_input_weight      Input to input gate weights
+ * param[in]    input_to_forget_weight     Input to forget gate weights
+ * param[in]    input_to_cell_weight       Input to cell gate weights
+ * param[in]    input_to_output_weight     Input to output weights
+ * param[in]    recurrent_to_input_weight  Recurrent signal to input weights
+ * param[in]    recurrent_to_forget_weight Recurrent signal to forget gate weights
+ * param[in]    recurrent_to_cell_weight   Recurrent signal to cell gate weights
+ * param[in]    recurrent_to_output_weight Recurrent signal to output weights
+ * param[in]    lstm                       LSTM parameters
+ * param[in]    n_batch                    Batch size
+ * param[in]    n_cell                     Cell size
+ * param[in]    n_input                    Input size
+ * param[in]    n_output                   Output size
+ * param[out]   output_state               Output state
+ * param[out]   cell_state                 Internal state
+ * param[out]   output                     Output signal
+ * param[in]    scratch_buffers            Struct containing scratch buffers
+ */
+arm_cmsis_nn_status arm_nn_lstm_step_s8_s16(const int8_t *input,
+                                            const int8_t *input_to_input_weight,
+                                            const int8_t *input_to_forget_weight,
+                                            const int8_t *input_to_cell_weight,
+                                            const int8_t *input_to_output_weight,
+                                            const int8_t *recurrent_to_input_weight,
+                                            const int8_t *recurrent_to_forget_weight,
+                                            const int8_t *recurrent_to_cell_weight,
+                                            const int8_t *recurrent_to_output_weight,
+                                            const cmsis_nn_lstm_params *lstm,
+                                            const int n_batch,
+                                            const int n_cell,
+                                            const int n_input,
+                                            const int n_output,
+                                            int8_t *output_state,
+                                            int16_t *cell_state,
+                                            int8_t *output,
+                                            cmsis_nn_lstm_context *scratch_buffers);
+
+/**
+ * @brief Updates an LSTM gate for an iteration step of the LSTM function, int8x8_16 version.
+ *
+ * param[in]    input                       Input data
+ * param[in]    input_to_gate_weights       Input to gate weights
+ * param[in]    input_to_gate_bias          Input to gate bias
+ * param[in]    input_to_gate_scaling       Input to gate scaling
+ * param[in]    output_state                Output state
+ * param[in]    recurrent_to_gate_weights   Recurrent to gate weights
+ * param[in]    recurrent_to_gate_bias      Recurrent to gate bias
+ * param[in]    recurrent_to_gate_scaling   Recurrent to gate scaling
+ * param[in]    n_batch                     Batch size
+ * param[in]    n_input                     Input size
+ * param[in]    n_output                    Output size
+ * param[in]    n_cell                      Cell size
+ * param[in]    activation_type             Activation type (sigmoid or tanh)
+ * param[out]   gate                        Gate output, size n_batch * n_cell
+ */
+void arm_nn_lstm_calculate_gate_s8_s16(const int8_t *input,
+                                       const int8_t *input_to_gate_weights,
+                                       const int32_t *input_to_gate_bias,
+                                       const cmsis_nn_scaling input_to_gate_scaling,
+                                       const int8_t *output_state,
+                                       const int8_t *recurrent_to_gate_weights,
+                                       const int32_t *recurrent_to_gate_bias,
+                                       const cmsis_nn_scaling recurrent_to_gate_scaling,
+                                       const int32_t n_batch,
+                                       const int32_t n_input,
+                                       const int32_t n_output,
+                                       const int32_t n_cell,
+                                       const arm_nn_activation_type activation_type,
+                                       int16_t *gate);
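/*
 * Editor's note (not part of the patch): a float reference sketch of the
 * per-gate math that arm_nn_lstm_calculate_gate_s8_s16() implements in
 * fixed point: gate = act(W_input * x + bias + W_recurrent * h), where act
 * is sigmoid or tanh depending on activation_type. All names below are
 * illustrative assumptions, not CMSIS-NN API.
 */
#include <math.h>

static void lstm_gate_ref(const float *input, const float *w_in, const float *bias,
                          const float *output_state, const float *w_rec,
                          int n_input, int n_output, int n_cell,
                          int use_sigmoid, float *gate)
{
    for (int c = 0; c < n_cell; c++)
    {
        float acc = bias ? bias[c] : 0.0f;
        for (int i = 0; i < n_input; i++) /* input contribution */
        {
            acc += w_in[c * n_input + i] * input[i];
        }
        for (int o = 0; o < n_output; o++) /* recurrent contribution */
        {
            acc += w_rec[c * n_output + o] * output_state[o];
        }
        gate[c] = use_sigmoid ? 1.0f / (1.0f + expf(-acc)) : tanhf(acc);
    }
}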
+
+/**
+ * @brief Update cell state for a single LSTM iteration step, int8x8_16 version.
+ * @param[in]     n_block             Total number of cells for all batches
+ * @param[in]     cell_state_scale    Scaling factor of cell state
+ * @param[in,out] cell_state          Input/output vector, size n_batch * n_cell
+ * @param[in]     input_gate          Input vector of size n_block
+ * @param[in]     forget_gate         Input vector of size n_block
+ * @param[in]     cell_gate           Input vector of size n_block
+ */
+void arm_nn_lstm_update_cell_state_s16(const int32_t n_block,
+                                       const int32_t cell_state_scale,
+                                       int16_t *cell_state,
+                                       const int16_t *input_gate,
+                                       const int16_t *forget_gate,
+                                       const int16_t *cell_gate);
+
+/**
+ * @brief Calculate the output state tensor of an LSTM step, s8 input/output and s16 weight version.
+ *
+ * @param[in]       n_batch             The number of distinct vectors in each array
+ * @param[in]       n_cell              Number of cells
+ * @param[in,out]   cell_state          Cell state, size n_batch * n_cell
+ * @param[in]       cell_state_scale    Scaling of cell_state
+ * @param[in]       output_gate         Output gate
+ * @param[in]       hidden_scale        Effective scaling of cell_state .* output_gate
+ * @param[in]       hidden_offset       Zero point for cell_state .* output_gate
+ * @param[out]      output_state        Output state
+ * @param[in]       cell_gate_scratch   Scratch buffer
+ */
+void arm_nn_lstm_update_output_s8_s16(const int n_batch,
+                                      const int n_cell,
+                                      int16_t *cell_state,
+                                      const int32_t cell_state_scale,
+                                      const int16_t *output_gate,
+                                      const cmsis_nn_scaling hidden_scale,
+                                      const int32_t hidden_offset,
+                                      int8_t *output_state,
+                                      int16_t *cell_gate_scratch);
+
+/**
+ * @brief Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch dimension composed of
+ * input vectors independent from each other) and accumulates the result into the passed result buffer.
+ *
+ * @param[in]   lhs_in           Batched vector
+ * @param[in]   rhs_in           Weights - input matrix (H(Rows)xW(Columns))
+ * @param[in]   bias             Bias vector
+ * @param[out]  dst              Output
+ * @param[in]   dst_offset       Output offset
+ * @param[in]   dst_multiplier   Multiplier for quantization
+ * @param[in]   dst_shift        Shift for quantization
+ * @param[in]   rhs_cols         Vector/matrix column length
+ * @param[in]   rhs_rows         Row count of matrix
+ * @param[in]   batch            Batch size
+ */
+void arm_nn_vec_mat_mul_result_acc_s8(const int8_t *lhs_in,
+                                      const int8_t *rhs_in,
+                                      const int32_t *bias,
+                                      int16_t *dst,
+                                      const int32_t dst_offset,
+                                      const int32_t dst_multiplier,
+                                      const int32_t dst_shift,
+                                      const int32_t rhs_cols,
+                                      const int32_t rhs_rows,
+                                      const int32_t batch);
+
+/**
+ * @brief s16 elementwise multiplication with s8 output
+ * @param[in]       input_1_vect    pointer to input vector 1
+ * @param[in]       input_2_vect    pointer to input vector 2
+ * @param[in,out]   output          pointer to output vector
+ * @param[in]       out_offset      output offset
+ * @param[in]       out_mult        output multiplier
+ * @param[in]       out_shift       output shift
+ * @param[in]       block_size      number of samples
+ * @return          The function returns ARM_CMSIS_NN_SUCCESS
+ *
+ * @details   Supported framework: TensorFlow Lite micro
+ */
+arm_cmsis_nn_status arm_elementwise_mul_s16_s8(const int16_t *input_1_vect,
+                                               const int16_t *input_2_vect,
+                                               int8_t *output,
+                                               const int32_t out_offset,
+                                               const int32_t out_mult,
+                                               const int32_t out_shift,
+                                               const int32_t block_size);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_nn_activation_s16.c b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_nn_activation_s16.c
new file mode 100644
index 00000000..51b736f6
--- /dev/null
+++ b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_nn_activation_s16.c
@@ -0,0 +1,119 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2020, 2022 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_activation_s16.c
+ * Description:  s16 neural network activation function using direct table look-up
+ *
+ * $Date:        8 September 2022
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "third_party/cmsis_nn/Include/arm_nn_tables.h"
+#include "third_party/cmsis_nn/Include/arm_nnfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Acti
+ * @{
+ */
+
+/*
+ * @brief Neural network activation function using direct table look-up
+ *
+ * @note Refer header file for details.
+ *
+ */
+
+void arm_nn_activation_s16(const int16_t *input,
+                           int16_t *output,
+                           const uint16_t size,
+                           const uint16_t left_shift,
+                           const arm_nn_activation_type type)
+{
+    uint32_t abs_input_shift, max_saturation;
+    switch (type)
+    {
+    case ARM_SIGMOID:
+        abs_input_shift = 9;
+        max_saturation = 0x7FFF << 10;
+        break;
+    case ARM_TANH:
+    default:
+        abs_input_shift = 8;
+        max_saturation = 0xFFFF << 8;
+        break;
+    }
+
+    // Use the LUT for sigmoid and take into account that
+    // tanh(x) = 2*sigmoid(2*x) - 1
+    int32_t input_multiplier = ((int32_t)3) << left_shift;
+
+    for (int i = 0; i < size; ++i, input++, output++)
+    {
+        int32_t input_data = ((*input) * input_multiplier);
+
+        uint32_t abs_input_data = input_data > 0 ? input_data : -input_data;
+
+        uint32_t uh = abs_input_data >> abs_input_shift;
+
+        uint32_t result;
+
+        if (uh >= 255)
+        {
+            result = max_saturation;
+        }
+        else
+        {
+            uint32_t ua = sigmoid_table_uint16[uh];
+            uint32_t ub = sigmoid_table_uint16[uh + 1];
+            uint32_t ut;
+            if (type == ARM_SIGMOID)
+            {
+                ut = abs_input_data & 0x1ff;
+            }
+            else
+            {
+                ut = abs_input_data & 0x0ff;
+            }
+            result = (ua << abs_input_shift) + ut * (ub - ua);
+        }
+        if (type == ARM_SIGMOID)
+        {
+            result = (input_data >= 0) ? (result + (1 << 9)) : ((1 << 25) - result + (1 << 9) - 1);
+            result >>= 10;
+        }
+        else
+        {
+            result = (input_data >= 0) ? (result - (1 << 23)) + (1 << 7) : ((-result + (1 << 23)) + (1 << 7) - 1);
+            result >>= 8;
+        }
+        *output = (int16_t)result;
+    }
+}
+
+/**
+ * @} end of Acti group
+ */
diff --git a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu6_s8.c b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu6_s8.c
index 3d66927f..a9ecb127 100644
--- a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu6_s8.c
+++ b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu6_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
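/*
 * Editor's note (not part of the patch): a minimal call sketch for the new
 * LUT-based s16 activation above. The buffer contents and the left_shift
 * value are illustrative assumptions; left_shift rescales the input
 * Q-format before the table lookup, and ARM_SIGMOID/ARM_TANH select the
 * output mapping.
 */
static void tanh_s16_example(void)
{
    int16_t act_in[4] = {-16384, -512, 512, 16384};
    int16_t act_out[4];
    arm_nn_activation_s16(act_in, act_out, 4, 3, ARM_TANH);
}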
+ * SPDX-FileCopyrightText: Copyright 2010-2019, 2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_relu6_s8.c * Description: Basic s8 version of ReLU6 * - * $Date: 09. October 2020 - * $Revision: V.1.0.1 + * $Date: 26 October 2022 + * $Revision: V.1.0.2 * * Target Processor: Cortex-M cores * @@ -47,7 +47,7 @@ * */ -void arm_relu6_s8(q7_t *data, uint16_t size) +void arm_relu6_s8(int8_t *data, uint16_t size) { int32_t i; diff --git a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q15.c b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q15.c index cede0e9c..d079167b 100644 --- a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q15.c +++ b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q15.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_relu_q15.c * Description: Q15 version of ReLU * - * $Date: 4 Aug 2022 - * $Revision: V.1.0.3 + * $Date: 31 January 2023 + * $Revision: V.1.1.1 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -47,28 +47,28 @@ * */ -void arm_relu_q15(q15_t *data, uint16_t size) +void arm_relu_q15(int16_t *data, uint16_t size) { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* Run the following code for M cores with DSP extension */ uint16_t i = size >> 1; - q15_t *input = data; - q15_t *output = data; - q31_t in; - q31_t buf; - q31_t mask; + int16_t *input = data; + int16_t *output = data; + int32_t in; + int32_t buf; + int32_t mask; while (i) { - in = arm_nn_read_q15x2_ia((const q15_t **)&input); + in = arm_nn_read_q15x2_ia((const int16_t **)&input); /* extract the first bit */ - buf = __ROR(in & 0x80008000, 15); + buf = ROR(in & 0x80008000, 15); /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ - mask = __QSUB16(0x00000000, buf); + mask = QSUB16(0x00000000, buf); arm_nn_write_q15x2_ia(&output, in & (~mask)); i--; diff --git a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q7.c b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q7.c index 7c7a187f..58d22848 100644 --- a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q7.c +++ b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q7.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_relu_q7.c * Description: Q7 version of ReLU * - * $Date: 4 Aug 2022 - * $Revision: V.1.1.4 + * $Date: 31 January 2023 + * $Revision: V.1.2.1 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -47,30 +47,30 @@ * */ -void arm_relu_q7(q7_t *data, uint16_t size) +void arm_relu_q7(int8_t *data, uint16_t size) { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* Run the following code for M cores with DSP extension */ uint16_t i = size >> 2; - q7_t *input = data; - q7_t *output = data; - q31_t in; - q31_t buf; - q31_t mask; + int8_t *input = data; + int8_t *output = data; + int32_t in; + int32_t buf; + int32_t mask; while (i) { - in = 
arm_nn_read_q7x4_ia((const q7_t **)&input); + in = arm_nn_read_s8x4_ia((const int8_t **)&input); /* extract the first bit */ - buf = (int32_t)__ROR((uint32_t)in & 0x80808080, 7); + buf = (int32_t)ROR((uint32_t)in & 0x80808080, 7); /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ - mask = __QSUB8(0x00000000, buf); + mask = QSUB8(0x00000000, buf); - arm_nn_write_q7x4_ia(&output, in & (~mask)); + arm_nn_write_s8x4_ia(&output, in & (~mask)); i--; } diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s16.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s16.c index 54c88349..7e3dce05 100644 --- a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s16.c +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_elementwise_add_s16 * Description: Elementwise add * - * $Date: 10 May 2022 - * $Revision: V.2.1.0 + * $Date: 24 Oct 2022 + * $Revision: V.2.2.0 * * Target Processor: Cortex-M CPUs * @@ -69,13 +69,47 @@ arm_cmsis_nn_status arm_elementwise_add_s16(const int16_t *input_1_vect, (void)input_1_offset; (void)input_2_offset; (void)out_offset; + +#if defined(ARM_MATH_MVEI) + + int32_t count = block_size; + + while (count > 0) + { + + mve_pred16_t pred = vctp32q(count); + + int32x4_t vect_1 = vldrhq_z_s32(input_1_vect, pred); + int32x4_t vect_2 = vldrhq_z_s32(input_2_vect, pred); + + vect_1 = vshlq_r_s32(vect_1, left_shift); + vect_2 = vshlq_r_s32(vect_2, left_shift); + + vect_1 = arm_requantize_mve(vect_1, input_1_mult, input_1_shift); + vect_2 = arm_requantize_mve(vect_2, input_2_mult, input_2_shift); + + vect_1 = vaddq_s32(vect_1, vect_2); + vect_1 = arm_requantize_mve(vect_1, out_mult, out_shift); + + vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min)); + vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max)); + + input_1_vect += 4; + input_2_vect += 4; + + vstrhq_p_s32(output, vect_1, pred); + + output += 4; + count -= 4; + } + +#else // #if defined(ARM_MATH_MVEI) int32_t input_1; int32_t input_2; int32_t sum; int32_t two_halfword_1, two_halfword_2; int16_t sum_1, sum_2; int32_t loop_count = block_size / 2; - while (loop_count > 0) { two_halfword_1 = arm_nn_read_q15x2_ia(&input_1_vect); @@ -127,10 +161,10 @@ arm_cmsis_nn_status arm_elementwise_add_s16(const int16_t *input_1_vect, /* Decrement loop counter */ loop_count--; } - +#endif // #if defined(ARM_MATH_MVEI) return (ARM_CMSIS_NN_SUCCESS); } /** * @} end of Doxygen group - */ + */ \ No newline at end of file diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s8.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s8.c index be222c12..e2d895b9 100644 --- a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s8.c +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. 
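/*
 * Editor's note (not part of the patch): the per-element math that the MVE
 * path added to arm_elementwise_add_s16 above implements, written as plain
 * scalar C. Each operand is left-shifted into extra headroom, rescaled to a
 * common Q-format, summed, requantized to the output scale and clamped.
 * The helper name is an illustrative assumption.
 */
static int16_t elementwise_add_ref(int16_t in1, int16_t in2,
                                   int32_t left_shift,
                                   int32_t mult1, int32_t shift1,
                                   int32_t mult2, int32_t shift2,
                                   int32_t out_mult, int32_t out_shift,
                                   int32_t act_min, int32_t act_max)
{
    int32_t a = arm_nn_requantize((int32_t)in1 << left_shift, mult1, shift1);
    int32_t b = arm_nn_requantize((int32_t)in2 << left_shift, mult2, shift2);
    int32_t sum = arm_nn_requantize(a + b, out_mult, out_shift);
    sum = MAX(sum, act_min);
    sum = MIN(sum, act_max);
    return (int16_t)sum;
}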
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_elementwise_add_s8 * Description: Elementwise add * - * $Date: 19 April 2022 - * $Revision: V.3.0.0 + * $Date: 5 January 2023 + * $Revision: V.3.1.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -109,7 +109,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, int32_t input_2; int32_t sum; -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) int32_t a_1, b_1, a_2, b_2; int32_t offset_1_packed, offset_2_packed; @@ -128,11 +128,11 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1); input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2); - a_1 = __SADD16(a_1, offset_1_packed); - b_1 = __SADD16(b_1, offset_1_packed); + a_1 = SADD16(a_1, offset_1_packed); + b_1 = SADD16(b_1, offset_1_packed); - a_2 = __SADD16(a_2, offset_2_packed); - b_2 = __SADD16(b_2, offset_2_packed); + a_2 = SADD16(a_2, offset_2_packed); + b_2 = SADD16(b_2, offset_2_packed); /* Sum 1 */ input_1 = (b_1 & 0x0FFFF) << left_shift; @@ -147,7 +147,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - r1 = (q7_t)sum; + r1 = (int8_t)sum; /* Sum 3 */ input_1 = ((b_1 >> 16) & 0x0FFFF) << left_shift; @@ -161,7 +161,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - r3 = (q7_t)sum; + r3 = (int8_t)sum; /* Sum 2 */ input_1 = (a_1 & 0x0FFFF) << left_shift; @@ -175,7 +175,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - r2 = (q7_t)sum; + r2 = (int8_t)sum; /* Sum 4 */ input_1 = ((a_1 >> 16) & 0x0FFFF) << left_shift; @@ -189,17 +189,17 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - r4 = (q7_t)sum; + r4 = (int8_t)sum; - arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4)); + arm_nn_write_s8x4_ia(&output, PACK_S8x4_32x1(r1, r2, r3, r4)); loop_count--; } loop_count = block_size & 0x3; -#else + #else loop_count = block_size; -#endif + #endif while (loop_count > 0) { @@ -218,7 +218,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - *output++ = (q7_t)sum; + *output++ = (int8_t)sum; /* Decrement loop counter */ loop_count--; diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16.c index 7a85d8a1..7315b9c3 100644 --- a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16.c +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_elementwise_mul_s16 * Description: Element wise multiplication * - * $Date: 10 May 2022 - * $Revision: V.2.1.0 + * $Date: 20 January 2023 + * $Revision: V.2.4.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -61,29 +61,66 @@ arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect, (void)input_1_offset; (void)input_2_offset; (void)out_offset; + int32_t loop_count; + +#if defined(ARM_MATH_MVEI) + + loop_count = block_size; + + while (loop_count > 0) + { + mve_pred16_t pred = vctp32q(loop_count); + + int32x4_t input_1 = vldrhq_z_s32(input_1_vect, pred); + int32x4_t input_2 = vldrhq_z_s32(input_2_vect, pred); + + int32x4_t res_0 = vmulq_s32(input_1, input_2); + + res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift)); + + res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min)); + res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max)); + + vstrhq_p_s32(output, res_0, pred); + input_1_vect += 4; + input_2_vect += 4; + + output += 4; + loop_count -= 4; + } + +#else int32_t input_1; int32_t input_2; int32_t mul_res; int32_t two_halfword_1, two_halfword_2; int16_t mul_1, mul_2; - int32_t loop_count = block_size / 2; + loop_count = block_size / 2; while (loop_count > 0) { two_halfword_1 = arm_nn_read_q15x2_ia(&input_1_vect); two_halfword_2 = arm_nn_read_q15x2_ia(&input_2_vect); + #if defined(ARM_MATH_DSP) + mul_res = SMULBB(two_halfword_1, two_halfword_2); + #else input_1 = (int16_t)(two_halfword_1 & 0xFFFF); input_2 = (int16_t)(two_halfword_2 & 0xFFFF); mul_res = input_1 * input_2; + #endif mul_res = arm_nn_requantize(mul_res, out_mult, out_shift); mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); mul_1 = (int16_t)mul_res; + #if defined(ARM_MATH_DSP) + mul_res = SMULTT(two_halfword_1, two_halfword_2); + #else input_1 = (int16_t)(two_halfword_1 >> 16); input_2 = (int16_t)(two_halfword_2 >> 16); mul_res = input_1 * input_2; + #endif mul_res = arm_nn_requantize(mul_res, out_mult, out_shift); mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); @@ -113,7 +150,7 @@ arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect, /* Decrement loop counter */ loop_count--; } - +#endif // #if defined(ARM_MATH_MVEI) return ARM_CMSIS_NN_SUCCESS; } diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16_s8.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16_s8.c new file mode 100644 index 00000000..16296849 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16_s8.c @@ -0,0 +1,122 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_elementwise_mul_s16_s8.c + * Description: Elementwise multiplication of 16 bit input with 8 bit output + * + * $Date: 20 January 2023 + * $Revision: V.1.2.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup BasicMath + * @{ + */ + +/* + * s16 elementwise multiplication with s8 output + * + * Refer header file for details. + * + */ +arm_cmsis_nn_status arm_elementwise_mul_s16_s8(const int16_t *input_1_vect, + const int16_t *input_2_vect, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t block_size) +{ + int32_t loop_count = block_size; + +#if defined(ARM_MATH_MVEI) + + while (loop_count > 0) + { + mve_pred16_t pred = vctp32q(loop_count); + + int32x4_t input_1 = vldrhq_z_s32(input_1_vect, pred); + int32x4_t input_2 = vldrhq_z_s32(input_2_vect, pred); + + int32x4_t res_0 = vmulq_s32(input_1, input_2); + + res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift)); + res_0 = vaddq_n_s32(res_0, out_offset); + + res_0 = vmaxq_s32(res_0, vdupq_n_s32(NN_Q7_MIN)); + res_0 = vminq_s32(res_0, vdupq_n_s32(NN_Q7_MAX)); + + vstrbq_p_s32(output, res_0, pred); + input_1_vect += 4; + input_2_vect += 4; + + output += 4; + loop_count -= 4; + } + +#else + #if defined(ARM_MATH_DSP) + + while (loop_count > 1) + { + int32_t input_1 = arm_nn_read_q15x2_ia(&input_1_vect); + int32_t input_2 = arm_nn_read_q15x2_ia(&input_2_vect); + + int32_t mul_res = SMULBB(input_1, input_2); + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + mul_res = CLAMP(mul_res, NN_Q7_MAX, NN_Q7_MIN); + int32_t mul = (int16_t)(mul_res & 0xFF); + + mul_res = SMULTT(input_1, input_2); + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + mul_res = CLAMP(mul_res, NN_Q7_MAX, NN_Q7_MIN); + mul |= (int16_t)mul_res << 8; + + arm_nn_write_s8x2_ia(&output, mul); + loop_count -= 2; + } + #endif + for (int i = 0; i < loop_count; i++) + { + /* C = A * B */ + int32_t mul_res = input_1_vect[i] * input_2_vect[i]; + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + + mul_res = CLAMP(mul_res, NN_Q7_MAX, NN_Q7_MIN); + + output[i] = (int8_t)mul_res; + } + +#endif + + return ARM_CMSIS_NN_SUCCESS; +} +/** + * @} end of BasicMath group + */ diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s8.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s8.c index 42a53997..484f214e 100644 --- a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s8.c +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_elementwise_mul_s8 * Description: Element wise multiplication * - * $Date: 4 Aug 2022 - * $Revision: V.2.0.1 + * $Date: 20 January 2023 + * $Revision: V.2.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile 
Architecture * * -------------------------------------------------------------------- */ @@ -97,7 +97,7 @@ arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect, int32_t input_2; int32_t mul_res; -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) int32_t a_1, b_1, a_2, b_2; int32_t offset_1_packed, offset_2_packed; @@ -116,62 +116,50 @@ arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect, input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1); input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2); - a_1 = __SADD16(a_1, offset_1_packed); - b_1 = __SADD16(b_1, offset_1_packed); + a_1 = SADD16(a_1, offset_1_packed); + b_1 = SADD16(b_1, offset_1_packed); - a_2 = __SADD16(a_2, offset_2_packed); - b_2 = __SADD16(b_2, offset_2_packed); + a_2 = SADD16(a_2, offset_2_packed); + b_2 = SADD16(b_2, offset_2_packed); /* Mul 1 */ - input_1 = (int16_t)(b_1 & 0x0FFFFL); - input_2 = (int16_t)(b_2 & 0x0FFFFL); - - mul_res = input_1 * input_2; + mul_res = SMULBB(b_1, b_2); mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - r1 = (q7_t)mul_res; + r1 = (int8_t)mul_res; /* Mul 3 */ - input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL); - input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL); - - mul_res = input_1 * input_2; + mul_res = SMULTT(b_1, b_2); mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - r3 = (q7_t)mul_res; + r3 = (int8_t)mul_res; /* Mul 2 */ - input_1 = (int16_t)(a_1 & 0x0FFFFL); - input_2 = (int16_t)(a_2 & 0x0FFFFL); - - mul_res = input_1 * input_2; + mul_res = SMULBB(a_1, a_2); mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - r2 = (q7_t)mul_res; + r2 = (int8_t)mul_res; /* Mul 4 */ - input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL); - input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL); - - mul_res = input_1 * input_2; + mul_res = SMULTT(a_1, a_2); mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - r4 = (q7_t)mul_res; + r4 = (int8_t)mul_res; - arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4)); + arm_nn_write_s8x4_ia(&output, PACK_S8x4_32x1(r1, r2, r3, r4)); loop_count--; } loop_count = block_size & 0x3; -#else + #else loop_count = block_size; -#endif + #endif while (loop_count > 0) { @@ -186,7 +174,7 @@ arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect, mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - *output++ = (q7_t)mul_res; + *output++ = (int8_t)mul_res; /* Decrement loop counter */ loop_count--; diff --git a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_w.c b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_w.c index ca1520d7..9ea9e02b 100644 --- a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_w.c +++ b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_w.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. 
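/*
 * Editor's note (not part of the patch): portable models of the SMULBB and
 * SMULTT intrinsics the rewrite above leans on. After read_and_pad_reordered()
 * each 32-bit register holds two s16 lanes; SMULBB multiplies the two bottom
 * lanes and SMULTT the two top lanes, replacing the manual mask-and-shift
 * code that was removed. Helper names are illustrative assumptions.
 */
static int32_t smulbb_ref(int32_t x, int32_t y)
{
    return (int32_t)(int16_t)x * (int16_t)y; /* bottom lane x bottom lane */
}

static int32_t smultt_ref(int32_t x, int32_t y)
{
    return (int32_t)(int16_t)(x >> 16) * (int16_t)(y >> 16); /* top lane x top lane */
}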
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_concatenation_s8_w.c * Description: s8 version of concatenation along the W axis * - * $Date: October 2019 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.1 * * Target Processor: Cortex-M cores * @@ -58,7 +58,7 @@ void arm_concatenation_s8_w(const int8_t *input, output += offset_w * (input_x * input_y * input_z); - arm_memcpy_q7(output, input, input_copy_size); + arm_memcpy_s8(output, input, input_copy_size); } /** diff --git a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_x.c b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_x.c index b316a73d..d02be297 100644 --- a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_x.c +++ b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_x.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_concatenation_s8_x.c * Description: s8 version of concatenation along the X axis * - * $Date: October 2019 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.2 * * Target Processor: Cortex-M cores * @@ -64,7 +64,7 @@ void arm_concatenation_s8_x(const int8_t *input, // Copy per row for (i = 0; i < num_iterations; ++i) { - arm_memcpy_q7(output, input, input_x); + arm_memcpy_s8(output, input, input_x); input += input_x; output += output_x; } diff --git a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_y.c b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_y.c index fa953003..78131fd1 100644 --- a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_y.c +++ b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_y.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_concatenation_s8_y.c * Description: s8 version of concatenation along the Y axis * - * $Date: October 2019 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.1 * * Target Processor: Cortex-M cores * @@ -65,7 +65,7 @@ void arm_concatenation_s8_y(const int8_t *input, // Copy per tile for (i = 0; i < num_iterations; ++i) { - arm_memcpy_q7(output, input, input_copy_size); + arm_memcpy_s8(output, input, input_copy_size); input += input_copy_size; output += output_stride; } diff --git a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_z.c b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_z.c index a13d5fbd..b742c3dd 100644 --- a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_z.c +++ b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_z.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_concatenation_s8_z.c * Description: s8 version of concatenation along the Z axis * - * $Date: October 2019 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.1 * * Target Processor: Cortex-M cores * @@ -64,7 +64,7 @@ void arm_concatenation_s8_z(const int8_t *input, for (i = 0; i < input_w; ++i) { - arm_memcpy_q7(output, input, input_copy_size); + arm_memcpy_s8(output, input, input_copy_size); input += input_copy_size; output += output_stride; } diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c index d17513a7..2e030ce5 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_convolve_1_x_n_s8.c * Description: s8 version of 1xN convolution using symmetric quantization. * - * $Date: 20 June 2022 - * $Revision: V.3.1.0 + * $Date: 30 January 2023 + * $Revision: V.3.3.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -51,13 +51,13 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { (void)bias_dims; arm_cmsis_nn_status status = ARM_CMSIS_NN_SUCCESS; @@ -101,15 +101,15 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, for (int i = 0; i < 4; i++) { const int32_t actual_kernel_len = ker_end_idx[i] - ker_begin_idx[i]; - arm_nn_mat_mul_core_1x_s8(actual_kernel_len * input_ch, - (kernel_x - actual_kernel_len) * input_ch, - input_data + input_begin_idx[i] * input_ch, - filter_data + (ker_begin_idx[i] * input_ch), - output_ch, - conv_params, - quant_params, - bias_data, - output_data); + status = arm_nn_mat_mul_core_1x_s8(actual_kernel_len * input_ch, + (kernel_x - actual_kernel_len) * input_ch, + input_data + input_begin_idx[i] * input_ch, + filter_data + (ker_begin_idx[i] * input_ch), + output_ch, + conv_params, + quant_params, + bias_data, + output_data); output_data += output_ch; } } @@ -125,7 +125,13 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, bias_data, output_data); } + + if (status != ARM_CMSIS_NN_SUCCESS || output_data == NULL) + { + return ARM_CMSIS_NN_NO_IMPL_ERROR; + } } + /* Advance to the next batch */ input_data += (input_x * input_ch); } @@ -149,17 +155,6 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, return status; } -int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if !defined(ARM_MATH_MVEI) - return arm_convolve_s8_get_buffer_size(input_dims, 
filter_dims); -#else - (void)input_dims; - (void)filter_dims; - return 0; -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8.c new file mode 100644 index 00000000..a31e23c3 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8.c @@ -0,0 +1,115 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_1x1_s8.c + * Description: Generic s8 version of 1x1 convolution + * + * $Date: 20 January 2023 + * $Revision: V.1.0.1 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup Public + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * A more generic version of s8 1x1 convolution intended for non-unity strides. This is slower + * than the _fast() version if used for unity stride values. + * + * Refer header file for details. 
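/*
 * Editor's note (not part of the patch): how the new arm_convolve_1x1_s8
 * below maps strided 1x1 convolution onto arm_nn_mat_mult_nt_t_s8. With
 * illustrative dims (16x16x8 input, stride 2, 8x8x4 output), each call
 * handles one output row, and the trailing rhs_cols * stride_w argument
 * tells the matmul how far apart consecutive input pixels are, which is
 * how the stride is realized without an im2col copy.
 */
static void conv_1x1_shapes_example(void)
{
    const cmsis_nn_dims in_d = {.n = 1, .h = 16, .w = 16, .c = 8};
    const cmsis_nn_dims out_d = {.n = 1, .h = 8, .w = 8, .c = 4};
    const int32_t lhs_rows = out_d.w;        /* 8 output pixels per call          */
    const int32_t rhs_cols = in_d.c;         /* 8 input channels per pixel        */
    const int32_t row_offset = rhs_cols * 2; /* stride_w = 2: skip every other pixel */
    (void)lhs_rows;
    (void)row_offset;
}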
+ * + */ +arm_cmsis_nn_status arm_convolve_1x1_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const int8_t *input_data, + const cmsis_nn_dims *filter_dims, + const int8_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + int8_t *output_data) +{ + (void)ctx; + (void)filter_dims; + (void)bias_dims; + if (conv_params->padding.w != 0 || conv_params->padding.h != 0) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + const int32_t lhs_rows = output_dims->w; + const int32_t rhs_rows = output_dims->c; + const int32_t rhs_cols = input_dims->c; + const int32_t stride_w = conv_params->stride.w; + const int32_t input_inc = input_dims->w * conv_params->stride.h * rhs_cols; + const int32_t output_inc = output_dims->w * rhs_rows; + const int32_t output_h = output_dims->h; + const int32_t batch = input_dims->n; + const int8_t *input_data_ref = input_data; + + for (int i_batch = 0; i_batch < batch; i_batch++) + { + input_data = input_data_ref + (i_batch * rhs_cols * input_dims->w * input_dims->h); + for (int i_output_h = 0; i_output_h < output_h; i_output_h++) + { + // Process one input row + arm_cmsis_nn_status result = arm_nn_mat_mult_nt_t_s8(input_data, + filter_data, + bias_data, + output_data, + quant_params->multiplier, + quant_params->shift, + lhs_rows, + rhs_rows, + rhs_cols, + conv_params->input_offset, + conv_params->output_offset, + conv_params->activation.min, + conv_params->activation.max, + rhs_cols * stride_w); + if (result != ARM_CMSIS_NN_SUCCESS) + { + return result; + } + input_data += input_inc; + output_data += output_inc; + } + } + + /* Return to application */ + return ARM_CMSIS_NN_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c index 741051d7..c6c6cad0 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -19,18 +19,17 @@ /* ---------------------------------------------------------------------- * Project: CMSIS NN Library * Title: arm_convolve_1x1_s8_fast.c - * Description: Fast q7 version of 1x1 convolution (non-square shape) + * Description: Fast s8 version of 1x1 convolution (non-square shape) * - * $Date: 20 june 2022 - * $Revision: V.3.0.1 + * $Date: 30 January 2023 + * $Revision: V.3.1.0 * - * Target Processor: Cortex-M Processors + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" -#include /** * @ingroup Public @@ -52,13 +51,13 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims 
*bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { if (conv_params->padding.w != 0 || conv_params->padding.h != 0 || conv_params->stride.w != 1 || conv_params->stride.h != 1) @@ -70,43 +69,6 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, (void)filter_dims; (void)bias_dims; -#if defined(ARM_MATH_MVEI) - - const int32_t col_len = input_dims->w * input_dims->h * input_dims->n; - const int32_t output_ch = output_dims->c; - const int32_t input_ch = input_dims->c; - - for (int i_items = 0; i_items <= (col_len - 4); i_items += 4) - { - output_data = arm_nn_mat_mul_core_4x_s8(input_ch, - input_ch, - input_data + i_items * input_ch, - filter_data, - output_ch, - conv_params, - quant_params, - bias_data, - output_data); - } - - /* Handle left over elements */ - for (int i_items = (col_len & ~0x3); i_items < col_len; i_items++) - { - arm_nn_mat_mul_core_1x_s8(input_ch, - 0, - input_data + i_items * input_ch, - filter_data, - output_ch, - conv_params, - quant_params, - bias_data, - output_data); - output_data += output_ch; - } - -#else - /* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */ - const int32_t lhs_rows = input_dims->w * input_dims->h * input_dims->n; const int32_t rhs_rows = output_dims->c; const int32_t rhs_cols = input_dims->c; @@ -123,20 +85,13 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, conv_params->input_offset, conv_params->output_offset, conv_params->activation.min, - conv_params->activation.max); - -#endif + conv_params->activation.max, + rhs_cols); /* Return to application */ return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims) -{ - (void)input_dims; - return 0; -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_fast_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_fast_s16.c index 13703f02..7819f720 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_fast_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_fast_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_convolve_fast_s16.c * Description: Optimized s16 version of convolution. 
* - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 30 January 2023 + * $Revision: V.2.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data) + int16_t *output_data) { (void)bias_dims; if (filter_dims->w * filter_dims->h * input_dims->c >= 512) @@ -70,7 +70,7 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q15_t *buffer_a = (q15_t *)ctx->buf; + int16_t *buffer_a = (int16_t *)ctx->buf; const int32_t input_batches = input_dims->n; const int32_t input_x = input_dims->w; @@ -96,8 +96,8 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* Generate two columns from the input tensor a GEMM computation */ - q15_t *two_column_buf = buffer_a; - q15_t *out = output_data; + int16_t *two_column_buf = buffer_a; + int16_t *out = output_data; /* This part implements the im2col function */ for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++) { @@ -112,13 +112,13 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) { /* Filling 0 for out-of-bound paddings */ - arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch); + arm_memset_s8((int8_t *)two_column_buf, 0, sizeof(int16_t) * input_ch); } else { - arm_memcpy_q7((q7_t *)two_column_buf, - (const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch), - input_ch * sizeof(q15_t)); + arm_memcpy_s8((int8_t *)two_column_buf, + (const int8_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch), + input_ch * sizeof(int16_t)); } two_column_buf += input_ch; } @@ -146,31 +146,31 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, /* Left-over because odd number of output pixels */ if (two_column_buf != buffer_a) { - const q7_t *ker_a = filter_data; + const int8_t *ker_a = filter_data; int i; for (i = 0; i < output_ch; i++) { /* Init the accumulator*/ - q31_t sum = 0; + int32_t sum = 0; /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */ - const q15_t *ip_as_col = buffer_a; + const int16_t *ip_as_col = buffer_a; /* 4 multiply and accumulates are done in one loop. 
*/ uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2; while (col_count) { - q31_t ker_a1, ker_a2; - q31_t ip_b1, ip_b2; + int32_t ker_a1, ker_a2; + int32_t ip_b1, ip_b2; ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2); ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col); - sum = __SMLAD(ker_a1, ip_b1, sum); + sum = SMLAD(ker_a1, ip_b1, sum); ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col); - sum = __SMLAD(ker_a2, ip_b2, sum); + sum = SMLAD(ker_a2, ip_b2, sum); col_count--; } @@ -178,15 +178,15 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, col_count = input_ch * kernel_y * kernel_x & 0x3; while (col_count) { - q7_t ker_a1 = *ker_a++; - q15_t ip_b1 = *ip_as_col++; + int8_t ker_a1 = *ker_a++; + int16_t ip_b1 = *ip_as_col++; sum += ker_a1 * ip_b1; col_count--; } if (bias_data) { - q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]); - q63_t acc_64 = sum + bias_data[i]; + int32_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]); + int64_t acc_64 = sum + bias_data[i]; sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]); } else @@ -195,7 +195,7 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, } sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - *out++ = (q15_t)sum; + *out++ = (int16_t)sum; } } #else @@ -225,17 +225,6 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); -#else - (void)input_dims; - (void)filter_dims; - return 0; -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s16.c new file mode 100644 index 00000000..ee286a36 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s16.c @@ -0,0 +1,120 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_get_buffer_sizes_s16.c + * Description: Collection of get buffer size functions for the various s16 convolution layer functions. 
+ * + * $Date: 30 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h" +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup NNConv + */ + +/** + * @addtogroup GetBufferSizeNNConv + * @{ + */ + +__STATIC_INLINE int32_t arm_convolve_fast_s16_get_buffer_size_dsp(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); +} + +int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return arm_convolve_fast_s16_get_buffer_size_dsp(input_dims, filter_dims); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ + (void)input_dims; + (void)filter_dims; + return 0; +} + +/* + * Get the required buffer size for arm_convolve_wrapper_s16. This is the recommended function convolve wrapper s16 + * function. + * + * Refer to header file for details. + * + */ +int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return arm_convolve_wrapper_s16_get_buffer_size_dsp(conv_params, input_dims, filter_dims, output_dims); +#else + (void)conv_params; + (void)output_dims; + + // MVE and scalar implementation have same buffer requirements + return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); +#endif +} + +int32_t arm_convolve_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)output_dims; + + if (filter_dims->w * filter_dims->h * input_dims->c < 512 && + (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_fast_s16_get_buffer_size_dsp(input_dims, filter_dims); + } + else + { + + return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); + } +} + +int32_t arm_convolve_wrapper_s16_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + return arm_convolve_wrapper_s16_get_buffer_size(conv_params, input_dims, filter_dims, output_dims); +} + +/** + * @} end of GetBufferSizeNNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c new file mode 100644 index 00000000..330622ac --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c @@ -0,0 +1,164 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_get_buffer_sizes_s8.c + * Description: Collection of get buffer size functions for the various s8 convolution layer functions. + * + * $Date: 31 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h" +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup NNConv + */ + +/** + * @addtogroup GetBufferSizeNNConv + * @{ + */ + +__STATIC_INLINE int32_t arm_convolve_s8_get_buffer_size_mve(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + int32_t col_length = input_dims->c * filter_dims->w * filter_dims->h; + // Get number of complete int16 lanes(multiple of 8) for given col_length. This is dependent on + // implementation of arm_nn_mat_mult_s8 + col_length = (col_length + 7) / 8; + // 4 -> number of im2col buffers, 8 -> 8 elements per Q register + return 4 * col_length * 8 * (int32_t)sizeof(int8_t); +} + +__STATIC_INLINE int32_t arm_convolve_1_x_n_s8_get_buffer_size_mve(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + (void)input_dims; + (void)filter_dims; + return 0; +} + +int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_MVEI) + return arm_convolve_s8_get_buffer_size_mve(input_dims, filter_dims); +#else + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); +#endif +} + +int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if !defined(ARM_MATH_MVEI) + return arm_convolve_s8_get_buffer_size(input_dims, filter_dims); +#else + return arm_convolve_1_x_n_s8_get_buffer_size_mve(input_dims, filter_dims); +#endif +} + +int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims) +{ + (void)input_dims; + return 0; +} + +/* + * Get the required buffer size for arm_convolve_wrapper_s8. This is the recommended function convolve wrapper s8 + * function. + * + * Refer to header file for details. 
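
The MVE sizing earlier in this file rounds the im2col column length up to whole 8-lane groups before multiplying out the four buffers; a standalone restatement of the same arithmetic (hypothetical helper name, not CMSIS-NN API):

#include <stdint.h>

/* Four im2col buffers, each padded to a multiple of 8 int8 lanes. */
static int32_t mve_s8_scratch_bytes(int32_t ch, int32_t kw, int32_t kh)
{
    int32_t col_length = ch * kw * kh;
    col_length = (col_length + 7) / 8; /* whole 8-lane groups */
    return 4 * col_length * 8 * (int32_t)sizeof(int8_t);
}

The rounding keeps every vector load in arm_nn_mat_mult_s8 inside the allocation even when the true column length is not a multiple of the lane count.
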
+ * + */ +int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ +#if defined(ARM_MATH_MVEI) + return arm_convolve_wrapper_s8_get_buffer_size_mve(conv_params, input_dims, filter_dims, output_dims); +#else + + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + if ((conv_params->stride.w == 1) && (conv_params->stride.h == 1)) + { + return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims); + } + else + { + return 0; + } + } + else if ((input_dims->h == 1) && (output_dims->w % 4 == 0) && (conv_params->dilation.w == 1) && + (filter_dims->h == 1)) + { + return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims); + } + else + { + return arm_convolve_s8_get_buffer_size(input_dims, filter_dims); + } +#endif +} + +int32_t arm_convolve_wrapper_s8_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + if ((conv_params->stride.w == 1) && (conv_params->stride.h == 1)) + { + return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims); + } + else + { + return 0; + } + } + else if ((input_dims->h == 1) && (output_dims->w % 4 == 0) && (conv_params->dilation.w == 1) && + (filter_dims->h == 1)) + { + return arm_convolve_1_x_n_s8_get_buffer_size_mve(input_dims, filter_dims); + } + else + { + return arm_convolve_s8_get_buffer_size_mve(input_dims, filter_dims); + } +} + +int32_t arm_convolve_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + return arm_convolve_wrapper_s8_get_buffer_size(conv_params, input_dims, filter_dims, output_dims); +} + +/** + * @} end of GetBufferSizeNNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s16.c index 2a895657..395a3561 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_convolve_s16.c * Description: s16 version of convolution using symmetric quantization. 
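
The s8 wrapper buffer-size dispatch above mirrors the kernel selection in arm_convolve_wrapper_s8; a condensed restatement of the rules (descriptive names only, not CMSIS-NN API):

#include <stdbool.h>

typedef enum { CONV_1X1_FAST, CONV_1X1, CONV_1_X_N, CONV_GENERIC } conv_path_t;

static conv_path_t pick_s8_path(int pad_w, int pad_h, int stride_w, int stride_h,
                                int dil_w, int dil_h, int filt_w, int filt_h,
                                int in_h, int out_w)
{
    bool pointwise = (pad_w == 0 && pad_h == 0 && filt_w == 1 && filt_h == 1 &&
                      dil_w == 1 && dil_h == 1);
    if (pointwise)
        return (stride_w == 1 && stride_h == 1) ? CONV_1X1_FAST : CONV_1X1;
    if (in_h == 1 && (out_w % 4) == 0 && dil_w == 1 && filt_h == 1)
        return CONV_1_X_N;
    return CONV_GENERIC;
}

Only the generic and 1x1-fast paths need scratch memory; the strided 1x1 and (on MVE) 1-x-n paths return a size of zero.
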
* - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 30 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data) + int16_t *output_data) { (void)bias_dims; (void)ctx; @@ -90,7 +90,7 @@ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ for (int32_t i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) { - const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i_out_ch]); + const int32_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i_out_ch]); for (int32_t base_idx_y = -pad_y, i_out_y = 0; i_out_y < output_y; base_idx_y += stride_y, i_out_y++) { @@ -144,13 +144,6 @@ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ - (void)input_dims; - (void)filter_dims; - return 0; -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s8.c index a35d0784..7c80bc00 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_convolve_s8.c * Description: s8 version of convolution using symmetric quantization. 
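
A rough plain-C model of the s16 requantization pair used in the reference loop above (an assumption based on the upstream helpers; arm_nnsupportfunctions.h holds the authoritative definitions):

#include <stdint.h>

/* Assumed shape of REDUCE_MULTIPLIER: fold the 32-bit fixed-point multiplier
 * down to 16 significant bits so the product with the 64-bit accumulator
 * cannot overflow. */
static int32_t reduce_multiplier_model(int32_t mult)
{
    return (mult < 0x7FFF0000) ? ((mult + (1 << 15)) >> 16) : 0x7FFF;
}

/* Assumed rounding shift-back, mirroring arm_nn_requantize_s64. */
static int32_t requantize_s64_model(int64_t acc, int32_t reduced_mult, int32_t shift)
{
    int32_t result = (int32_t)((acc * reduced_mult) >> (14 - shift));
    return (result + 1) >> 1; /* final shift with round-to-nearest */
}
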
* - * $Date: 19 April 2022 - * $Revision: V.3.0.0 + * $Date: 30 January 2023 + * $Revision: V.3.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { (void)bias_dims; @@ -66,7 +66,7 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q15_t *buffer_a = (q15_t *)ctx->buf; + int16_t *buffer_a = (int16_t *)ctx->buf; const int32_t input_batches = input_dims->n; const uint16_t input_x = input_dims->w; @@ -95,8 +95,8 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, { #if defined(ARM_MATH_MVEI) /* Generate upto four columns from the input tensor a GEMM computation */ - q7_t *im2col_buf = (q7_t *)buffer_a; - q7_t *out = output_data; + int8_t *im2col_buf = (int8_t *)buffer_a; + int8_t *out = output_data; int32_t buffer_fill_cnt = 0; int32_t padded = 0; const int32_t num_elem = kernel_x * kernel_y * input_ch; @@ -120,12 +120,12 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x) { - memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch); + memset(im2col_buf, (int8_t)-input_offset, sizeof(int8_t) * input_ch); padded = 1; } else { - arm_memcpy_q7(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch); + arm_memcpy_s8(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch); } im2col_buf += input_ch; } @@ -139,20 +139,20 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, buffer_fill_cnt = 0; out = arm_nn_mat_mul_core_4x_s8(num_elem, num_elem, - (q7_t *)buffer_a, + (int8_t *)buffer_a, filter_data, output_ch, conv_params, quant_params, bias_data, out); - im2col_buf = (q7_t *)buffer_a; + im2col_buf = (int8_t *)buffer_a; } else if (buffer_fill_cnt == 4 && (padded != 0)) { buffer_fill_cnt = 0; out = arm_nn_mat_mult_s8(filter_data, - (q7_t *)buffer_a, + (int8_t *)buffer_a, output_ch, 4, output_shift, @@ -166,16 +166,20 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, bias_data, out); - im2col_buf = (q7_t *)buffer_a; + im2col_buf = (int8_t *)buffer_a; padded = 0; } } + if (out == NULL) + { + return ARM_CMSIS_NN_NO_IMPL_ERROR; + } } /* Handle left over columns */ if (buffer_fill_cnt != 0) { out = arm_nn_mat_mult_s8(filter_data, - (q7_t *)buffer_a, + (int8_t *)buffer_a, output_ch, buffer_fill_cnt, output_shift, @@ -196,8 +200,8 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, int32_t i_out_y, i_out_x, i_ker_y, i_ker_x; /* Generate two columns from the input tensor a GEMM computation */ - q15_t *two_column_buf = buffer_a; - q7_t *out = output_data; + int16_t *two_column_buf = buffer_a; + int8_t *out = output_data; /* This part implements the im2col function */ for (i_out_y = 0; i_out_y < output_y; i_out_y++) @@ -217,7 +221,7 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x) { /* Filling 0 for out-of-bound 
paddings */ - memset(two_column_buf, 0, sizeof(q15_t) * input_ch); + memset(two_column_buf, 0, sizeof(int16_t) * input_ch); } else { @@ -253,48 +257,48 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, /* left-over because odd number of output pixels */ if (two_column_buf != buffer_a) { - const q7_t *ker_a = filter_data; + const int8_t *ker_a = filter_data; int i; for (i = 0; i < output_ch; i++) { /* Load the accumulator with bias first */ - q31_t sum = 0; + int32_t sum = 0; if (bias_data) { sum = bias_data[i]; } /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */ - const q15_t *ip_as_col = buffer_a; + const int16_t *ip_as_col = buffer_a; /* 4 multiply and accumulates are done in one loop. */ -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2; while (col_count) { - q31_t ker_a1, ker_a2; - q31_t ip_b1, ip_b2; + int32_t ker_a1, ker_a2; + int32_t ip_b1, ip_b2; ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2); ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col); - sum = __SMLAD(ker_a1, ip_b1, sum); + sum = SMLAD(ker_a1, ip_b1, sum); ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col); - sum = __SMLAD(ker_a2, ip_b2, sum); + sum = SMLAD(ker_a2, ip_b2, sum); col_count--; } /* Handle left over mac */ col_count = input_ch * kernel_y * kernel_x & 0x3; -#else + #else uint16_t col_count = input_ch * kernel_y * kernel_x; -#endif + #endif while (col_count) { - q7_t ker_a1 = *ker_a++; - q15_t ip_b1 = *ip_as_col++; + int8_t ker_a1 = *ker_a++; + int16_t ip_b1 = *ip_as_col++; sum += ker_a1 * ip_b1; col_count--; } @@ -303,7 +307,7 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - *out++ = (q7_t)sum; + *out++ = (int8_t)sum; } } #endif // #if defined(ARM_MATH_MVEI) @@ -316,20 +320,6 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if defined(ARM_MATH_MVEI) - int32_t col_length = input_dims->c * filter_dims->w * filter_dims->h; - // Get number of complete int16 lanes(multiple of 8) for given col_length. This is dependent on - // implementation of arm_nn_mat_mult_s8 - col_length = (col_length + 7) / 8; - // 4 -> number of im2col buffers, 8 -> 8 elements per Q register - return 4 * col_length * 8 * (int32_t)sizeof(int8_t); -#else - return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c index 357ef593..7f38d554 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in * cmsis-nn to perform the convolution. 
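
Why the two padding fills in arm_convolve_s8.c above differ (a sketch of the reasoning, hypothetical helper name): the MVE path stores raw int8 pixels, so padded positions must hold the zero point -input_offset, while the DSP im2col buffer stores offset-folded int16 values, where that same zero point widens to exactly 0.

#include <stdint.h>

/* Offset-folded widening as used when filling the int16 im2col buffer. */
static int16_t widen_with_offset(int8_t v, int32_t input_offset)
{
    return (int16_t)(v + input_offset); /* padding: v == -input_offset -> 0 */
}
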
* - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 30 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -51,13 +51,13 @@ arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data) + int16_t *output_data) { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) if (filter_dims->w * filter_dims->h * input_dims->c < 512 && @@ -104,27 +104,6 @@ arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx, #endif } -int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params, - const cmsis_nn_dims *input_dims, - const cmsis_nn_dims *filter_dims, - const cmsis_nn_dims *output_dims) -{ - (void)conv_params; - (void)output_dims; - -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - if (filter_dims->w * filter_dims->h * input_dims->c < 512 && - (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) - { - return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims); - } - - return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); -#else - return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c index 235a87e9..2735408c 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in * cmsis-nn to perform the convolution. 
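
The s16 wrapper dispatch above gates the fast kernel on the dot-product length; a condensed restatement (descriptive helper, not CMSIS-NN API). The fast s16 kernel keeps a 32-bit accumulator for int16 x int8 products, which is only safe while the per-output dot product stays short — informally, 512 terms of magnitude 2^15 * 2^7 reach the int32 limit, matching the ARM_CMSIS_NN_ARG_ERROR check in arm_convolve_fast_s16 itself.

#include <stdbool.h>

static bool use_fast_s16(int filt_w, int filt_h, int in_ch, int dil_w, int dil_h)
{
    return (filt_w * filt_h * in_ch) < 512 && dil_w == 1 && dil_h == 1;
}
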
* - * $Date: 4 August 2022 - * $Revision: V.2.1.1 + * $Date: 11 January 2023 + * $Revision: V.2.3.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -51,29 +51,45 @@ arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { - if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (conv_params->stride.w == 1) && - (conv_params->stride.h == 1) && (filter_dims->w == 1) && (filter_dims->h == 1) && - (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) { - return arm_convolve_1x1_s8_fast(ctx, - conv_params, - quant_params, - input_dims, - input_data, - filter_dims, - filter_data, - bias_dims, - bias_data, - output_dims, - output_data); + if ((conv_params->stride.w == 1) && (conv_params->stride.h == 1)) + { + return arm_convolve_1x1_s8_fast(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else + { + return arm_convolve_1x1_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } } else if ((input_dims->h == 1) && (output_dims->w % 4 == 0) && conv_params->dilation.w == 1 && (filter_dims->h == 1)) { @@ -105,28 +121,6 @@ arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, } } -int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, - const cmsis_nn_dims *input_dims, - const cmsis_nn_dims *filter_dims, - const cmsis_nn_dims *output_dims) -{ - if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (conv_params->stride.w == 1) && - (conv_params->stride.h == 1) && (filter_dims->w == 1) && (filter_dims->h == 1) && - (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) - { - return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims); - } - else if ((input_dims->h == 1) && (output_dims->w % 4 == 0) && (conv_params->dilation.w == 1) && - (filter_dims->h == 1)) - { - return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims); - } - else - { - return arm_convolve_s8_get_buffer_size(input_dims, filter_dims); - } -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c index 0893ea54..3d8e6c2a 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * 
Description: Optimized s8 depthwise convolution function for channel * multiplier of 1 and 3x3 kernel size. * - * $Date: 19 July 2022 - * $Revision: V.3.1.0 + * $Date: 5 January 2023 + * $Revision: V.3.2.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -53,13 +53,13 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { (void)ctx; (void)bias_dims; @@ -116,25 +116,82 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch; const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch; +#if defined(ARM_MATH_DSP) + const uint32_t lhs_offset_s16x2 = PKHBT(input_offset, input_offset, 16); for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) { int32_t in_val = 0; int32_t ker_val = 0; + int32_t in_val_1 = 0; + int32_t ker_val_1 = 0; if (ker_w_start == 0) { - in_val = arm_nn_read_q7x4(input_ptr); - ker_val = arm_nn_read_q7x4(kernel_ptr); + in_val = arm_nn_read_s8x4(input_ptr); + ker_val = arm_nn_read_s8x4(kernel_ptr); + in_val_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)in_val, 8); + ker_val_1 = SXTB16_RORn((uint32_t)ker_val, 8); + + out_buff1 = SMLABB(in_val_1, ker_val_1, out_buff1); + in_val = SXTAB16(lhs_offset_s16x2, (uint32_t)in_val); + out_buff3 = SMLATT(in_val_1, ker_val_1, out_buff3); + ker_val = SXTB16((uint32_t)ker_val); + out_buff0 = SMLABB(in_val, ker_val, out_buff0); + out_buff2 = SMLATT(in_val, ker_val, out_buff2); + } + + in_val = arm_nn_read_s8x4(input_ptr + input_ch); + ker_val = arm_nn_read_s8x4(kernel_ptr + input_ch); + in_val_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)in_val, 8); + ker_val_1 = SXTB16_RORn((uint32_t)ker_val, 8); + + out_buff1 = SMLABB(in_val_1, ker_val_1, out_buff1); + in_val = SXTAB16(lhs_offset_s16x2, (uint32_t)in_val); + out_buff3 = SMLATT(in_val_1, ker_val_1, out_buff3); + ker_val = SXTB16((uint32_t)ker_val); + out_buff0 = SMLABB(in_val, ker_val, out_buff0); + out_buff2 = SMLATT(in_val, ker_val, out_buff2); + + if ((input_x - in_w) >= 3) + { + in_val = arm_nn_read_s8x4(input_ptr + (input_ch << 1)); + ker_val = arm_nn_read_s8x4(kernel_ptr + (input_ch << 1)); + in_val_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)in_val, 8); + ker_val_1 = SXTB16_RORn((uint32_t)ker_val, 8); + + out_buff1 = SMLABB(in_val_1, ker_val_1, out_buff1); + in_val = SXTAB16(lhs_offset_s16x2, (uint32_t)in_val); + out_buff3 = SMLATT(in_val_1, ker_val_1, out_buff3); + ker_val = SXTB16((uint32_t)ker_val); + out_buff0 = SMLABB(in_val, ker_val, out_buff0); + out_buff2 = SMLATT(in_val, ker_val, out_buff2); + } + + input_ptr += (input_ch * input_x); + kernel_ptr += (input_ch * 3); + } + +#else + + for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) + { + int32_t in_val = 0; + int32_t ker_val = 0; + + if (ker_w_start == 0) + { + in_val = arm_nn_read_s8x4(input_ptr); + ker_val = arm_nn_read_s8x4(kernel_ptr); out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; out_buff1 += ((int8_t)(in_val >> 8) + 
input_offset) * (int8_t)(ker_val >> 8); out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); } - in_val = arm_nn_read_q7x4(input_ptr + input_ch); - ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch); + in_val = arm_nn_read_s8x4(input_ptr + input_ch); + ker_val = arm_nn_read_s8x4(kernel_ptr + input_ch); out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); @@ -143,8 +200,8 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, if ((input_x - in_w) >= 3) { - in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1)); - ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1)); + in_val = arm_nn_read_s8x4(input_ptr + (input_ch << 1)); + ker_val = arm_nn_read_s8x4(kernel_ptr + (input_ch << 1)); out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); @@ -155,6 +212,7 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, input_ptr += (input_ch * input_x); kernel_ptr += (input_ch * 3); } +#endif out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]); out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]); diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c index 354ee10d..5afe2c8a 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: Optimized s16 depthwise separable convolution function for * channel multiplier of 1. 
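
Portable models (assumptions, for illustration only) of the DSP ops in the new 3x3 depthwise path above: SXTAB16 sign-extends bytes 0 and 2 of one operand to halfwords and adds them to the packed halfwords of the other (the _RORn variant rotates first, picking bytes 1 and 3), and SMLABB/SMLATT multiply-accumulate the bottom or top halfword pairs.

#include <stdint.h>

static int32_t sxtab16_model(int32_t acc2, uint32_t x)
{
    int32_t lo = (int32_t)(int16_t)(acc2 & 0xFFFF) + (int8_t)(x & 0xFF);
    int32_t hi = (int32_t)(int16_t)(acc2 >> 16) + (int8_t)((x >> 16) & 0xFF);
    return (int32_t)(((uint32_t)lo & 0xFFFFu) | ((uint32_t)hi << 16));
}

static int32_t smlabb_model(int32_t x, int32_t y, int32_t acc)
{
    return acc + (int32_t)(int16_t)x * (int32_t)(int16_t)y;
}

Packing input_offset into both halfwords of lhs_offset_s16x2 up front lets SXTAB16 fold the offset into the widened pixels for free, so the inner loop needs no per-element additions.
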
* - * $Date: 6 July 2022 - * $Revision: V.1.1.0 + * $Date: 30 January 2023 + * $Revision: V.1.3.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input, + const int16_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int64_t *bias, const cmsis_nn_dims *output_dims, - q15_t *output) + int16_t *output) { const int32_t input_ch = input_dims->c; const int32_t output_ch = output_dims->c; @@ -96,9 +96,9 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, const int32_t output_y = output_dims->h; const int32_t output_activation_min = dw_conv_params->activation.min; const int32_t output_activation_max = dw_conv_params->activation.max; - q15_t *buffer_a = (q15_t *)ctx->buf; + int16_t *buffer_a = (int16_t *)ctx->buf; -#if defined(ARM_MATH_MVEI) + #if defined(ARM_MATH_MVEI) int16_t *lhs_buffer = buffer_a; int16_t *out = output; int buffer_count = 0; @@ -214,11 +214,11 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, } } -#else // ARM_MATH_DSP + #else // ARM_MATH_DSP /* Run the following code in cores using DSP extension */ - q15_t *const col_buffer_start = buffer_a; - q15_t *col_buffer = col_buffer_start; + int16_t *const col_buffer_start = buffer_a; + int16_t *col_buffer = col_buffer_start; const int64_t *const bias_start_pos = bias; const int32_t *const out_mult_start_pos = output_mult; const int32_t *const out_shift_start_pos = output_shift; @@ -244,7 +244,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, int32_t index = 0; if (ker_y_start != 0) { - memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t)); + memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(int16_t)); index += (kernel_x * input_ch) * ker_y_start; } @@ -258,13 +258,13 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, if (idx_x < 0 || idx_x >= input_x) { - memset(&col_buffer[index], 0, input_ch * sizeof(q15_t)); + memset(&col_buffer[index], 0, input_ch * sizeof(int16_t)); } else { arm_memcpy_q15(&col_buffer[index], input + (idx_y * input_x + idx_x) * input_ch, - input_ch * sizeof(q15_t)); + input_ch * sizeof(int16_t)); } index += input_ch; } @@ -273,7 +273,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, const int diff = kernel_y - ker_y_end; if (diff != 0) { - memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t)); + memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(int16_t)); } row_count = output_ch / 4; @@ -284,10 +284,10 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, while (row_count) { - q31_t sum_1 = 0; - q31_t sum_2 = 0; - q31_t sum_3 = 0; - q31_t sum_4 = 0; + int32_t sum_1 = 0; + int32_t sum_2 = 0; + int32_t sum_3 = 0; + int32_t sum_4 = 0; int32_t output_mult_1 = REDUCE_MULTIPLIER(output_mult[0]); int32_t output_mult_2 = REDUCE_MULTIPLIER(output_mult[1]); @@ -296,46 +296,46 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, output_mult += 4; uint16_t col_count = (kernel_x * kernel_y) / 2; - q15_t 
*col_pos = col_buffer_start + row_shift; - const q7_t *row_pos = kernel + row_shift; + int16_t *col_pos = col_buffer_start + row_shift; + const int8_t *row_pos = kernel + row_shift; row_shift += 4; while (col_count) { /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */ - q31_t row_a1, row_a2, row_b1, row_b2, col_a, row_c, col_b, col_c; + int32_t row_a1, row_a2, row_b1, row_b2, col_a, row_c, col_b, col_c; /* Read 4 weights */ - row_b1 = arm_nn_read_q7x4(row_pos); - row_a1 = arm_nn_read_q7x4(row_pos + input_ch); - col_a = arm_nn_read_q15x2(col_pos); - col_b = arm_nn_read_q15x2(col_pos + input_ch); + row_b1 = arm_nn_read_s8x4(row_pos); + row_a1 = arm_nn_read_s8x4(row_pos + input_ch); + col_a = arm_nn_read_s16x2(col_pos); + col_b = arm_nn_read_s16x2(col_pos + input_ch); - row_a2 = __SXTB16(row_b1); - row_b1 = __SXTB16(__ROR(row_b1, 8)); + row_a2 = SXTB16(row_b1); + row_b1 = SXTB16(ROR(row_b1, 8)); - row_b2 = __SXTB16(row_a1); - row_a1 = __SXTB16(__ROR(row_a1, 8)); + row_b2 = SXTB16(row_a1); + row_a1 = SXTB16(ROR(row_a1, 8)); - col_c = __PKHBT(col_b, col_a, 16); - col_a = __PKHTB(col_b, col_a, 16); - row_c = __PKHBT(row_b2, row_a2, 16); - sum_1 = __SMLAD(col_c, row_c, sum_1); + col_c = PKHBT(col_b, col_a, 16); + col_a = PKHTB(col_b, col_a, 16); + row_c = PKHBT(row_b2, row_a2, 16); + sum_1 = SMLAD(col_c, row_c, sum_1); - row_c = __PKHBT(row_b1, row_a1, 16); - sum_2 = __SMLAD(col_a, row_c, sum_2); + row_c = PKHBT(row_b1, row_a1, 16); + sum_2 = SMLAD(col_a, row_c, sum_2); - col_a = arm_nn_read_q15x2(col_pos + 2); - col_b = arm_nn_read_q15x2(col_pos + input_ch + 2); + col_a = arm_nn_read_s16x2(col_pos + 2); + col_b = arm_nn_read_s16x2(col_pos + input_ch + 2); - col_c = __PKHBT(col_b, col_a, 16); - col_a = __PKHTB(col_b, col_a, 16); - row_c = __PKHTB(row_a2, row_b2, 16); - sum_3 = __SMLAD(col_c, row_c, sum_3); + col_c = PKHBT(col_b, col_a, 16); + col_a = PKHTB(col_b, col_a, 16); + row_c = PKHTB(row_a2, row_b2, 16); + sum_3 = SMLAD(col_c, row_c, sum_3); - row_c = __PKHTB(row_a1, row_b1, 16); - sum_4 = __SMLAD(col_a, row_c, sum_4); + row_c = PKHTB(row_a1, row_b1, 16); + sum_4 = SMLAD(col_a, row_c, sum_4); row_pos += input_ch << 1; col_pos += input_ch << 1; @@ -372,22 +372,22 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, result = arm_nn_requantize_s64(acc_1, output_mult_1, *output_shift++); result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; result = arm_nn_requantize_s64(acc_2, output_mult_2, *output_shift++); result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; result = arm_nn_requantize_s64(acc_3, output_mult_3, *output_shift++); result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; result = arm_nn_requantize_s64(acc_4, output_mult_4, *output_shift++); result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; row_count--; } @@ -395,9 +395,9 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, row_count = output_ch & 0x3; while (row_count) { - q15_t *col_pos = col_buffer_start + row_shift; - const q7_t *row_pos = kernel + 
row_shift; - q31_t sum = 0; + int16_t *col_pos = col_buffer_start + row_shift; + const int8_t *row_pos = kernel + row_shift; + int32_t sum = 0; const uint16_t col_count = (kernel_x * kernel_y); row_shift += 1; @@ -414,7 +414,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, output_mult++; result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; row_count--; } @@ -426,7 +426,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, /* Advance to the next batch */ input += (input_x * input_y * input_ch); } -#endif + #endif #else /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ return arm_depthwise_conv_s16(ctx, @@ -446,22 +446,6 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if defined(ARM_MATH_DSP) -#if defined(ARM_MATH_MVEI) - /* The + 8 accounts for a worst case out of bounds read of the lhs buffers in the *_nt_t_* function. */ - return 4 * input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t) + 8; -#else // ARM_MATH_DSP - return input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t); -#endif -#else - (void)input_dims; - (void)filter_dims; - return 0; -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s16.c new file mode 100644 index 00000000..fb0b8e1b --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s16.c @@ -0,0 +1,123 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_get_buffer_sizes_s16.c + * Description: Collection of get buffer size functions for the various s16 convolution layer functions. + * + * $Date: 13 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup NNconv + */ + +/** + * @addtogroup GetBufferSizeNNConv + * @{ + */ +__STATIC_INLINE int32_t arm_depthwise_conv_fast_s16_get_buffer_size_mve(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + /* The + 8 accounts for a worst case out of bounds read of the lhs buffers in the *_nt_t_* function. 
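
A restatement of the sizing below (hypothetical helper name): four lhs buffers of (c * kw * kh) int16 values, plus 8 bytes of slack so the tail over-read described in this comment stays inside the allocation.

#include <stdint.h>

static int32_t dw_fast_s16_mve_scratch(int32_t c, int32_t kw, int32_t kh)
{
    return 4 * c * kw * kh * (int32_t)sizeof(int16_t) + 8;
}
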
*/ + return 4 * input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t) + 8; +} + +__STATIC_INLINE int32_t arm_depthwise_conv_fast_s16_get_buffer_size_dsp(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + return input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t); +} + +int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_MVEI) + return arm_depthwise_conv_fast_s16_get_buffer_size_mve(input_dims, filter_dims); + #else // ARM_MATH_DSP + return arm_depthwise_conv_fast_s16_get_buffer_size_dsp(input_dims, filter_dims); + #endif +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)output_dims; + + int32_t size = 0; + + if (USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)) + { + size = arm_depthwise_conv_fast_s16_get_buffer_size(input_dims, filter_dims); + } + + return size; +} + +int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)output_dims; + + int32_t size = 0; + + if (USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)) + { + size = arm_depthwise_conv_fast_s16_get_buffer_size_mve(input_dims, filter_dims); + } + + return size; +} + +int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)output_dims; + + int32_t size = 0; + + if (USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)) + { + size = arm_depthwise_conv_fast_s16_get_buffer_size_dsp(input_dims, filter_dims); + } + + return size; +} + +/** + * @} end of GetBufferSizeNNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c new file mode 100644 index 00000000..a462c53d --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c @@ -0,0 +1,131 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_get_buffer_sizes_s8.c + * Description: Collection of get buffer size functions for the various s8 convolution layer functions. 
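
The USE_FAST_DW_CONV_S16_FUNCTION gate used by the three s16 wrapper sizers above now lives in arm_nnsupportfunctions.h. Judging by the local USE_FAST_DW_CONV_FUNCTION macro this patch removes from arm_depthwise_conv_wrapper_s16.c (see further below), its shape is assumed to be:

/* Assumed to match the removed local macro; not copied from the header. */
#define USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims) \
    ((dw_conv_params)->ch_mult == 1 && (dw_conv_params)->dilation.w == 1 &&    \
     (dw_conv_params)->dilation.h == 1 &&                                      \
     (filter_dims)->w * (filter_dims)->h * (input_dims)->c < 512)
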
+ * + * $Date: 20 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup NNConv + */ + +/** + * @addtogroup GetBufferSizeNNConv + * @{ + */ + +__STATIC_INLINE int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + (void)input_dims; + return (4 * CH_IN_BLOCK_MVE * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int8_t); +} + +__STATIC_INLINE int32_t arm_depthwise_conv_s8_opt_get_buffer_size_dsp(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t); +} + +int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_MVEI) + return arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims); +#elif defined(ARM_MATH_DSP) + return arm_depthwise_conv_s8_opt_get_buffer_size_dsp(input_dims, filter_dims); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { +#if !defined(ARM_MATH_MVEI) + if (filter_dims->w == 3 && filter_dims->h == 3 && dw_conv_params->padding.h <= 1 && + dw_conv_params->padding.w <= 1) + { + return size; + } +#endif + size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims); + } + + return size; +} + +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { + if (filter_dims->w == 3 && filter_dims->h == 3 && dw_conv_params->padding.h <= 1 && + dw_conv_params->padding.w <= 1) + { + return size; + } + size = arm_depthwise_conv_s8_opt_get_buffer_size_dsp(input_dims, filter_dims); + } + + return size; +} + +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { + size = arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims); + } + + return size; +} + +/** + * @} end of GetBufferSizeNNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c index 33161d21..6587b877 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_depthwise_conv_s16.c * Description: s16 version of depthwise convolution. * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 26 October 2022 + * $Revision: V.2.0.1 * * Target Processor: Cortex-M CPUs * @@ -169,7 +169,7 @@ static void depthwise_conv_s16_generic_s16(const int16_t *input, { const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; - const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[idx_out_ch]); + const int32_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[idx_out_ch]); int64_t acc_0 = 0; int ker_y_start; @@ -245,13 +245,13 @@ arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input, + const int16_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int64_t *bias, const cmsis_nn_dims *output_dims, - q15_t *output) + int16_t *output) { const uint16_t dilation_x = dw_conv_params->dilation.w; const uint16_t dilation_y = dw_conv_params->dilation.h; diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c index d019be96..0c67079b 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c @@ -21,8 +21,8 @@ * Title: arm_depthwise_conv_s8.c * Description: s8 version of depthwise convolution. * - * $Date: 29 July 2022 - * $Revision: V.3.0.3 + * $Date: 26 October 2022 + * $Revision: V.3.0.4 * * Target Processor: Cortex-M CPUs * @@ -151,12 +151,12 @@ depthwise_conv_s8_mult_4(const int8_t *input, } } -static void depthwise_conv_s8_generic(const q7_t *input, +static void depthwise_conv_s8_generic(const int8_t *input, const uint16_t input_batches, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, - const q7_t *kernel, + const int8_t *kernel, const uint16_t output_ch, const uint16_t ch_mult, const uint16_t kernel_x, @@ -166,7 +166,7 @@ static void depthwise_conv_s8_generic(const q7_t *input, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, - q7_t *output, + int8_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, @@ -274,13 +274,13 @@ arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { const uint16_t dilation_x = dw_conv_params->dilation.w; const uint16_t dilation_y = dw_conv_params->dilation.h; diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c index c3659c80..572a6b02 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c @@ 
-1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: Optimized s8 depthwise separable convolution function for * channel multiplier of 1. * - * $Date: 27 July 2022 - * $Revision: V.3.1.0 + * $Date: 30 January 2023 + * $Revision: V.3.3.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { const int32_t input_ch = input_dims->c; @@ -92,12 +92,12 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, const int32_t input_offset = dw_conv_params->input_offset; const int32_t output_activation_min = dw_conv_params->activation.min; const int32_t output_activation_max = dw_conv_params->activation.max; - q15_t *buffer_a = (q15_t *)ctx->buf; + int16_t *buffer_a = (int16_t *)ctx->buf; -#ifdef ARM_MATH_MVEI + #ifdef ARM_MATH_MVEI /* Generate two columns from the input tensor */ - q7_t *lhs_buffer = (q7_t *)buffer_a; - q7_t *out = output; + int8_t *lhs_buffer = (int8_t *)buffer_a; + int8_t *out = output; int padded = 0; int buffer_count = 0; const int32_t kernel_size = kernel_x * kernel_y; @@ -122,12 +122,12 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, { if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) { - arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)active_ch); + arm_memset_s8(lhs_buffer, (int8_t)-input_offset, (uint32_t)active_ch); padded = 1; } else { - arm_memcpy_q7(lhs_buffer, + arm_memcpy_s8(lhs_buffer, input_slice + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)active_ch); } @@ -139,7 +139,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, if (buffer_count == 4) { const int32_t block_offset = i_ch * CH_IN_BLOCK_MVE; - lhs_buffer = (q7_t *)buffer_a; + lhs_buffer = (int8_t *)buffer_a; if (padded == 0) { arm_nn_depthwise_conv_nt_t_s8(lhs_buffer, @@ -179,7 +179,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, } } /* Handle left over buffers */ - lhs_buffer = (q7_t *)buffer_a; + lhs_buffer = (int8_t *)buffer_a; int8_t *out_base = out; for (int i_buf = 0; i_buf < buffer_count; i_buf++) @@ -228,13 +228,13 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, remaining_ch -= CH_IN_BLOCK_MVE; } -#else // ARM_MATH_DSP + #else // ARM_MATH_DSP /* Run the following code in cores using DSP extension */ - q15_t *const col_buffer_start = buffer_a; - q15_t *col_buffer = col_buffer_start; + int16_t *const col_buffer_start = buffer_a; + int16_t *col_buffer = col_buffer_start; const int32_t *const bias_start_pos = bias; - const q31_t *const out_mult_start_pos = output_mult; - const q31_t *const out_shift_start_pos = output_shift; + const int32_t *const out_mult_start_pos = output_mult; + const int32_t *const out_shift_start_pos = output_shift; uint16_t 
row_count; uint16_t row_shift; @@ -254,7 +254,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, int32_t index = 0; if (ker_y_start != 0) { - memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t)); + memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(int16_t)); index += (kernel_x * input_ch) * ker_y_start; } @@ -267,11 +267,11 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, const int32_t idx_x = base_idx_x + i_ker_x; if (idx_x < 0 || idx_x >= input_x) { - memset(&col_buffer[index], 0, input_ch * sizeof(q15_t)); + memset(&col_buffer[index], 0, input_ch * sizeof(int16_t)); } else { - arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch, + arm_q7_to_q15_with_offset((int8_t *)input + (idx_y * input_x + idx_x) * input_ch, &col_buffer[index], input_ch, input_offset); @@ -283,7 +283,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, const int diff = kernel_y - ker_y_end; if (diff != 0) { - memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t)); + memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(int16_t)); } row_count = output_ch / 4; @@ -294,10 +294,10 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, while (row_count) { - q31_t sum = 0; - q31_t sum_2 = 0; - q31_t sum_3 = 0; - q31_t sum_4 = 0; + int32_t sum = 0; + int32_t sum_2 = 0; + int32_t sum_3 = 0; + int32_t sum_4 = 0; if (bias) { sum = *bias++; @@ -307,8 +307,8 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, } uint16_t col_count = (kernel_x * kernel_y) / 2; - q15_t *col_pos = col_buffer_start + row_shift; - const q7_t *row_pos = kernel + row_shift; + int16_t *col_pos = col_buffer_start + row_shift; + const int8_t *row_pos = kernel + row_shift; row_shift += 4; while (col_count) @@ -316,37 +316,37 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */ /* Note: variable names can be improved here to align with rows and columns. 
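
Portable models (assumptions, for illustration) of the two packing ops used just below to interleave weight and column halfwords: PKHBT keeps the bottom halfword of x and the top halfword of (y << sh); PKHTB keeps the top halfword of x and the bottom halfword of (y >> sh). The unsigned shifts here agree with the intrinsics for the 16-bit shift amounts this kernel uses.

#include <stdint.h>

static uint32_t pkhbt_model(uint32_t x, uint32_t y, uint32_t sh)
{
    return (x & 0x0000FFFFu) | ((y << sh) & 0xFFFF0000u);
}

static uint32_t pkhtb_model(uint32_t x, uint32_t y, uint32_t sh)
{
    return (x & 0xFFFF0000u) | ((y >> sh) & 0x0000FFFFu);
}
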
*/ - q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c; + int32_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c; /* Read 4 weights */ - ip_b1 = arm_nn_read_q7x4(row_pos); - ip_a1 = arm_nn_read_q7x4(row_pos + input_ch); - op_a = arm_nn_read_q15x2(col_pos); - op_b = arm_nn_read_q15x2(col_pos + input_ch); + ip_b1 = arm_nn_read_s8x4(row_pos); + ip_a1 = arm_nn_read_s8x4(row_pos + input_ch); + op_a = arm_nn_read_s16x2(col_pos); + op_b = arm_nn_read_s16x2(col_pos + input_ch); - ip_a2 = __SXTB16(ip_b1); - ip_b1 = __SXTB16(__ROR(ip_b1, 8)); + ip_a2 = SXTB16(ip_b1); + ip_b1 = SXTB16(ROR(ip_b1, 8)); - ip_b2 = __SXTB16(ip_a1); - ip_a1 = __SXTB16(__ROR(ip_a1, 8)); + ip_b2 = SXTB16(ip_a1); + ip_a1 = SXTB16(ROR(ip_a1, 8)); - op_c = __PKHBT(op_b, op_a, 16); - op_a = __PKHTB(op_b, op_a, 16); - op_b = __PKHBT(ip_b2, ip_a2, 16); - sum = __SMLAD(op_c, op_b, sum); + op_c = PKHBT(op_b, op_a, 16); + op_a = PKHTB(op_b, op_a, 16); + op_b = PKHBT(ip_b2, ip_a2, 16); + sum = SMLAD(op_c, op_b, sum); - op_b = __PKHBT(ip_b1, ip_a1, 16); - sum_2 = __SMLAD(op_a, op_b, sum_2); + op_b = PKHBT(ip_b1, ip_a1, 16); + sum_2 = SMLAD(op_a, op_b, sum_2); - op_a = arm_nn_read_q15x2(col_pos + 2); - op_b = arm_nn_read_q15x2(col_pos + input_ch + 2); + op_a = arm_nn_read_s16x2(col_pos + 2); + op_b = arm_nn_read_s16x2(col_pos + input_ch + 2); - op_c = __PKHBT(op_b, op_a, 16); - op_a = __PKHTB(op_b, op_a, 16); - op_b = __PKHTB(ip_a2, ip_b2, 16); - sum_3 = __SMLAD(op_c, op_b, sum_3); + op_c = PKHBT(op_b, op_a, 16); + op_a = PKHTB(op_b, op_a, 16); + op_b = PKHTB(ip_a2, ip_b2, 16); + sum_3 = SMLAD(op_c, op_b, sum_3); - op_b = __PKHTB(ip_a1, ip_b1, 16); - sum_4 = __SMLAD(op_a, op_b, sum_4); + op_b = PKHTB(ip_a1, ip_b1, 16); + sum_4 = SMLAD(op_a, op_b, sum_4); row_pos += input_ch << 1; col_pos += input_ch << 1; @@ -370,24 +370,24 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, sum += output_offset; sum = MAX(sum, output_activation_min); sum = MIN(sum, output_activation_max); - *output++ = (q7_t)sum; + *output++ = (int8_t)sum; sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++); sum_2 += output_offset; sum_2 = MAX(sum_2, output_activation_min); sum_2 = MIN(sum_2, output_activation_max); - *output++ = (q7_t)sum_2; + *output++ = (int8_t)sum_2; sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++); sum_3 += output_offset; sum_3 = MAX(sum_3, output_activation_min); sum_3 = MIN(sum_3, output_activation_max); - *output++ = (q7_t)sum_3; + *output++ = (int8_t)sum_3; sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++); sum_4 += output_offset; sum_4 = MAX(sum_4, output_activation_min); sum_4 = MIN(sum_4, output_activation_max); - *output++ = (q7_t)sum_4; + *output++ = (int8_t)sum_4; row_count--; } @@ -395,9 +395,9 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, row_count = output_ch & 0x3; while (row_count) { - q15_t *col_pos = col_buffer_start + row_shift; - const q7_t *row_pos = kernel + row_shift; - q31_t sum = 0; + int16_t *col_pos = col_buffer_start + row_shift; + const int8_t *row_pos = kernel + row_shift; + int32_t sum = 0; if (bias) { sum = *bias++; @@ -413,7 +413,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, sum += output_offset; sum = MAX(sum, output_activation_min); sum = MIN(sum, output_activation_max); - *output++ = (q7_t)sum; + *output++ = (int8_t)sum; row_count--; } @@ -422,7 +422,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, col_buffer = col_buffer_start; } } -#endif + 
#endif #else /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ return arm_depthwise_conv_s8(ctx, @@ -442,20 +442,6 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if defined(ARM_MATH_MVEI) - (void)input_dims; - return (4 * CH_IN_BLOCK_MVE * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int8_t); -#elif defined(ARM_MATH_DSP) - return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t); -#else - (void)input_dims; - (void)filter_dims; - return 0; -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c index bc42de10..8a2ff210 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,20 +22,15 @@ * Description: Wrapper API to select appropriate depthwise conv API based * on dimensions. * - * $Date: 6 July 2022 - * $Revision: V.1.0.1 + * $Date: 20 January 2023 + * $Revision: V.1.1.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" - - -#define USE_FAST_DW_CONV_FUNCTION(dw_conv_params, filter_dims, input_dims) \ - (dw_conv_params->ch_mult == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1 && \ - filter_dims->w * filter_dims->h * input_dims->c < 512) - +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" /** * @ingroup Public @@ -56,17 +51,17 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input, + const int16_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *filter, + const int8_t *filter, const cmsis_nn_dims *bias_dims, const int64_t *bias, const cmsis_nn_dims *output_dims, - q15_t *output) + int16_t *output) { arm_cmsis_nn_status status = ARM_CMSIS_NN_SUCCESS; - if (USE_FAST_DW_CONV_FUNCTION(dw_conv_params, filter_dims, input_dims)) + if (USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)) { status = arm_depthwise_conv_fast_s16(ctx, dw_conv_params, @@ -99,25 +94,6 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx, return status; } -int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, - const cmsis_nn_dims *input_dims, - const cmsis_nn_dims *filter_dims, - const cmsis_nn_dims *output_dims) -{ - (void)dw_conv_params; - (void)input_dims; - (void)filter_dims; - (void)output_dims; - int32_t size = 0; - - if (USE_FAST_DW_CONV_FUNCTION(dw_conv_params, filter_dims, input_dims)) - { - size = arm_depthwise_conv_fast_s16_get_buffer_size(input_dims, filter_dims); - } - - return size; -} - /** * @} end of NNConv group */ diff --git 
a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c index 157aa92f..0107f757 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: Wrapper API to select appropriate depthwise conv API based * on dimensions. * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 13 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -50,21 +50,21 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *filter, + const int8_t *filter, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { arm_cmsis_nn_status status = ARM_CMSIS_NN_SUCCESS; if (1 == dw_conv_params->ch_mult && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1) { #if !defined(ARM_MATH_MVEI) - if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1) && - (dw_conv_params->padding.w <= 1)) + if (filter_dims->w == 3 && filter_dims->h == 3 && dw_conv_params->padding.h <= 1 && + dw_conv_params->padding.w <= 1) { status = arm_depthwise_conv_3x3_s8(ctx, dw_conv_params, @@ -113,23 +113,6 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx, return status; } -int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, - const cmsis_nn_dims *input_dims, - const cmsis_nn_dims *filter_dims, - const cmsis_nn_dims *output_dims) -{ - (void)dw_conv_params; - int32_t size = 0; - - if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && - dw_conv_params->dilation.h == 1) - { - size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims); - } - - return size; -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c index f4c9e386..341f1571 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_nn_depthwise_conv_s8_core.c * Description: Depthwise convolution on im2col buffers. * - * $Date: 09. 
October 2020 - * $Revision: V.1.0.4 + * $Date: 26 October 2022 + * $Revision: V.1.0.5 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ @@ -37,17 +37,17 @@ * */ -q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, - const q15_t *col, - const uint16_t num_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int32_t out_offset, - const int32_t activation_min, - const int32_t activation_max, - const uint16_t kernel_size, - const int32_t *const output_bias, - q7_t *out) +int8_t *arm_nn_depthwise_conv_s8_core(const int8_t *row, + const int16_t *col, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t kernel_size, + const int32_t *const output_bias, + int8_t *out) { #if defined(ARM_MATH_MVEI) int32_t ch_per_loop = num_ch / 4; diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c index 5c27bf59..8e1708d2 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_mat_mult_kernel_s8_s16.c * Description: Matrix-multiplication function for convolution * - * $Date: 14. December 2021 - * $Revision: V.1.1.0 + * $Date: 5 January 2023 + * $Revision: V.1.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" @@ -37,39 +37,39 @@ * */ -q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, - const q15_t *input_b, - const uint16_t output_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int32_t out_offset, - const int16_t activation_min, - const int16_t activation_max, - const uint16_t num_col_a, - const int32_t *const output_bias, - q7_t *out_0) +int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t *input_a, + const int16_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + int8_t *out_0) { #if !defined(ARM_MATH_MVEI) /* set up the second output pointers */ - q7_t *out_1 = out_0 + output_ch; + int8_t *out_1 = out_0 + output_ch; const int32_t *bias = output_bias; uint16_t row_count = output_ch / 2; - const q7_t *ip_a0 = input_a; + const int8_t *ip_a0 = input_a; /* this loop over rows in A */ while (row_count) { /* setup pointers for B */ - const q15_t *ip_b0 = input_b; - const q15_t *ip_b1 = ip_b0 + num_col_a; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 + num_col_a; /* align the second pointer for A */ - const q7_t *ip_a1 = ip_a0 + num_col_a; + const int8_t *ip_a1 = ip_a0 + num_col_a; - q31_t ch_0_out_0 = 0; - q31_t ch_0_out_1 = 0; - q31_t ch_1_out_0 = 0; - q31_t ch_1_out_1 = 0; + int32_t ch_0_out_0 = 0; + int32_t ch_0_out_1 = 0; + int32_t ch_1_out_0 = 0; + int32_t ch_1_out_1 = 0; /* Init 
accumulator with bias for channel N and N + 1 */ if (bias) { @@ -79,43 +79,43 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_1_out_1 = *bias++; } -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) uint16_t col_count = num_col_a / 4; /* accumulate over the vector */ while (col_count) { - q31_t a01, a02, a11, a12; - q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); - q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + int32_t a01, a02, a11, a12; + int32_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + int32_t b1 = arm_nn_read_q15x2_ia(&ip_b1); ip_a0 = read_and_pad(ip_a0, &a01, &a02); ip_a1 = read_and_pad(ip_a1, &a11, &a12); - ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); - ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); - ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + ch_0_out_0 = SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = SMLAD(a11, b1, ch_1_out_1); b0 = arm_nn_read_q15x2_ia(&ip_b0); b1 = arm_nn_read_q15x2_ia(&ip_b1); - ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); - ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); - ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + ch_0_out_0 = SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = SMLAD(a12, b1, ch_1_out_1); col_count--; } /* while over col_count */ col_count = num_col_a & 0x3; -#else + #else uint16_t col_count = num_col_a; -#endif + #endif while (col_count) { - q7_t a0 = *ip_a0++; - q15_t b0 = *ip_b0++; - q7_t a1 = *ip_a1++; - q15_t b1 = *ip_b1++; + int8_t a0 = *ip_a0++; + int16_t b0 = *ip_b0++; + int8_t a1 = *ip_a1++; + int16_t b1 = *ip_b1++; ch_0_out_0 += a0 * b0; ch_0_out_1 += a0 * b1; @@ -128,13 +128,13 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_0_out_0 += out_offset; ch_0_out_0 = MAX(ch_0_out_0, activation_min); ch_0_out_0 = MIN(ch_0_out_0, activation_max); - *out_0++ = (q7_t)ch_0_out_0; + *out_0++ = (int8_t)ch_0_out_0; ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); ch_0_out_1 += out_offset; ch_0_out_1 = MAX(ch_0_out_1, activation_min); ch_0_out_1 = MIN(ch_0_out_1, activation_max); - *out_1++ = (q7_t)ch_0_out_1; + *out_1++ = (int8_t)ch_0_out_1; out_mult++; out_shift++; @@ -142,13 +142,13 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_1_out_0 += out_offset; ch_1_out_0 = MAX(ch_1_out_0, activation_min); ch_1_out_0 = MIN(ch_1_out_0, activation_max); - *out_0++ = (q7_t)ch_1_out_0; + *out_0++ = (int8_t)ch_1_out_0; ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift); ch_1_out_1 += out_offset; ch_1_out_1 = MAX(ch_1_out_1, activation_min); ch_1_out_1 = MIN(ch_1_out_1, activation_max); - *out_1++ = (q7_t)ch_1_out_1; + *out_1++ = (int8_t)ch_1_out_1; out_mult++; out_shift++; @@ -161,11 +161,11 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, if (output_ch & 0x1) { /* setup pointers for B */ - const q15_t *ip_b0 = input_b; - const q15_t *ip_b1 = ip_b0 + num_col_a; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 + num_col_a; - q31_t ch_0_out_0 = 0; - q31_t ch_0_out_1 = 0; + int32_t ch_0_out_0 = 0; + int32_t ch_0_out_1 = 0; /* load the bias */ if (bias) @@ -174,35 +174,35 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_0_out_1 = *bias++; } -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) uint16_t col_count = num_col_a >> 2; while (col_count) { - q31_t a01, a02; - q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); - q31_t b1 = 
arm_nn_read_q15x2_ia(&ip_b1); + int32_t a01, a02; + int32_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + int32_t b1 = arm_nn_read_q15x2_ia(&ip_b1); ip_a0 = read_and_pad(ip_a0, &a01, &a02); - ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_0_out_0 = SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a01, b1, ch_0_out_1); b0 = arm_nn_read_q15x2_ia(&ip_b0); b1 = arm_nn_read_q15x2_ia(&ip_b1); - ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_0_out_0 = SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a02, b1, ch_0_out_1); col_count--; } col_count = num_col_a & 0x3; -#else + #else uint16_t col_count = num_col_a; -#endif + #endif while (col_count) { - q7_t a0 = *ip_a0++; - q15_t b0 = *ip_b0++; - q15_t b1 = *ip_b1++; + int8_t a0 = *ip_a0++; + int16_t b0 = *ip_b0++; + int16_t b1 = *ip_b1++; ch_0_out_0 += a0 * b0; ch_0_out_1 += a0 * b1; @@ -212,13 +212,13 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_0_out_0 += out_offset; ch_0_out_0 = MAX(ch_0_out_0, activation_min); ch_0_out_0 = MIN(ch_0_out_0, activation_max); - *out_0++ = (q7_t)ch_0_out_0; + *out_0++ = (int8_t)ch_0_out_0; ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); ch_0_out_1 += out_offset; ch_0_out_1 = MAX(ch_0_out_1, activation_min); ch_0_out_1 = MIN(ch_0_out_1, activation_max); - *out_1++ = (q7_t)ch_0_out_1; + *out_1++ = (int8_t)ch_0_out_1; out_mult++; out_shift++; } diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c index add72484..06b89a9c 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c @@ -21,8 +21,8 @@ * Title: arm_nn_mat_mult_s8.c * Description: General Matrix-multiplication function * - * $Date: 16 August 2022 - * $Revision: V.2.0.7 + * $Date: 26 October 2022 + * $Revision: V.2.0.8 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ @@ -36,20 +36,20 @@ * */ -q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, - const q7_t *input_col, - const uint16_t output_ch, - const uint16_t col_batches, - const int32_t *output_shift, - const int32_t *output_mult, - const int32_t out_offset, - const int32_t col_offset, - const int32_t row_offset, - const int16_t activation_min, - const int16_t activation_max, - const uint16_t row_len, - const int32_t *const bias, - q7_t *out) +int8_t *arm_nn_mat_mult_s8(const int8_t *input_row, + const int8_t *input_col, + const uint16_t output_ch, + const uint16_t col_batches, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t out_offset, + const int32_t col_offset, + const int32_t row_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t row_len, + const int32_t *const bias, + int8_t *out) { #if defined(ARM_MATH_MVEI) (void)row_offset; @@ -153,7 +153,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, acc_0 += out_offset; acc_0 = MAX(acc_0, activation_min); acc_0 = MIN(acc_0, activation_max); - out[i_out_ch] = (q7_t)acc_0; + out[i_out_ch] = (int8_t)acc_0; } out += output_ch; } diff --git a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s16.c b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s16.c new file mode 100644 index 00000000..44baf6b6 --- /dev/null +++ 
b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s16.c @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_get_buffer_sizes_s16.c + * Description: Collection of get buffer size functions for fully connected s16 layer function. + * + * $Date: 30 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup FC + */ + +/** + * @addtogroup GetBufferSizeFC + * @{ + */ + +int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims) +{ + (void)filter_dims; + return 0; +} + +int32_t arm_fully_connected_s16_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims) +{ + return arm_fully_connected_s16_get_buffer_size(filter_dims); +} + +int32_t arm_fully_connected_s16_get_buffer_size_mve(const cmsis_nn_dims *filter_dims) +{ + return arm_fully_connected_s16_get_buffer_size(filter_dims); +} + +/** + * @} end of GetBufferSizeFC group + */ diff --git a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c new file mode 100644 index 00000000..15ff9b2e --- /dev/null +++ b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_get_buffer_sizes_s8.c + * Description: Collection of get buffer size functions for fully connected s8 layer function. 
+ * + * $Date: 31 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup FC + */ + +/** + * @addtogroup GetBufferSizeFC + * @{ + */ + +int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) +{ + (void)filter_dims; + return 0; +} + +int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims) +{ + return arm_fully_connected_s8_get_buffer_size(filter_dims); +} + +int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims) +{ + return arm_fully_connected_s8_get_buffer_size(filter_dims); +} + +/** + * @} end of GetBufferSizeFC group + */ diff --git a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s16.c b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s16.c index 8bd428db..f67efc59 100644 --- a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s16.c +++ b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_fully_connected_s16 * Description: Fully connected function compatible with TF Lite. * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 13 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M and Cortex-A cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -50,13 +50,13 @@ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, const cmsis_nn_fc_params *fc_params, const cmsis_nn_per_tensor_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input, + const int16_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int64_t *bias, const cmsis_nn_dims *output_dims, - q15_t *output) + int16_t *output) { (void)bias_dims; (void)ctx; @@ -64,7 +64,7 @@ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, int32_t batch_cnt = input_dims->n; - const q31_t reduced_multiplier = REDUCE_MULTIPLIER(quant_params->multiplier); + const int32_t reduced_multiplier = REDUCE_MULTIPLIER(quant_params->multiplier); while (batch_cnt) { @@ -86,12 +86,6 @@ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, return (ARM_CMSIS_NN_SUCCESS); } -int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims) -{ - (void)filter_dims; - return 0; -} - /** * @} end of FC group */ diff --git a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s8.c index bc05a8cc..84ce9d76 100644 --- a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s8.c +++ b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_fully_connected_s8 * Description: Fully connected function compatible with TF Lite. * - * $Date: 19 April 2022 - * $Revision: V.4.0.0 + * $Date: 13 January 2023 + * $Revision: V.5.1.0 * - * Target Processor: Cortex-M and Cortex-A cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -51,13 +51,13 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, const cmsis_nn_fc_params *fc_params, const cmsis_nn_per_tensor_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { (void)bias_dims; (void)ctx; @@ -72,7 +72,6 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, bias, output, fc_params->input_offset, - 0, fc_params->output_offset, quant_params->multiplier, quant_params->shift, @@ -88,12 +87,6 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, return (ARM_CMSIS_NN_SUCCESS); } -int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) -{ - (void)filter_dims; - return 0; -} - /** * @} end of FC group */ diff --git a/src/third_party/cmsis_nn/Source/LSTMFunctions/arm_lstm_unidirectional_s8_s16.c b/src/third_party/cmsis_nn/Source/LSTMFunctions/arm_lstm_unidirectional_s8_s16.c new file mode 100644 index 00000000..58a0141d --- /dev/null +++ b/src/third_party/cmsis_nn/Source/LSTMFunctions/arm_lstm_unidirectional_s8_s16.c @@ -0,0 +1,184 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_lstm_unidirectional_s16_s8.c + * Description: S8 LSTM function with S16 gate output + * + * $Date: 4 November 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M processors + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup Public + */ + +/** + * @addtogroup LSTM + * @{ + */ + +/* + * S8 LSTM function for TensorFlow Lite with S16 gate output + * + * Refer to header file for details. + * + */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/* + * LSTM unidirectional function with 8 bit input and output and 16 bit weights + * + * Refer header file for details. 
+ * + */ +arm_cmsis_nn_status arm_lstm_unidirectional_s16_s8(cmsis_nn_lstm_context *scratch_buffers, + const int8_t *input_data, + const cmsis_nn_lstm_dims *lstm_dims, + const int8_t *in_to_in_weights, + const int8_t *in_to_forget_weights, + const int8_t *in_to_cell_weights, + const int8_t *in_to_out_weights, + const int8_t *recurrent_to_in_weights, + const int8_t *recurrent_to_forget_weights, + const int8_t *recurrent_to_cell_weights, + const int8_t *recurrent_to_out_weights, + const int16_t *cell_to_in_weights, + const int16_t *cell_to_forget_weights, + const int16_t *cell_to_out_weights, + const int8_t *projection_weights, + const cmsis_nn_lstm_params *lstm, + int8_t *output_state, + int16_t *cell_state, + int8_t *output_data) +{ + (void)cell_to_in_weights; + (void)cell_to_forget_weights; + (void)cell_to_out_weights; + + const int32_t num_batch = lstm_dims->num_batches; + const int32_t num_input = lstm_dims->num_inputs; + const int32_t max_time = lstm_dims->max_time; + + const int32_t num_output = lstm_dims->num_outputs; + const int32_t out_batch_leading_dim = num_output; + + // num_cell = num_output is considered in the code under the assumption that projection is NULL. + const int32_t num_cell = num_output; + + if (projection_weights != NULL) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + if (lstm->i2f_effective_bias == NULL || lstm->i2c_effective_bias == NULL || lstm->i2o_effective_bias == NULL) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + if (lstm->r2f_effective_bias == NULL || lstm->r2c_effective_bias == NULL || lstm->r2o_effective_bias == NULL) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + if (lstm->i2i_effective_bias == NULL || lstm->r2i_effective_bias == NULL) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + if (lstm->time_major) + { + const int32_t in_step = num_batch * num_input; + const int32_t out_step = num_batch * out_batch_leading_dim; + for (int i_max_time = 0; i_max_time < max_time; i_max_time++) + { + arm_cmsis_nn_status status = arm_nn_lstm_step_s8_s16(input_data + i_max_time * in_step, + in_to_in_weights, + in_to_forget_weights, + in_to_cell_weights, + in_to_out_weights, + recurrent_to_in_weights, + recurrent_to_forget_weights, + recurrent_to_cell_weights, + recurrent_to_out_weights, + lstm, + num_batch, + num_cell, + num_input, + num_output, + output_state, + cell_state, + output_data + i_max_time * out_step, + scratch_buffers); + if (status != ARM_CMSIS_NN_SUCCESS) + { + return status; + } + } + } + else + { + for (int i_num_batch = 0; i_num_batch < num_batch; i_num_batch++) + { + const int32_t in_step = num_input; + const int32_t out_step = out_batch_leading_dim; + for (int i_max_time = 0; i_max_time < max_time; i_max_time++) + { + const int32_t time_offset = i_num_batch * max_time + i_max_time; + + arm_cmsis_nn_status status = arm_nn_lstm_step_s8_s16(input_data + time_offset * in_step, + in_to_in_weights, + in_to_forget_weights, + in_to_cell_weights, + in_to_out_weights, + recurrent_to_in_weights, + recurrent_to_forget_weights, + recurrent_to_cell_weights, + recurrent_to_out_weights, + lstm, + /*num_batch=*/1, + num_cell, + num_input, + num_output, + output_state + i_num_batch * out_batch_leading_dim, + cell_state + i_num_batch * num_cell, + output_data + time_offset * out_step, + scratch_buffers); + if (status != ARM_CMSIS_NN_SUCCESS) + { + return status; + } + } + } + } + + return ARM_CMSIS_NN_SUCCESS; +} + +/** + * @} end of LSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c 
b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c index 263ff780..c75694b2 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c @@ -21,8 +21,8 @@ * Title: arm_nn_depthwise_conv_nt_t_padded_s8.c * Description: Depthwise convolution with padded matrices. * - * $Date: 27. July 2022 - * $Revision: V.2.0.0 + * $Date: 26 October 2022 + * $Revision: V.2.0.1 * * Target Processor: Cortex-M processors with MVE extension * -------------------------------------------------------------------- */ @@ -53,8 +53,8 @@ * */ -arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, - const q7_t *rhs, +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const int8_t *lhs, + const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, @@ -65,7 +65,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, - q7_t *out) + int8_t *out) { #if defined(ARM_MATH_MVEI) int32_t loop_count = (active_ch + 3) / 4; diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c index 7bcff790..b623b896 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c @@ -21,8 +21,8 @@ * Title: arm_nn_depthwise_conv_nt_t_s16.c * Description: Depthwise convolution on matrices with no padding. * - * $Date: 6 July 2022 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.1 * * Target Processor: Cortex-M processors with MVE extension * -------------------------------------------------------------------- */ @@ -45,7 +45,7 @@ * */ int16_t *arm_nn_depthwise_conv_nt_t_s16(const int16_t *lhs, - const q7_t *rhs, + const int8_t *rhs, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c index 30700631..6ec09708 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c @@ -21,8 +21,8 @@ * Title: arm_nn_depthwise_conv_nt_t_s8.c * Description: Depthwise convolution on matrices with no padding. * - * $Date: 27. July 2022 - * $Revision: V.2.0.0 + * $Date: 26 October 2022 + * $Revision: V.2.0.1 * * Target Processor: Cortex-M processors with MVE extension. * -------------------------------------------------------------------- */ @@ -44,8 +44,8 @@ * Refer header file for details. 
* */ -arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, - const q7_t *rhs, +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const int8_t *lhs, + const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, @@ -56,7 +56,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, - q7_t *out) + int8_t *out) { #if defined(ARM_MATH_MVEI) const int32_t *bias = output_bias; diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s8_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s8_s16.c new file mode 100644 index 00000000..020dd197 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s8_s16.c @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_calculate_gate_s8_s16.c + * Description: Update single gate for an incremental step of LSTM function. + * + * $Date: 8 September 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nn_tables.h" +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @defgroup supportLSTM LSTM + * + * Support functions for LSTM + * + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Calculates a single LSTM gate, int8x8_16 version. 
+ * Refer to header file for details + */ +void arm_nn_lstm_calculate_gate_s8_s16(const int8_t *input, + const int8_t *input_to_gate_weights, + const int32_t *input_to_gate_bias, + const cmsis_nn_scaling input_to_gate_scaling, + const int8_t *output_state, + const int8_t *recurrent_to_gate_weights, + const int32_t *recurrent_to_gate_bias, + const cmsis_nn_scaling recurrent_to_gate, + const int32_t n_batch, + const int32_t n_input, + const int32_t n_output, + const int32_t n_cell, + const arm_nn_activation_type activation_type, + int16_t *gate) +{ + const int32_t n_block = n_batch * n_cell; + + memset(gate, 0, n_block * sizeof(int16_t)); + arm_nn_vec_mat_mul_result_acc_s8(input, + input_to_gate_weights, + input_to_gate_bias, + gate, + 0, + input_to_gate_scaling.multiplier, + input_to_gate_scaling.shift, + n_input, + n_cell, + n_batch); + + arm_nn_vec_mat_mul_result_acc_s8(output_state, + recurrent_to_gate_weights, + recurrent_to_gate_bias, + gate, + 0, + recurrent_to_gate.multiplier, + recurrent_to_gate.shift, + n_output, + n_cell, + n_batch); + + arm_nn_activation_s16(gate, gate, n_block, 0, activation_type); +} +/** + * @} end of supportLSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_step_s8_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_step_s8_s16.c new file mode 100644 index 00000000..e4fd5b8b --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_step_s8_s16.c @@ -0,0 +1,154 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_step_s8_s16.c + * Description: Update LSTM function for a single iteration step. + * + * $Date: 9 February 2023 + * $Revision: V.1.1.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Calculate the output state tensor of an LSTM step, s8 input/output and s16 weight version. + * Refer to header file for details. 
+ */ +arm_cmsis_nn_status arm_nn_lstm_step_s8_s16(const int8_t *input, + const int8_t *input_to_input_weight, + const int8_t *input_to_forget_weight, + const int8_t *input_to_cell_weight, + const int8_t *input_to_output_weight, + const int8_t *recurrent_to_input_weight, + const int8_t *recurrent_to_forget_weight, + const int8_t *recurrent_to_cell_weight, + const int8_t *recurrent_to_output_weight, + const cmsis_nn_lstm_params *lstm, + const int n_batch, + const int n_cell, + const int n_input, + const int n_output, + int8_t *output_state, + int16_t *cell_state, + int8_t *output, + cmsis_nn_lstm_context *scratch_buffers) +{ + const int32_t n_block = n_batch * n_cell; + + // Calculate the input gate + arm_nn_lstm_calculate_gate_s8_s16(input, + input_to_input_weight, + lstm->i2i_effective_bias, + lstm->input_to_input_scaling, + output_state, + recurrent_to_input_weight, + lstm->r2i_effective_bias, + lstm->recurrent_to_input_scaling, + n_batch, + n_input, + n_output, + n_cell, + ARM_SIGMOID, + scratch_buffers->input_gate); + + // Calculate the forget gate + arm_nn_lstm_calculate_gate_s8_s16(input, + input_to_forget_weight, + lstm->i2f_effective_bias, + lstm->input_to_forget_scaling, + output_state, + recurrent_to_forget_weight, + lstm->r2f_effective_bias, + lstm->recurrent_to_forget_scaling, + n_batch, + n_input, + n_output, + n_cell, + ARM_SIGMOID, + scratch_buffers->forget_gate); + + // Calculate the cell update gate + arm_nn_lstm_calculate_gate_s8_s16(input, + input_to_cell_weight, + lstm->i2c_effective_bias, + lstm->input_to_cell_scaling, + output_state, + recurrent_to_cell_weight, + lstm->r2c_effective_bias, + lstm->recurrent_to_cell_scaling, + n_batch, + n_input, + n_output, + n_cell, + ARM_TANH, + scratch_buffers->cell_gate); + + // Update the cell state + arm_nn_lstm_update_cell_state_s16(n_block, + lstm->cell_state_shift, + cell_state, + scratch_buffers->input_gate, + scratch_buffers->forget_gate, + scratch_buffers->cell_gate); + + // Calculate the output gate + arm_nn_lstm_calculate_gate_s8_s16(input, + input_to_output_weight, + lstm->i2o_effective_bias, + lstm->input_to_output_scaling, + output_state, + recurrent_to_output_weight, + lstm->r2o_effective_bias, + lstm->recurrent_to_output_scaling, + n_batch, + n_input, + n_output, + n_cell, + ARM_SIGMOID, + scratch_buffers->output_gate); + + // Update the output state + arm_nn_lstm_update_output_s8_s16(n_batch, + n_cell, + cell_state, + lstm->cell_state_shift, + scratch_buffers->output_gate, + lstm->hidden_scaling, + lstm->hidden_offset, + output_state, + scratch_buffers->input_gate); + + arm_memcpy_s8(output, output_state, n_batch * n_output * sizeof(int8_t)); + + return ARM_CMSIS_NN_SUCCESS; +} +/** + * @} end of supportLSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_cell_state_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_cell_state_s16.c new file mode 100644 index 00000000..4a81c288 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_cell_state_s16.c @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_update_cell_state_s16.c + * Description: Update cell state for an incremental step of LSTM function. + * + * $Date: 20 January 2023 + * $Revision: V.1.2.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Update cell state for a single LSTM iteration step, int8x8_16 version. + * + * Refer to header file for more details + */ +void arm_nn_lstm_update_cell_state_s16(const int32_t n_block, + const int32_t cell_state_scale, + int16_t *cell_state, + const int16_t *input_gate, + const int16_t *forget_gate, + const int16_t *cell_gate) +{ + const int32_t cell_scale = 30 + cell_state_scale; + int32_t loop_count = n_block; + +#if defined(ARM_MATH_MVEI) + + while (loop_count > 0) + { + mve_pred16_t p = vctp32q(loop_count); + loop_count -= 4; + + int32x4_t res_1 = vmulq_s32(vldrhq_z_s32(cell_state, p), vldrhq_z_s32(forget_gate, p)); + forget_gate += 4; + res_1 = arm_divide_by_power_of_two_mve(res_1, 15); + int32x4_t res_2 = vmulq_s32(vldrhq_z_s32(input_gate, p), vldrhq_z_s32(cell_gate, p)); + input_gate += 4; + cell_gate += 4; + + res_2 = arm_divide_by_power_of_two_mve(res_2, cell_scale); + res_1 += res_2; + + res_1 = vmaxq_s32(res_1, vdupq_n_s32(NN_Q15_MIN)); + res_1 = vminq_s32(res_1, vdupq_n_s32(NN_Q15_MAX)); + + vstrhq_p_s32(cell_state, res_1, p); + cell_state += 4; + } +#else + #if defined(ARM_MATH_DSP) + while (loop_count > 1) + { + int32_t cell_state_01 = arm_nn_read_s16x2(cell_state); + int32_t forget_gate_01 = arm_nn_read_q15x2_ia(&forget_gate); + + int32_t value_00 = SMULBB(cell_state_01, forget_gate_01); + int32_t value_01 = SMULTT(cell_state_01, forget_gate_01); + value_00 = arm_nn_divide_by_power_of_two(value_00, 15); + value_01 = arm_nn_divide_by_power_of_two(value_01, 15); + + int32_t input_gate_01 = arm_nn_read_q15x2_ia(&input_gate); + int32_t cell_gate_01 = arm_nn_read_q15x2_ia(&cell_gate); + + int32_t value_10 = SMULBB(input_gate_01, cell_gate_01); + int32_t value_11 = SMULTT(input_gate_01, cell_gate_01); + + value_10 = arm_nn_divide_by_power_of_two(value_10, cell_scale); + value_11 = arm_nn_divide_by_power_of_two(value_11, cell_scale); + + value_00 += value_10; + value_01 += value_11; + + value_00 = CLAMP(value_00, NN_Q15_MAX, NN_Q15_MIN); + value_01 = CLAMP(value_01, NN_Q15_MAX, NN_Q15_MIN); + + arm_nn_write_q15x2_ia(&cell_state, PACK_Q15x2_32x1(value_00, value_01)); + loop_count -= 2; + } + #endif + for (int i = 0; i < loop_count; i++) + { + int32_t value = cell_state[i] * forget_gate[i]; + int32_t value_1 = input_gate[i] * cell_gate[i]; + + value = arm_nn_divide_by_power_of_two(value, 15); + value_1 = arm_nn_divide_by_power_of_two(value_1, cell_scale); + + cell_state[i] = CLAMP(value + value_1, NN_Q15_MAX, NN_Q15_MIN); + } +#endif // #if defined(ARM_MATH_MVEI) +} +/** + * @} end of supportLSTM group + */ 
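[Editor's note] The scalar tail of arm_nn_lstm_update_cell_state_s16 above combines two Q15 products at different scales: cell_state * forget_gate is renormalized by 2^15, input_gate * cell_gate by 2^(30 + cell_state_scale), and the saturated sum is written back as Q15. A minimal plain-C sketch of that arithmetic follows for readers tracing the fixed-point math; the names divide_by_power_of_two_ref and cell_state_update_ref are illustrative stand-ins, not CMSIS-NN API, and the rounding shown approximates arm_nn_divide_by_power_of_two (tie handling for negative values may differ marginally).

#include <stdint.h>

#define NN_Q15_MAX 32767
#define NN_Q15_MIN -32768

/* Rounding arithmetic shift right (assumes exponent >= 1). */
static int32_t divide_by_power_of_two_ref(int32_t value, int32_t exponent)
{
    return (value + (1 << (exponent - 1))) >> exponent;
}

static void cell_state_update_ref(int32_t n_block,
                                  int32_t cell_state_scale,
                                  int16_t *cell_state,
                                  const int16_t *input_gate,
                                  const int16_t *forget_gate,
                                  const int16_t *cell_gate)
{
    /* cell_state_scale is a (typically negative) power-of-two exponent,
       so cell_scale = 30 + cell_state_scale stays >= 1 in practice. */
    const int32_t cell_scale = 30 + cell_state_scale;

    for (int32_t i = 0; i < n_block; i++)
    {
        /* Fraction of the previous state kept by the forget gate. */
        int32_t keep = divide_by_power_of_two_ref(cell_state[i] * forget_gate[i], 15);
        /* New candidate admitted by the input gate, brought to the same scale. */
        int32_t update = divide_by_power_of_two_ref(input_gate[i] * cell_gate[i], cell_scale);
        int32_t result = keep + update;

        result = result > NN_Q15_MAX ? NN_Q15_MAX : result;
        result = result < NN_Q15_MIN ? NN_Q15_MIN : result;
        cell_state[i] = (int16_t)result;
    }
}

The MVE and DSP paths in the file above correspond to this loop unrolled four and two elements at a time, respectively.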
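[Editor's note] Many hunks in this sync rename the __SMLAD/__SXTB16/__PKHBT intrinsic spellings to SMLAD/SXTB16/PKHBT; the underlying instruction is unchanged. SMLAD performs two signed 16 x 16 multiplies on the packed halfwords of its operands and adds both products to the accumulator, which is why the convolution kernels in this sync pack pairs of Q15 values with PKHBT/PKHTB before each MAC. A plain-C behavioural model, illustrative only (assumes 32-bit int and ignores the instruction's Q-flag overflow signalling):

#include <stdint.h>

/* SMLAD(x, y, acc) = acc + lo16(x) * lo16(y) + hi16(x) * hi16(y),
   with all four halfwords treated as signed 16-bit values. */
static int32_t smlad_ref(int32_t x, int32_t y, int32_t acc)
{
    int32_t x_lo = (int16_t)(x & 0xFFFF);
    int32_t x_hi = (int16_t)(((uint32_t)x) >> 16);
    int32_t y_lo = (int16_t)(y & 0xFFFF);
    int32_t y_hi = (int16_t)(((uint32_t)y) >> 16);

    return acc + x_lo * y_lo + x_hi * y_hi;
}

With b0 holding two consecutive int16 inputs and a01 holding two weights sign-extended from int8 by read_and_pad, one SMLAD therefore retires two multiply-accumulates; the four-accumulator loops in these kernels retire eight MACs per iteration, as the in-code comments note.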
diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_output_s8_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_output_s8_s16.c new file mode 100644 index 00000000..3367e3f2 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_output_s8_s16.c @@ -0,0 +1,81 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_update_output_s8_s16.c + * Description: Update output gate for an incremental step of LSTM function. + * + * $Date: 13 February 2023 + * $Revision: V.2.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Calculate the output state tensor of an LSTM step, s8 input/output and s16 weight version. + * Refer to header files for details + */ +void arm_nn_lstm_update_output_s8_s16(const int n_batch, + const int n_cell, + int16_t *cell_state, + const int32_t cell_state_scale, + const int16_t *output_gate, + const cmsis_nn_scaling hidden_scaling, + const int32_t hidden_offset, + int8_t *output_state, + int16_t *cell_gate_scratch) +{ + const int32_t size = n_batch * n_cell; + + int32_t tanh_input_left_shift = (15 + cell_state_scale) - 3; + if (tanh_input_left_shift < 0) + { + tanh_input_left_shift = -tanh_input_left_shift; + for (int32_t i = 0; i < size; i++) + { + cell_state[i] = cell_state[i] >> tanh_input_left_shift; + } + tanh_input_left_shift = 0; + } + arm_nn_activation_s16(cell_state, cell_gate_scratch, size, tanh_input_left_shift, ARM_TANH); + + arm_elementwise_mul_s16_s8(output_gate, + cell_gate_scratch, + output_state, + hidden_offset, + hidden_scaling.multiplier, + hidden_scaling.shift, + size); +} +/** + * @} end of supportLSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c index 5b227d4c..03dc7ab6 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_mat_mul_core_1x_s8.c * Description: General Matrix-multiplication function * - * $Date: 22 Aug 2022 - * $Revision: V.3.1.0 + * $Date: 20 January 2023 + * $Revision: V.3.1.3 * - * 
Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" @@ -70,27 +70,28 @@ arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, int32_t sum_tmp = 0; -#if defined(ARM_MATH_AUTOVECTORIZE) + #if defined(ARM_MATH_AUTOVECTORIZE) for (int j = 0; j < row_elements; j++) { int32_t col = col_base[j]; sum_tmp += col; acc_n0 += row_base[j] * col; } -#else - __ASM volatile(" vldrb.8 q0, [%[col]], #16 \n" - " wlstp.8 lr, %[cnt], 1f \n" + #else + __ASM volatile(" .p2align 2 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " wlstp.8 lr, %[cnt], 1f \n" "2: \n" - " vaddva.s8 %[sum], q0 \n" - " vldrb.8 q1, [%[row0]], #16 \n" - " vmladava.s8 %[out0], q0, q1 \n" - " vldrb.8 q0, [%[col]], #16 \n" - " letp lr, 2b \n" + " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], #16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " letp lr, 2b \n" "1: \n" : [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(row_base), [out0] "+Te"(acc_n0) : [cnt] "r"(row_elements) : "q0", "q1", "memory", "r14"); -#endif + #endif sum_tmp *= conv_params->input_offset; acc_n0 += sum_tmp; @@ -129,8 +130,9 @@ arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, acc_n0 += conv_params->output_offset; acc_n0 = MAX(acc_n0, conv_params->activation.min); acc_n0 = MIN(acc_n0, conv_params->activation.max); - *output++ = (q7_t)acc_n0; + *output++ = (int8_t)acc_n0; } + return ARM_CMSIS_NN_SUCCESS; #else (void)row_elements; @@ -142,8 +144,8 @@ arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, (void)quant_params; (void)bias; (void)output; + return ARM_CMSIS_NN_NO_IMPL_ERROR; #endif - return ARM_CMSIS_NN_SUCCESS; } /** diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c index 8b69107c..643a9c7f 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c @@ -21,8 +21,8 @@ * Title: arm_nn_mat_mul_core_4x_s8.c * Description: General matrix multiplication function for MVE extension * - * $Date: 22. 
Aug 2022 - * $Revision: V.3.1.0 + * $Date: 13 December 2022 + * $Revision: V.3.1.1 * * Target Processor: Cortex-M processors * -------------------------------------------------------------------- */ @@ -81,19 +81,20 @@ int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements, acc_n3 += ip_row_3[j] * col; } #else - __ASM volatile(" vldrb.8 q0, [%[col]], #16 \n" + __ASM volatile(" .p2align 2 \n" + " vldrb.8 q0, [%[col]], #16 \n" " wlstp.8 lr, %[cnt], 1f \n" "2: \n" " vaddva.s8 %[sum], q0 \n" - " vldrb.8 q1, [%[row0]], #16 \n" + " vldrb.8 q1, [%[row0]], #16 \n" " vmladava.s8 %[out0], q0, q1 \n" - " vldrb.8 q2, [%[row1]], #16 \n" + " vldrb.8 q2, [%[row1]], #16 \n" " vmladava.s8 %[out1], q0, q2 \n" - " vldrb.8 q3, [%[row2]], #16 \n" + " vldrb.8 q3, [%[row2]], #16 \n" " vmladava.s8 %[out2], q0, q3 \n" - " vldrb.8 q4, [%[row3]], #16 \n" + " vldrb.8 q4, [%[row3]], #16 \n" " vmladava.s8 %[out3], q0, q4 \n" - " vldrb.8 q0, [%[col]], #16 \n" + " vldrb.8 q0, [%[col]], #16 \n" " letp lr, 2b \n" "1: \n" : [col] "+r"(col_base), diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c index 2295b0fe..28cd534a 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates + * * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +22,10 @@ * Title: arm_nn_mat_mult_kernel_s16.c * Description: Matrix-multiplication function for convolution * - * $Date: 12 August 2021 - * $Revision: V.1.1.0 + * $Date: 5 January 2023 + * $Revision: V.1.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" @@ -46,74 +47,74 @@ * */ -q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, - const q15_t *input_b, - const int32_t output_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int16_t activation_min, - const int16_t activation_max, - const int32_t num_col_a, - const int64_t *const output_bias, - q15_t *out_0) +int16_t *arm_nn_mat_mult_kernel_s16(const int8_t *input_a, + const int16_t *input_b, + const int32_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int16_t activation_min, + const int16_t activation_max, + const int32_t num_col_a, + const int64_t *const output_bias, + int16_t *out_0) { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* set up the second output pointers */ - q15_t *out_1 = out_0 + output_ch; + int16_t *out_1 = out_0 + output_ch; const int64_t *bias = output_bias; uint16_t row_count = output_ch / 2; - const q7_t *ip_a0 = input_a; + const int8_t *ip_a0 = input_a; /* this loop over rows in A */ while (row_count) { /* setup pointers for B */ - const q15_t *ip_b0 = input_b; - const q15_t *ip_b1 = ip_b0 + num_col_a; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 + num_col_a; /* align the second pointer for A */ - const q7_t *ip_a1 = ip_a0 + num_col_a; + const int8_t *ip_a1 = ip_a0 + num_col_a; /* Init accumulator for channel N and N + 1 */ - q31_t ch_0_out_0 = 0; - q31_t ch_0_out_1 = 0; - q31_t ch_1_out_0 = 0; - q31_t ch_1_out_1 = 0; + int32_t ch_0_out_0 = 0; + int32_t 
ch_0_out_1 = 0; + int32_t ch_1_out_0 = 0; + int32_t ch_1_out_1 = 0; uint16_t col_count = num_col_a / 4; /* accumulate over the vector */ while (col_count) { - q31_t a01, a02, a11, a12; - q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); - q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + int32_t a01, a02, a11, a12; + int32_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + int32_t b1 = arm_nn_read_q15x2_ia(&ip_b1); ip_a0 = read_and_pad(ip_a0, &a01, &a02); ip_a1 = read_and_pad(ip_a1, &a11, &a12); - ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); - ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); - ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + ch_0_out_0 = SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = SMLAD(a11, b1, ch_1_out_1); b0 = arm_nn_read_q15x2_ia(&ip_b0); b1 = arm_nn_read_q15x2_ia(&ip_b1); - ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); - ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); - ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + ch_0_out_0 = SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = SMLAD(a12, b1, ch_1_out_1); col_count--; } /* while over col_count */ col_count = num_col_a & 0x3; while (col_count) { - q7_t a0 = *ip_a0++; - q15_t b0 = *ip_b0++; - q7_t a1 = *ip_a1++; - q15_t b1 = *ip_b1++; + int8_t a0 = *ip_a0++; + int16_t b0 = *ip_b0++; + int8_t a1 = *ip_a1++; + int16_t b1 = *ip_b1++; ch_0_out_0 += a0 * b0; ch_0_out_1 += a0 * b1; @@ -123,8 +124,8 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } /* while over col_count */ if (bias) { - q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); - q63_t acc_64 = ch_0_out_0 + *bias; + int32_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); + int64_t acc_64 = ch_0_out_0 + *bias; ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); acc_64 = ch_0_out_1 + *bias++; ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); @@ -138,17 +139,17 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } ch_0_out_0 = MAX(ch_0_out_0, activation_min); ch_0_out_0 = MIN(ch_0_out_0, activation_max); - *out_0++ = (q15_t)ch_0_out_0; + *out_0++ = (int16_t)ch_0_out_0; ch_0_out_1 = MAX(ch_0_out_1, activation_min); ch_0_out_1 = MIN(ch_0_out_1, activation_max); - *out_1++ = (q15_t)ch_0_out_1; + *out_1++ = (int16_t)ch_0_out_1; out_shift++; if (bias) { - q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); - q63_t acc_64 = ch_1_out_0 + *bias; + int32_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); + int64_t acc_64 = ch_1_out_0 + *bias; ch_1_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); acc_64 = ch_1_out_1 + *bias++; ch_1_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); @@ -162,11 +163,11 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } ch_1_out_0 = MAX(ch_1_out_0, activation_min); ch_1_out_0 = MIN(ch_1_out_0, activation_max); - *out_0++ = (q15_t)ch_1_out_0; + *out_0++ = (int16_t)ch_1_out_0; ch_1_out_1 = MAX(ch_1_out_1, activation_min); ch_1_out_1 = MIN(ch_1_out_1, activation_max); - *out_1++ = (q15_t)ch_1_out_1; + *out_1++ = (int16_t)ch_1_out_1; out_shift++; /* skip row */ @@ -178,37 +179,37 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, if (output_ch & 0x1) { /* setup pointers for B */ - const q15_t *ip_b0 = input_b; - const q15_t *ip_b1 = ip_b0 + num_col_a; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 
+ num_col_a; - q31_t ch_0_out_0 = 0; - q31_t ch_0_out_1 = 0; + int32_t ch_0_out_0 = 0; + int32_t ch_0_out_1 = 0; uint16_t col_count = num_col_a >> 2; while (col_count) { - q31_t a01, a02; - q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); - q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + int32_t a01, a02; + int32_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + int32_t b1 = arm_nn_read_q15x2_ia(&ip_b1); ip_a0 = read_and_pad(ip_a0, &a01, &a02); - ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_0_out_0 = SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a01, b1, ch_0_out_1); b0 = arm_nn_read_q15x2_ia(&ip_b0); b1 = arm_nn_read_q15x2_ia(&ip_b1); - ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_0_out_0 = SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a02, b1, ch_0_out_1); col_count--; } col_count = num_col_a & 0x3; while (col_count) { - q7_t a0 = *ip_a0++; - q15_t b0 = *ip_b0++; - q15_t b1 = *ip_b1++; + int8_t a0 = *ip_a0++; + int16_t b0 = *ip_b0++; + int16_t b1 = *ip_b1++; ch_0_out_0 += a0 * b0; ch_0_out_1 += a0 * b1; @@ -216,8 +217,8 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } if (bias) { - q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); - q63_t acc_64 = ch_0_out_0 + *bias; + int32_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); + int64_t acc_64 = ch_0_out_0 + *bias; ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); acc_64 = ch_0_out_1 + *bias++; ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); @@ -229,11 +230,11 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } ch_0_out_0 = MAX(ch_0_out_0, activation_min); ch_0_out_0 = MIN(ch_0_out_0, activation_max); - *out_0++ = (q15_t)ch_0_out_0; + *out_0++ = (int16_t)ch_0_out_0; ch_0_out_1 = MAX(ch_0_out_1, activation_min); ch_0_out_1 = MIN(ch_0_out_1, activation_max); - *out_1++ = (q15_t)ch_0_out_1; + *out_1++ = (int16_t)ch_0_out_1; out_mult++; out_shift++; } diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c index a446db8c..b8f9c14f 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_mat_mult_s8_nt_t_s8 * Description: Matrix multiplication support function with the right-hand-side (rhs) matrix transposed * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 5 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -45,10 +45,10 @@ * Refer header file for details. 
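The revised kernel below gains a rhs_cols_offset argument: the stride, in elements, between consecutive LHS rows, which lets a caller leave padding between rows. The previous behaviour corresponds to rhs_cols_offset == rhs_cols, and the function now rejects rhs_cols_offset < rhs_cols with ARM_CMSIS_NN_ARG_ERROR. A minimal scalar sketch of the indexing this implies (the helper name is ours; bias, per-channel requantization and the int8 clamp of the real kernel are left out):

    #include <stdint.h>

    // RHS is transposed, so out[m][n] is the dot product of LHS row m and RHS
    // row n. LHS rows sit rhs_cols_offset elements apart, of which only the
    // first rhs_cols are read; lhs_offset is the input zero-point correction.
    static void mat_mult_nt_t_s8_ref(const int8_t *lhs, const int8_t *rhs,
                                     int32_t *out, int32_t lhs_rows,
                                     int32_t rhs_rows, int32_t rhs_cols,
                                     int32_t rhs_cols_offset, int32_t lhs_offset)
    {
        for (int32_t m = 0; m < lhs_rows; ++m)
        {
            for (int32_t n = 0; n < rhs_rows; ++n)
            {
                int32_t acc = 0;
                for (int32_t k = 0; k < rhs_cols; ++k)
                {
                    acc += (lhs[m * rhs_cols_offset + k] + lhs_offset) * rhs[n * rhs_cols + k];
                }
                out[m * rhs_rows + n] = acc; // requantized and clamped to int8 in the real kernel
            }
        }
    }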
* */ -arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, - const q7_t *rhs, - const q31_t *bias, - q7_t *dst, +arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const int8_t *lhs, + const int8_t *rhs, + const int32_t *bias, + int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, @@ -57,18 +57,179 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, - const int32_t activation_max) + const int32_t activation_max, + const int32_t rhs_cols_offset) { -#if defined(ARM_MATH_DSP) + if (rhs_cols_offset < rhs_cols) + { + return ARM_CMSIS_NN_ARG_ERROR; + } +#if defined(ARM_MATH_MVEI) + + int8_t *out_ref = dst; + const int8_t *in_ref = lhs; + (void)out_ref; + (void)in_ref; + int32_t offset = rhs_cols_offset; + int i_items = 0; + for (; i_items <= (lhs_rows - 4); i_items += 4) + { + for (int i = 0; i < rhs_rows; i++) + { + int32_t acc_n0 = 0; + int32_t acc_n1 = 0; + int32_t acc_n2 = 0; + int32_t acc_n3 = 0; + + const int8_t *lhs_vec = lhs; + const int8_t *ip_row_1 = lhs + offset; + const int8_t *ip_row_2 = lhs + (2 * offset); + const int8_t *ip_row_3 = lhs + (3 * offset); + const int8_t *col_base = rhs + i * rhs_cols; + int32_t sum_tmp = 0; + + #if defined(ARM_MATH_AUTOVECTORIZE) + for (int j = 0; j < rhs_cols; j++) + { + int32_t col = col_base[j]; + sum_tmp += col; + acc_n0 += lhs_vec[j] * col; + acc_n1 += ip_row_1[j] * col; + acc_n2 += ip_row_2[j] * col; + acc_n3 += ip_row_3[j] * col; + } + #else + __ASM volatile(" .p2align 2 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" + " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], #16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q2, [%[row1]], #16 \n" + " vmladava.s8 %[out1], q0, q2 \n" + " vldrb.8 q3, [%[row2]], #16 \n" + " vmladava.s8 %[out2], q0, q3 \n" + " vldrb.8 q4, [%[row3]], #16 \n" + " vmladava.s8 %[out3], q0, q4 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " letp lr, 2b \n" + "1: \n" + : [col] "+r"(col_base), + [sum] "+Te"(sum_tmp), + [row0] "+r"(lhs_vec), + [row1] "+r"(ip_row_1), + [row2] "+r"(ip_row_2), + [row3] "+r"(ip_row_3), + [out0] "+Te"(acc_n0), + [out1] "+Te"(acc_n1), + [out2] "+Te"(acc_n2), + [out3] "+Te"(acc_n3) + : [cnt] "r"(rhs_cols) + : "q0", "q1", "q2", "q3", "q4", "memory", "r14"); + #endif + int32x4_t res = {acc_n0, acc_n1, acc_n2, acc_n3}; + sum_tmp *= lhs_offset; + if (bias) + { + sum_tmp += bias[i]; + } + res = vaddq_n_s32(res, sum_tmp); + + res = arm_requantize_mve(res, dst_multipliers[i], dst_shifts[i]); + res = vaddq_n_s32(res, dst_offset); + + res = vmaxq_s32(res, vdupq_n_s32(activation_min)); + res = vminq_s32(res, vdupq_n_s32(activation_max)); + + const uint32x4_t scatter_offset = {0, (uint32_t)rhs_rows, (uint32_t)rhs_rows * 2, (uint32_t)rhs_rows * 3}; + vstrbq_scatter_offset_s32(dst, scatter_offset, res); + dst++; + } + lhs += 4 * offset; + dst += (3 * rhs_rows); + } + + for (; i_items < lhs_rows; i_items++) + { + int32_t acc[4]; + const int32_t *multipliers = dst_multipliers; + const int32_t *shifts = dst_shifts; + for (int i = 0; i < rhs_rows; i++) + { + int32_t acc_n0 = 0; + const int8_t *lhs_vec = lhs; + const int8_t *col_base = rhs + i * rhs_cols; + int32_t sum_tmp = 0; + + #if defined(ARM_MATH_AUTOVECTORIZE) + for (int j = 0; j < rhs_cols; j++) + { + int32_t col = col_base[j]; + sum_tmp += col; + acc_n0 += lhs_vec[j] * col; + } + #else + __ASM volatile(" .p2align 2 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" 
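/* Low-overhead tail-predicated loop: wlstp.8 primes lr with the element
 * count (rhs_cols) and letp branches back to label 2, predicating the
 * final partial 16-byte vector automatically. Each pass accumulates the
 * column sum with vaddva.s8 (multiplied by lhs_offset after the loop, so
 * the zero-point correction is applied once per output) and the running
 * dot product with vmladava.s8. */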
+ " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], #16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " letp lr, 2b \n" + "1: \n" + : [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(lhs_vec), [out0] "+Te"(acc_n0) + : [cnt] "r"(rhs_cols) + : "q0", "q1", "memory", "r14"); + #endif + sum_tmp *= lhs_offset; + sum_tmp += acc_n0; + if (bias) + { + sum_tmp += bias[i]; + } + const int32_t index = i & 0x3; + acc[index] = sum_tmp; + + if (index == 3) + { + int32x4_t res = vldrwq_s32(acc); + res = arm_requantize_mve_32x4(res, vldrwq_s32(multipliers), vldrwq_s32(shifts)); + multipliers += 4; + shifts += 4; + res = vaddq_n_s32(res, dst_offset); + res = vmaxq_s32(res, vdupq_n_s32(activation_min)); + res = vminq_s32(res, vdupq_n_s32(activation_max)); + vstrbq_s32(dst, res); + dst += 4; + } + } + lhs += offset; + + for (int i = 0; i < (rhs_rows & 0x3); i++) + { + int32_t acc_n0 = acc[i]; + acc_n0 = arm_nn_requantize(acc_n0, multipliers[i], shifts[i]); + acc_n0 += dst_offset; + acc_n0 = MAX(acc_n0, activation_min); + acc_n0 = MIN(acc_n0, activation_max); + *dst++ = (int8_t)acc_n0; + } + } + +#elif defined(ARM_MATH_DSP) const int32_t off0 = rhs_cols - 4; + const int32_t lhs_off0 = rhs_cols_offset - 4; for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2) { - const q7_t *lhs_ptr = &lhs[0]; - q7_t *dst_ptr = &dst[0]; + const int8_t *lhs_ptr = &lhs[0]; + int8_t *dst_ptr = &dst[0]; - q31_t lhs_offset_contribution0 = 0; - q31_t lhs_offset_contribution1 = 0; + int32_t lhs_offset_contribution0 = 0; + int32_t lhs_offset_contribution1 = 0; for (int32_t x = 0; x < rhs_cols; ++x) { @@ -88,130 +249,158 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, while (lhs_rows_idx) { - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = lhs_offset_contribution0; - q31_t res01 = lhs_offset_contribution1; - q31_t res10 = lhs_offset_contribution0; - q31_t res11 = lhs_offset_contribution1; + int32_t res00 = lhs_offset_contribution0; + int32_t res01 = lhs_offset_contribution1; + int32_t res10 = lhs_offset_contribution0; + int32_t res11 = lhs_offset_contribution1; int32_t rhs_cols_idx = 0; - q31_t val0, val1, val2, val3, val4, val5; + int32_t val0, val1, val2, val3, val4, val5; for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) { - val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val2 = __SXTB16(val1); - val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val1 = __SXTB16_RORn(val1, 8); - val0 = __SXTB16_RORn(val0, 8); + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val2 = SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + res11 = SMLAD(val0, val4, res11); + + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = 
SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val3, val2, res00); - val5 = __SXTB16(val4); - res00 = __SMLAD(val0, val1, res00); - val4 = __SXTB16_RORn(val4, 8); - res01 = __SMLAD(val3, val5, res01); - res01 = __SMLAD(val0, val4, res01); + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); // 4 x MAC res10, res11 - val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); - val3 = __SXTB16(val0); - val0 = __SXTB16_RORn(val0, 8); - res10 = __SMLAD(val3, val2, res10); - res11 = __SMLAD(val3, val5, res11); - res10 = __SMLAD(val0, val1, res10); - val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - res11 = __SMLAD(val0, val4, res11); - - val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = __SXTB16(val1); - val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val1 = __SXTB16_RORn(val1, 8); - val0 = __SXTB16_RORn(val0, 8); + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + res11 = SMLAD(val0, val4, res11); + + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val3, val2, res00); - val5 = __SXTB16(val4); - res00 = __SMLAD(val0, val1, res00); - val4 = __SXTB16_RORn(val4, 8); - res01 = __SMLAD(val3, val5, res01); - res01 = __SMLAD(val0, val4, res01); + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); // 4 x MAC res10, res11 - val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); - val3 = __SXTB16(val0); - val0 = __SXTB16_RORn(val0, 8); - res10 = __SMLAD(val3, val2, res10); - res11 = __SMLAD(val3, val5, res11); - res10 = __SMLAD(val0, val1, res10); - val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - res11 = __SMLAD(val0, val4, res11); - - val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = __SXTB16(val1); - val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val1 = __SXTB16_RORn(val1, 8); - val0 = __SXTB16_RORn(val0, 8); + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + res11 = SMLAD(val0, val4, res11); + + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val3, val2, res00); - val5 = __SXTB16(val4); - res00 = __SMLAD(val0, val1, res00); - val4 = __SXTB16_RORn(val4, 8); - res01 = __SMLAD(val3, val5, res01); - res01 = __SMLAD(val0, val4, res01); + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + 
res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); // 4 x MAC res10, res11 - val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); - val3 = __SXTB16(val0); - val0 = __SXTB16_RORn(val0, 8); - res10 = __SMLAD(val3, val2, res10); - res11 = __SMLAD(val3, val5, res11); - res10 = __SMLAD(val0, val1, res10); - val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - res11 = __SMLAD(val0, val4, res11); - - val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = __SXTB16(val1); - val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val1 = __SXTB16_RORn(val1, 8); - val0 = __SXTB16_RORn(val0, 8); + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + res11 = SMLAD(val0, val4, res11); + } + + for (; rhs_cols_idx <= (rhs_cols - 4); rhs_cols_idx += 4) + { + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val2 = SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val3, val2, res00); - val5 = __SXTB16(val4); - res00 = __SMLAD(val0, val1, res00); - val4 = __SXTB16_RORn(val4, 8); - res01 = __SMLAD(val3, val5, res01); - res01 = __SMLAD(val0, val4, res01); + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); // 4 x MAC res10, res11 - val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); - val3 = __SXTB16(val0); - val0 = __SXTB16_RORn(val0, 8); - res10 = __SMLAD(val3, val2, res10); - res11 = __SMLAD(val3, val5, res11); - res10 = __SMLAD(val0, val1, res10); - res11 = __SMLAD(val0, val4, res11); + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + res11 = SMLAD(val0, val4, res11); } for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q7_t rhs_value0 = rhs_ptr[0]; - q7_t rhs_value1 = rhs_ptr[rhs_cols]; - q7_t lhs_value = lhs_ptr[0]; + int8_t rhs_value0 = rhs_ptr[0]; + int8_t rhs_value1 = rhs_ptr[rhs_cols]; + int8_t lhs_value = lhs_ptr[0]; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; - lhs_value = lhs_ptr[rhs_cols]; + lhs_value = lhs_ptr[rhs_cols_offset]; res10 += lhs_value * rhs_value0; res11 += lhs_value * rhs_value1; @@ -241,14 +430,15 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res11 = MAX(res11, activation_min); res11 = MIN(res11, activation_max); - dst_ptr[0] = (q7_t)res00; - dst_ptr[1] = (q7_t)res01; + dst_ptr[0] = (int8_t)res00; + dst_ptr[1] = (int8_t)res01; dst_ptr += rhs_rows; - dst_ptr[0] = (q7_t)res10; - dst_ptr[1] = (q7_t)res11; + dst_ptr[0] = (int8_t)res10; + dst_ptr[1] = (int8_t)res11; dst_ptr += rhs_rows; - lhs_ptr += rhs_cols; + lhs_ptr -= rhs_cols; + lhs_ptr += 2 * rhs_cols_offset; lhs_rows_idx--; } @@ -256,87 +446,106 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, // Left-over rows if (lhs_rows % 2) { - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = lhs_offset_contribution0; - q31_t res01 = 
lhs_offset_contribution1; + int32_t res00 = lhs_offset_contribution0; + int32_t res01 = lhs_offset_contribution1; int32_t rhs_cols_idx = 0; - q31_t val0, val1, val2, val3, val4, val5; + int32_t val0, val1, val2, val3, val4, val5; for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) { - val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val5 = __SXTB16(val2); - val4 = __SXTB16(val1); - val0 = __SXTB16_RORn(val0, 8); - val2 = __SXTB16_RORn(val2, 8); - val1 = __SXTB16_RORn(val1, 8); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val5, val3, res00); - res00 = __SMLAD(val2, val0, res00); - res01 = __SMLAD(val5, val4, res01); - res01 = __SMLAD(val2, val1, res01); - - val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val5 = __SXTB16(val2); - val4 = __SXTB16(val1); - val0 = __SXTB16_RORn(val0, 8); - val2 = __SXTB16_RORn(val2, 8); - val1 = __SXTB16_RORn(val1, 8); + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); + + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val5, val3, res00); - res00 = __SMLAD(val2, val0, res00); - res01 = __SMLAD(val5, val4, res01); - res01 = __SMLAD(val2, val1, res01); - - val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val5 = __SXTB16(val2); - val4 = __SXTB16(val1); - val0 = __SXTB16_RORn(val0, 8); - val2 = __SXTB16_RORn(val2, 8); - val1 = __SXTB16_RORn(val1, 8); + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); + + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val5, val3, res00); - res00 = __SMLAD(val2, val0, res00); - res01 = __SMLAD(val5, val4, res01); - res01 = __SMLAD(val2, val1, res01); - - val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val5 = __SXTB16(val2); - val4 = __SXTB16(val1); - val0 = __SXTB16_RORn(val0, 8); - val2 = __SXTB16_RORn(val2, 8); - val1 = __SXTB16_RORn(val1, 8); + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, 
res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); + + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); + } + + for (; rhs_cols_idx <= (rhs_cols - 4); rhs_cols_idx += 4) + { + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val5, val3, res00); - res00 = __SMLAD(val2, val0, res00); - res01 = __SMLAD(val5, val4, res01); - res01 = __SMLAD(val2, val1, res01); + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); } // Left-over accumulations for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q7_t rhs_value0 = rhs_ptr[0]; - q7_t rhs_value1 = rhs_ptr[rhs_cols]; - q7_t lhs_value = lhs_ptr[0]; + int8_t rhs_value0 = rhs_ptr[0]; + int8_t rhs_value1 = rhs_ptr[rhs_cols]; + int8_t lhs_value = lhs_ptr[0]; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; @@ -359,8 +568,8 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res01 = MAX(res01, activation_min); res01 = MIN(res01, activation_max); - dst_ptr[0] = (q7_t)res00; - dst_ptr[1] = (q7_t)res01; + dst_ptr[0] = (int8_t)res00; + dst_ptr[1] = (int8_t)res01; } rhs += 2 * rhs_cols; @@ -369,13 +578,13 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, if (rhs_rows % 2) { - const q7_t *lhs_ptr = &lhs[0]; - q7_t *dst_ptr = &dst[0]; + const int8_t *lhs_ptr = &lhs[0]; + int8_t *dst_ptr = &dst[0]; for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx) { - const q7_t *rhs_ptr = &rhs[0]; - q31_t res00 = 0; + const int8_t *rhs_ptr = &rhs[0]; + int32_t res00 = 0; if (bias) { res00 = bias[rhs_rows - 1]; @@ -383,14 +592,16 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q31_t rhs_value = rhs_ptr[0]; - q31_t lhs_value = lhs_ptr[0] + lhs_offset; + int32_t rhs_value = rhs_ptr[0]; + int32_t lhs_value = lhs_ptr[0] + lhs_offset; res00 += lhs_value * rhs_value; ++rhs_ptr; ++lhs_ptr; } + lhs_ptr -= rhs_cols; + lhs_ptr += rhs_cols_offset; // Quantize down res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]); @@ -402,18 +613,19 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res00 = MAX(res00, activation_min); res00 = MIN(res00, activation_max); - dst_ptr[0] = (q7_t)res00; + dst_ptr[0] = (int8_t)res00; dst_ptr += rhs_rows; } } #else + (void)rhs_cols_offset; for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2) { - const q7_t *lhs_ptr = &lhs[0]; - q7_t *dst_ptr = &dst[0]; + const int8_t *lhs_ptr = &lhs[0]; + int8_t *dst_ptr = &dst[0]; - q31_t lhs_offset_contribution0 = 0; - q31_t lhs_offset_contribution1 = 0; + int32_t lhs_offset_contribution0 = 0; + int32_t 
lhs_offset_contribution1 = 0; for (int32_t x = 0; x < rhs_cols; ++x) { @@ -433,23 +645,23 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, while (lhs_rows_idx) { - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = lhs_offset_contribution0; - q31_t res01 = lhs_offset_contribution1; - q31_t res10 = lhs_offset_contribution0; - q31_t res11 = lhs_offset_contribution1; + int32_t res00 = lhs_offset_contribution0; + int32_t res01 = lhs_offset_contribution1; + int32_t res10 = lhs_offset_contribution0; + int32_t res11 = lhs_offset_contribution1; for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) { - q7_t rhs_value0 = rhs_ptr[0]; - q7_t rhs_value1 = rhs_ptr[rhs_cols]; - q7_t lhs_value = lhs_ptr[0]; + int8_t rhs_value0 = rhs_ptr[0]; + int8_t rhs_value1 = rhs_ptr[rhs_cols]; + int8_t lhs_value = lhs_ptr[0]; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; - lhs_value = lhs_ptr[rhs_cols]; + lhs_value = lhs_ptr[rhs_cols_offset]; res10 += lhs_value * rhs_value0; res11 += lhs_value * rhs_value1; @@ -479,14 +691,15 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res11 = MAX(res11, activation_min); res11 = MIN(res11, activation_max); - dst_ptr[0] = (q7_t)res00; - dst_ptr[1] = (q7_t)res01; + dst_ptr[0] = (int8_t)res00; + dst_ptr[1] = (int8_t)res01; dst_ptr += rhs_rows; - dst_ptr[0] = (q7_t)res10; - dst_ptr[1] = (q7_t)res11; + dst_ptr[0] = (int8_t)res10; + dst_ptr[1] = (int8_t)res11; dst_ptr += rhs_rows; - lhs_ptr += rhs_cols; + lhs_ptr -= rhs_cols; + lhs_ptr += 2 * rhs_cols_offset; lhs_rows_idx--; } @@ -494,16 +707,16 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, // Left-over rows if (lhs_rows % 2) { - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = lhs_offset_contribution0; - q31_t res01 = lhs_offset_contribution1; + int32_t res00 = lhs_offset_contribution0; + int32_t res01 = lhs_offset_contribution1; for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) { - q7_t rhs_value0 = rhs_ptr[0]; - q7_t rhs_value1 = rhs_ptr[rhs_cols]; - q7_t lhs_value = lhs_ptr[0]; + int8_t rhs_value0 = rhs_ptr[0]; + int8_t rhs_value1 = rhs_ptr[rhs_cols]; + int8_t lhs_value = lhs_ptr[0]; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; @@ -526,8 +739,8 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res01 = MAX(res01, activation_min); res01 = MIN(res01, activation_max); - dst_ptr[0] = (q7_t)res00; - dst_ptr[1] = (q7_t)res01; + dst_ptr[0] = (int8_t)res00; + dst_ptr[1] = (int8_t)res01; } rhs += 2 * rhs_cols; @@ -536,13 +749,13 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, if (rhs_rows % 2) { - const q7_t *lhs_ptr = &lhs[0]; - q7_t *dst_ptr = &dst[0]; + const int8_t *lhs_ptr = &lhs[0]; + int8_t *dst_ptr = &dst[0]; for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx) { - const q7_t *rhs_ptr = &rhs[0]; - q31_t res00 = 0; + const int8_t *rhs_ptr = &rhs[0]; + int32_t res00 = 0; if (bias) { res00 = bias[rhs_rows - 1]; @@ -550,14 +763,16 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) { - q31_t rhs_value = rhs_ptr[0]; - q31_t lhs_value = lhs_ptr[0] + lhs_offset; + int32_t rhs_value = rhs_ptr[0]; + int32_t lhs_value = lhs_ptr[0] + lhs_offset; res00 += lhs_value * rhs_value; ++rhs_ptr; ++lhs_ptr; } + lhs_ptr -= rhs_cols; + lhs_ptr += rhs_cols_offset; // Quantize down res00 = arm_nn_requantize(res00, 
dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]); @@ -569,7 +784,7 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res00 = MAX(res00, activation_min); res00 = MIN(res00, activation_max); - dst_ptr[0] = (q7_t)res00; + dst_ptr[0] = (int8_t)res00; dst_ptr += rhs_rows; } } diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s8.c new file mode 100644 index 00000000..b1427cca --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s8.c @@ -0,0 +1,347 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_vec_mat_mul_result_acc_s8.c + * Description: Multiplies a matrix by a vector and accumulate with output. + * + * $Date: 20 January 2023 + * $Revision: V.1.2.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Refer to header file for details. 
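In scalar terms, each batch of this new support function computes, for every RHS row, the s8 dot product plus bias, rescales it with a single per-tensor multiplier/shift pair, adds dst_offset, accumulates onto the value already in the s16 destination, and saturates to the Q15 range; the MVE, DSP and pure-C paths below all implement this same arithmetic. A reference sketch (the helper name is ours; arm_nn_requantize, CLAMP and the NN_Q15_* limits come from arm_nnsupportfunctions.h):

    #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h"

    // One batch: dst[row] += requantize(lhs . rhs_row + bias[row]) + dst_offset,
    // saturated to [NN_Q15_MIN, NN_Q15_MAX].
    static void vec_mat_mul_result_acc_s8_ref(const int8_t *lhs, const int8_t *rhs,
                                              const int32_t *bias, int16_t *dst,
                                              const int32_t dst_offset,
                                              const int32_t dst_multiplier,
                                              const int32_t dst_shift,
                                              const int32_t rhs_cols,
                                              const int32_t rhs_rows)
    {
        for (int32_t row = 0; row < rhs_rows; ++row)
        {
            int32_t acc = bias[row];
            for (int32_t col = 0; col < rhs_cols; ++col)
            {
                acc += lhs[col] * rhs[row * rhs_cols + col];
            }
            acc = arm_nn_requantize(acc, dst_multiplier, dst_shift);
            acc += dst_offset + dst[row]; // accumulate with the existing output
            acc = CLAMP(acc, NN_Q15_MAX, NN_Q15_MIN);
            dst[row] = (int16_t)acc;
        }
    }

Accumulating rather than overwriting is what the supportLSTM group needs: the input and recurrent contributions of a gate can be summed into one buffer with two calls.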
+ */ +void arm_nn_vec_mat_mul_result_acc_s8(const int8_t *lhs_in, + const int8_t *rhs_in, + const int32_t *bias, + int16_t *dst, + const int32_t dst_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t batch) +{ + for (int i_batch = 0; i_batch < batch; ++i_batch) + { + const int8_t *rhs = rhs_in; + const int8_t *lhs = lhs_in + i_batch * rhs_cols; + +#if defined(ARM_MATH_MVEI) + const int32_t row_loop_cnt = rhs_rows / 4; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + int32_t acc_3 = 0; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + const int8_t *rhs_2 = rhs + 2 * rhs_cols; + const int8_t *rhs_3 = rhs + 3 * rhs_cols; + + int32_t col_cnt = rhs_cols; + + while (col_cnt > 0) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p); + acc_1 = vmladavaq_p_s8(acc_1, ker_1, input, p); + + const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p); + acc_2 = vmladavaq_p_s8(acc_2, ker_2, input, p); + + const int8x16_t ker_3 = vldrbq_z_s8(rhs_3, p); + acc_3 = vmladavaq_p_s8(acc_3, ker_3, input, p); + + lhs_vec += 16; + rhs_0 += 16; + rhs_1 += 16; + rhs_2 += 16; + rhs_3 += 16; + } + rhs += 4 * rhs_cols; + + int32x4_t acc = {acc_0, acc_1, acc_2, acc_3}; + int32x4_t b = vldrwq_s32(bias); + acc = vaddq_s32(acc, b); + bias += 4; + + acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); + acc = vaddq_s32(acc, vdupq_n_s32(dst_offset)); + + acc = vaddq_s32(acc, vldrhq_s32(dst)); + + acc = vmaxq_s32(acc, vdupq_n_s32(NN_Q15_MIN)); + acc = vminq_s32(acc, vdupq_n_s32(NN_Q15_MAX)); + + vstrhq_s32(dst, acc); + dst += 4; + } + + const int loop_cnt = rhs_rows % 4; + for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + int32_t col_cnt = rhs_cols; + + while (col_cnt > 0) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + lhs_vec += 16; + rhs_0 += 16; + } + rhs += rhs_cols; + + acc_0 += *bias; + bias++; + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_0 += dst_offset + *dst; + + // Clamp the result + acc_0 = CLAMP(acc_0, NN_Q15_MAX, NN_Q15_MIN); + *dst++ = (int16_t)acc_0; + } + +#elif defined(ARM_MATH_DSP) + const int32_t row_loop_cnt = rhs_rows / 2; + + for (int32_t i = 0; i < row_loop_cnt; i++) + { + int32_t acc_0 = *bias++; + int32_t acc_1 = *bias++; + + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + rhs += 2 * rhs_cols; + + for (int j = col_loop_cnt; j != 0; j--) + { + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTB16_RORn((uint32_t)vec_0, 8); + + vec_0 = SXTB16(vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + acc_1 = SMLAD(ker_1, vec_1, 
acc_1); + + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + } + + for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = *lhs_vec; + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0); + rhs_0++; + acc_1 += lhs_temp * (*rhs_1); + rhs_1++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset + *dst; + acc_1 += dst_offset + dst[1]; + // Clamp the result + acc_0 = CLAMP(acc_0, NN_Q15_MAX, NN_Q15_MIN); + acc_1 = CLAMP(acc_1, NN_Q15_MAX, NN_Q15_MIN); + + *dst++ = (int16_t)acc_0; + *dst++ = (int16_t)acc_1; + } + + if (rhs_rows & 0x1) + { + int32_t acc_0 = *bias++; + + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + + for (int i = col_loop_cnt; i != 0; i--) + { + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTB16_RORn((uint32_t)vec_0, 8); + vec_0 = SXTB16(vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + } + + for (int j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = *lhs_vec++; + acc_0 += lhs_temp * (*rhs_0++); + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset + *dst; + // Clamp the result + acc_0 = CLAMP(acc_0, NN_Q15_MAX, NN_Q15_MIN); + *dst++ = (int16_t)acc_0; + } + +#else + + const int32_t row_loop_cnt = rhs_rows / 3; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + const int8_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; + const int8_t *rhs_ptr_1 = &rhs[rhs_cols]; + const int8_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + int32_t res00 = *bias++; + int32_t res01 = *bias++; + int32_t res02 = *bias++; + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const int32_t rhs_value0 = (int8_t)*rhs_ptr_0; + const int32_t rhs_value1 = (int8_t)*rhs_ptr_1; + const int32_t rhs_value2 = (int8_t)*rhs_ptr_2; + const int32_t lhs_value = (int8_t)*lhs_ptr; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + res02 += lhs_value * rhs_value2; + + ++rhs_ptr_0; + ++rhs_ptr_1; + ++rhs_ptr_2; + ++lhs_ptr; + } + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift); + res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift); + + // Add offset + res00 += dst_offset + *dst; + res01 += dst_offset + dst[1]; + res02 += dst_offset + dst[2]; + + // Clamp the result + res00 = CLAMP(res00, NN_Q15_MAX, NN_Q15_MIN); + res01 = CLAMP(res01, NN_Q15_MAX, NN_Q15_MIN); + res02 = CLAMP(res02, NN_Q15_MAX, NN_Q15_MIN); + + dst[0] = (int16_t)res00; + dst[1] = (int16_t)res01; + dst[2] = (int16_t)res02; + dst += 3; + + rhs += 3 * rhs_cols; + } + + const int loop_cnt = rhs_rows % 3; + + for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) + { + const int8_t *lhs_ptr = &lhs[0]; + const int8_t *rhs_ptr = &rhs[0]; + + int32_t res00 = *bias++; + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + int32_t rhs_value0 = (int8_t)rhs_ptr[0]; + int32_t lhs_value = (int8_t)lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + + // Add offset + res00 += 
dst_offset + *dst; + + // Clamp the result + res00 = CLAMP(res00, NN_Q15_MAX, NN_Q15_MIN); + + *dst++ = (int16_t)res00; + rhs += rhs_cols; + } +#endif + } +} + +/** + * @} end of supportLSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c index c273d3af..4f056739 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,14 +21,15 @@ * Title: arm_nn_vec_mat_mult_t_s16 * Description: s16 vector by matrix (transposed) multiplication * - * $Date: 11 August 2022 - * $Revision: V.2.1.0 + * $Date: 5 January 2023 + * $Revision: V.2.2.0 * - * Target Processor: Cortex-M + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + #define MAX_COL_COUNT (512) /** @@ -46,10 +47,10 @@ * Refer header file for details. * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, - const q7_t *rhs, - const q63_t *bias, - q15_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const int16_t *lhs, + const int8_t *rhs, + const int64_t *bias, + int16_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, @@ -66,7 +67,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, rhs_cols_fast = MAX_COL_COUNT; } -#if defined(ARM_MATH_MVEI) + #if defined(ARM_MATH_MVEI) int32_t row_loop_cnt = rhs_rows / 4; int32_t col_loop_cnt = (rhs_cols_fast + 7) / 8; @@ -140,25 +141,25 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, tmp = arm_nn_requantize_s64(result_64_0, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; tmp = 0; tmp = arm_nn_requantize_s64(result_64_1, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; tmp = 0; tmp = arm_nn_requantize_s64(result_64_2, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; tmp = 0; tmp = arm_nn_requantize_s64(result_64_3, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; rhs += 4 * rhs_cols; } @@ -209,20 +210,20 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, tmp = arm_nn_requantize_s64(result_64, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; rhs += rhs_cols; } -#else // ARM_MATH_MVEI + #else // ARM_MATH_MVEI const int32_t row_loop_cnt = rhs_rows / 2; for (int32_t i = 0; i < row_loop_cnt; i++) { - q63_t acc_64_0 = 0; - q63_t acc_64_1 = 0; + int64_t acc_64_0 = 0; + int64_t acc_64_1 = 0; int32_t acc_0 = 0; int32_t acc_1 = 0; @@ -242,13 +243,13 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1); - acc_0 = __SMLAD(ker_0, vec_part_0, acc_0); - acc_0 = __SMLAD(ker_1, vec_part_1, acc_0); + acc_0 = 
SMLAD(ker_0, vec_part_0, acc_0); + acc_0 = SMLAD(ker_1, vec_part_1, acc_0); rhs_1 = read_and_pad(rhs_1, &ker_0, &ker_1); - acc_1 = __SMLAD(ker_0, vec_part_0, acc_1); - acc_1 = __SMLAD(ker_1, vec_part_1, acc_1); + acc_1 = SMLAD(ker_0, vec_part_0, acc_1); + acc_1 = SMLAD(ker_1, vec_part_1, acc_1); } acc_64_0 += acc_0; @@ -269,22 +270,22 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, acc_64_0 += *bias++; acc_64_1 += *bias++; } - q31_t tmp; + int32_t tmp; tmp = arm_nn_requantize_s64(acc_64_0, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; tmp = arm_nn_requantize_s64(acc_64_1, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; } if (rhs_rows & 0x1) { - q63_t acc_64_0 = 0; + int64_t acc_64_0 = 0; int32_t acc_0 = 0; const int32_t col_loop_cnt = rhs_cols_fast / 4; @@ -297,10 +298,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1); vec = arm_nn_read_q15x2_ia(&lhs_vec); - acc_0 = __SMLAD(ker_0, vec, acc_0); + acc_0 = SMLAD(ker_0, vec, acc_0); vec = arm_nn_read_q15x2_ia(&lhs_vec); - acc_0 = __SMLAD(ker_1, vec, acc_0); + acc_0 = SMLAD(ker_1, vec, acc_0); } acc_64_0 += acc_0; @@ -317,26 +318,26 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, { acc_64_0 += *bias++; } - q31_t tmp; + int32_t tmp; tmp = arm_nn_requantize_s64(acc_64_0, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; } -#endif // ARM_MATH_MVEI -#else // ARM_MATH_DSP + #endif // ARM_MATH_MVEI +#else // ARM_MATH_DSP for (int i_row_loop_cnt = 0; i_row_loop_cnt < rhs_rows; i_row_loop_cnt++) { - const q15_t *lhs_ptr = lhs; - const q7_t *rhs_ptr_0 = &rhs[0]; + const int16_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; - q63_t result = 0; + int64_t result = 0; for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - const q63_t rhs_value0 = (int8_t)*rhs_ptr_0; - const q63_t lhs_value = *lhs_ptr; + const int64_t rhs_value0 = (int8_t)*rhs_ptr_0; + const int64_t lhs_value = *lhs_ptr; result += lhs_value * rhs_value0; @@ -355,10 +356,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, result = ((result) > (activation_min) ? (result) : (activation_min)); result = ((result) < (activation_max) ? 
(result) : (activation_max)); - *dst++ = (q15_t)result; + *dst++ = (int16_t)result; rhs += rhs_cols; } -#endif // ARM_MATH_DSP +#endif // ARM_MATH_DSP return ARM_CMSIS_NN_SUCCESS; } diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c index 371a1830..938530a4 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_vec_mat_mult_t_s8 * Description: s8 vector by matrix (transposed) multiplication * - * $Date: 16 Aug 2022 - * $Revision: V.4.0.2 + * $Date: 26 January 2023 + * $Revision: V.5.3.0 * - * Target Processor: Cortex-M + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,12 +52,14 @@ * Refer header file for details. * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, - const q7_t *rhs, - const q31_t *bias, - q7_t *dst, +#if defined(ARM_MATH_DSP) && !defined(__ARMCC_VERSION) && !defined(__ICCARM__) + #pragma GCC optimize("unroll-loops") +#endif +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, + const int8_t *rhs, + const int32_t *bias, + int8_t *dst, const int32_t lhs_offset, - const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, @@ -67,7 +69,6 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, const int32_t activation_max, const int32_t address_offset) { - (void)rhs_offset; #if defined(ARM_MATH_MVEI) const int32_t row_loop_cnt = rhs_rows / 3; const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3}; @@ -188,7 +189,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, #elif defined(ARM_MATH_DSP) const int32_t row_loop_cnt = rhs_rows / 2; const int16_t lhs_offset_s16 = (int16_t)lhs_offset; - const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16); + const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16); for (int32_t i = 0; i < row_loop_cnt; i++) { @@ -209,24 +210,24 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int j = col_loop_cnt; j != 0; j--) { - int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); - int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); - vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); - int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); - int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); - ker_0 = __SXTB16(ker_0); + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); - acc_0 = __SMLAD(ker_1, vec_1, acc_0); - acc_0 = __SMLAD(ker_0, vec_0, acc_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); - ker_0 = arm_nn_read_q7x4_ia(&rhs_1); - ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); - ker_0 = __SXTB16(ker_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); - acc_1 = __SMLAD(ker_1, 
vec_1, acc_1); - acc_1 = __SMLAD(ker_0, vec_0, acc_1); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); } for (int k = col_loop_cnt * 4; k < rhs_cols; k++) @@ -269,16 +270,16 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int i = col_loop_cnt; i != 0; i--) { - int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); - int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); - vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); - int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); - int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); - ker_0 = __SXTB16(ker_0); + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); - acc_0 = __SMLAD(ker_1, vec_1, acc_0); - acc_0 = __SMLAD(ker_0, vec_0, acc_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); } for (int j = col_loop_cnt * 4; j < rhs_cols; j++) @@ -306,14 +307,14 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) { - const q7_t *lhs_ptr = lhs; - const q7_t *rhs_ptr_0 = &rhs[0]; - const q7_t *rhs_ptr_1 = &rhs[rhs_cols]; - const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; - - q31_t res00 = 0; - q31_t res01 = 0; - q31_t res02 = 0; + const int8_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; + const int8_t *rhs_ptr_1 = &rhs[rhs_cols]; + const int8_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + int32_t res00 = 0; + int32_t res01 = 0; + int32_t res02 = 0; if (bias) { res00 = *bias++; @@ -322,10 +323,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, } for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - const q31_t rhs_value0 = (int8_t)*rhs_ptr_0; - const q31_t rhs_value1 = (int8_t)*rhs_ptr_1; - const q31_t rhs_value2 = (int8_t)*rhs_ptr_2; - const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; + const int32_t rhs_value0 = (int8_t)*rhs_ptr_0; + const int32_t rhs_value1 = (int8_t)*rhs_ptr_1; + const int32_t rhs_value2 = (int8_t)*rhs_ptr_2; + const int32_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; @@ -354,9 +355,9 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, res02 = MAX(res02, activation_min); res02 = MIN(res02, activation_max); - *dst = (q7_t)res00; - *(dst + address_offset) = (q7_t)res01; - *(dst + 2 * address_offset) = (q7_t)res02; + *dst = (int8_t)res00; + *(dst + address_offset) = (int8_t)res01; + *(dst + 2 * address_offset) = (int8_t)res02; dst += 3 * address_offset; rhs += 3 * rhs_cols; @@ -366,10 +367,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) { - const q7_t *lhs_ptr = &lhs[0]; - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *lhs_ptr = &lhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = 0; + int32_t res00 = 0; if (bias) { res00 = *bias++; @@ -377,8 +378,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q31_t rhs_value0 = (int8_t)rhs_ptr[0]; - q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; + int32_t rhs_value0 = (int8_t)rhs_ptr[0]; + int32_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; res00 += lhs_value * rhs_value0; diff --git 
a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c index e22095c3..1e2f4f84 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: s8 vector by matrix (transposed) multiplication with * s16 output. Targetted at SVDF operator. * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 5 January 2023 + * $Revision: V.3.1.0 * - * Target Processor: Cortex-M + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -46,11 +46,10 @@ * Refer header file for details. * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, - const q7_t *rhs, - q15_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const int8_t *lhs, + const int8_t *rhs, + int16_t *dst, const int32_t lhs_offset, - const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, @@ -59,13 +58,11 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, const int32_t activation_min, const int32_t activation_max) { - (void)rhs_offset; if (rhs_cols < 0 || (NN_Q31_MAX - rhs_cols) < 16 || dst_offset < 0) { return ARM_CMSIS_NN_ARG_ERROR; } - (void)rhs_offset; #if defined(ARM_MATH_MVEI) int32_t row_loop_cnt = rhs_rows / 3; @@ -153,12 +150,12 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, rhs += rhs_cols; const int32_t offsets = rhs_sum_0 * lhs_offset; - acc_0 = __QADD(acc_0, offsets); + acc_0 = QADD(acc_0, offsets); acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); // Clamp the result acc_0 = MAX(acc_0, activation_min); - *dst = (q15_t)MIN(acc_0, activation_max); + *dst = (int16_t)MIN(acc_0, activation_max); dst += dst_offset; } @@ -166,45 +163,126 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, int32_t row_loop_cnt = rhs_rows / 2; const int16_t lhs_offset_s16 = lhs_offset; - const int16_t rhs_offset_s16 = rhs_offset; - const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16); - const uint32_t rhs_offset_s16x2 = __PKHBT(rhs_offset_s16, rhs_offset_s16, 16); + const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16); for (int32_t i = 0; i < row_loop_cnt; i++) { int32_t acc_0 = 0; int32_t acc_1 = 0; - const int32_t col_loop_cnt = rhs_cols / 4; const int8_t *lhs_vec = lhs; const int8_t *rhs_0 = rhs; const int8_t *rhs_1 = rhs + rhs_cols; rhs += 2 * rhs_cols; - for (int j = col_loop_cnt; j != 0; j--) + + int32_t rhs_cols_idx = 0; + + int32_t vec_0, vec_1, ker_0, ker_1; + + #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang loop unroll(disable) + #endif + for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) { - int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); - int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); - vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); - int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); - int32_t ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); - ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); - acc_0 = __SMLAD(ker_1, vec_1, acc_0); - acc_0 = __SMLAD(ker_0, 
vec_0, acc_0); - ker_0 = arm_nn_read_q7x4_ia(&rhs_1); - ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); - ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); - acc_1 = __SMLAD(ker_1, vec_1, acc_1); - acc_1 = __SMLAD(ker_0, vec_0, acc_1); + // 4 x MAC acc_0, acc1 + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + + // 4 x MAC acc_0, acc1 + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + + // 4 x MAC acc_0, acc1 + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + + // 4 x MAC acc_0, acc1 + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + } + + #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang loop unroll(disable) + #endif + for (; rhs_cols_idx <= (rhs_cols - 4); rhs_cols_idx += 4) + { + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); } - for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + + #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang loop unroll(disable) + #endif + for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { const int32_t lhs_temp = (*lhs_vec + lhs_offset); lhs_vec++; - acc_0 += lhs_temp * (*rhs_0 + rhs_offset); + acc_0 += lhs_temp * (*rhs_0); rhs_0++; - acc_1 += lhs_temp * (*rhs_1 + rhs_offset); + acc_1 += lhs_temp * (*rhs_1); rhs_1++; 
} + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); @@ -213,8 +291,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, acc_0 = MIN(acc_0, activation_max); acc_1 = MAX(acc_1, activation_min); acc_1 = MIN(acc_1, activation_max); - *dst = (q15_t)acc_0; - *(dst + dst_offset) = (q15_t)acc_1; + *dst = (int16_t)acc_0; + *(dst + dst_offset) = (int16_t)acc_1; dst += 2 * dst_offset; } if (rhs_rows & 0x1) @@ -225,20 +303,22 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, const int8_t *rhs_0 = rhs; for (int i = col_loop_cnt; i != 0; i--) { - int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); - int32_t vec_1 = __SXTAB16(lhs_offset_s16x2, __ROR((uint32_t)vec_0, 8)); - vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); - int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); - int32_t ker_1 = __SXTAB16(rhs_offset_s16x2, __ROR((uint32_t)ker_0, 8)); - ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); - acc_0 = __SMLAD(ker_1, vec_1, acc_0); - acc_0 = __SMLAD(ker_0, vec_0, acc_0); + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16(lhs_offset_s16x2, ROR((uint32_t)vec_0, 8)); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); } for (int j = col_loop_cnt * 4; j < rhs_cols; j++) { const int32_t lhs_temp = (*lhs_vec + lhs_offset); lhs_vec++; - acc_0 += lhs_temp * (*rhs_0 + rhs_offset); + acc_0 += lhs_temp * *rhs_0; rhs_0++; } acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); @@ -246,7 +326,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, // Clamp the result acc_0 = MAX(acc_0, activation_min); acc_0 = MIN(acc_0, activation_max); - *dst = (q15_t)acc_0; + *dst = (int16_t)acc_0; dst += dst_offset; } @@ -256,20 +336,20 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) { - const q7_t *lhs_ptr = lhs; - const q7_t *rhs_ptr_0 = &rhs[0]; - const q7_t *rhs_ptr_1 = &rhs[rhs_cols]; - const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; - - q31_t res00 = 0; - q31_t res01 = 0; - q31_t res02 = 0; + const int8_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; + const int8_t *rhs_ptr_1 = &rhs[rhs_cols]; + const int8_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + int32_t res00 = 0; + int32_t res01 = 0; + int32_t res02 = 0; for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - const q31_t rhs_value0 = (int8_t)*rhs_ptr_0; - const q31_t rhs_value1 = (int8_t)*rhs_ptr_1; - const q31_t rhs_value2 = (int8_t)*rhs_ptr_2; - const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; + const int32_t rhs_value0 = (int8_t)*rhs_ptr_0; + const int32_t rhs_value1 = (int8_t)*rhs_ptr_1; + const int32_t rhs_value2 = (int8_t)*rhs_ptr_2; + const int32_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; @@ -293,9 +373,9 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, res02 = MAX(res02, activation_min); res02 = MIN(res02, activation_max); - *dst = (q15_t)res00; - *(dst + dst_offset) = (q15_t)res01; - *(dst + 2 * dst_offset) = (q15_t)res02; + *dst = (int16_t)res00; + *(dst + dst_offset) = (int16_t)res01; + *(dst + 2 * dst_offset) = (int16_t)res02; dst += 3 * dst_offset; rhs += 3 * rhs_cols; } @@ -304,15 +384,15 @@ arm_cmsis_nn_status 
arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) { - const q7_t *lhs_ptr = &lhs[0]; - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *lhs_ptr = &lhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = 0; + int32_t res00 = 0; for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q31_t rhs_value0 = (int8_t)rhs_ptr[0] + rhs_offset; - q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; + int32_t rhs_value0 = (int8_t)rhs_ptr[0]; + int32_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; res00 += lhs_value * rhs_value0; @@ -327,7 +407,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, res00 = MAX(res00, activation_min); res00 = MIN(res00, activation_max); - *dst = (q15_t)res00; + *dst = (int16_t)res00; dst += dst_offset; rhs += rhs_cols; } diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nntables.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nntables.c index 1bbfcfb1..7ccb89e0 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nntables.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nntables.c @@ -21,8 +21,8 @@ * Title: arm_nntables.c * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift * - * $Date: 30 September 2022 - * $Revision: V.2.0.0 + * $Date: 28 October 2022 + * $Revision: V.2.1.0 * * Target Processor: Cortex-M cores * @@ -38,3 +38,25 @@ * */ +// Table of sigmoid(i/24) at 0.16 format - 256 elements. +// Combined sigmoid and tanh look-up table, since +// tanh(x) = 2*sigmoid(2*x) -1. +// Both functions are symmetric, so the LUT table is only needed +// for the absolute value of the input. +const uint16_t sigmoid_table_uint16[256] = { + 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, 38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688, + 43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369, 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, + 51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755, 55125, 55485, 55834, 56174, 56503, 56823, 57133, 57433, + 57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519, 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, + 61279, 61441, 61599, 61750, 61896, 62036, 62172, 62302, 62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186, + 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835, 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, + 64357, 64405, 64450, 64494, 64536, 64576, 64614, 64652, 64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900, + 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, 65097, 65115, 65132, 65149, 65164, 65179, 65194, 65208, + 65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301, 65310, 65319, 65328, 65337, 65345, 65352, 65360, 65367, + 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, 65420, 65425, 65429, 65433, 65438, 65442, 65445, 65449, + 65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474, 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, + 65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, 65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513, + 65514, 65515, 65516, 65517, 65517, 65518, 65519, 65520, 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, + 65525, 65525, 65526, 65526, 65526, 65527, 65527, 65528, 65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530, + 65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532, 65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533, + 65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534, 65534, 65534, 65534, 65534, 
65534, 65534, 65534, 65535}; diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c index fd4bb8d4..1c4e87e7 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_q7_to_q15_with_offset.c * Description: Converts the elements of the Q7 vector to Q15 vector with an added offset * - * $Date: March 3, 2020 - * $Revision: V.2.0.2 + * $Date: 5 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -39,7 +39,7 @@ * @{ */ -void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset) +void arm_q7_to_q15_with_offset(const int8_t *src, int16_t *dst, uint32_t block_size, int16_t offset) { int block_cnt; @@ -63,28 +63,28 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, #elif defined(ARM_MATH_DSP) /* Run the below code for cores that support SIMD instructions */ - q31_t in_q7x4; - q31_t in_q15x2_1; - q31_t in_q15x2_2; - q31_t out_q15x2_1; - q31_t out_q15x2_2; + int32_t in_q7x4; + int32_t in_q15x2_1; + int32_t in_q15x2_2; + int32_t out_q15x2_1; + int32_t out_q15x2_2; /*loop unrolling */ block_cnt = block_size >> 2; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 
*/
-    const q31_t offset_q15x2 = __PKHBT(offset, offset, 16);
+    const int32_t offset_q15x2 = PKHBT(offset, offset, 16);
    while (block_cnt > 0)
    {
-        /* convert from q7 to q15 and then store the results in the destination buffer */
-        in_q7x4 = arm_nn_read_q7x4_ia(&src);
+        /* convert from s8 to s16 and then store the results in the destination buffer */
+        in_q7x4 = arm_nn_read_s8x4_ia(&src);
-        /* Extract and sign extend each of the four q7 values to q15 */
-        in_q15x2_1 = __SXTAB16(offset_q15x2, __ROR(in_q7x4, 8));
-        in_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4);
+        /* Extract and sign extend each of the four s8 values to s16 */
+        in_q15x2_1 = SXTAB16(offset_q15x2, ROR(in_q7x4, 8));
+        in_q15x2_2 = SXTAB16(offset_q15x2, in_q7x4);
-        out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16);
-        out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16);
+        out_q15x2_2 = PKHTB(in_q15x2_1, in_q15x2_2, 16);
+        out_q15x2_1 = PKHBT(in_q15x2_2, in_q15x2_1, 16);
        arm_nn_write_q15x2_ia(&dst, out_q15x2_1);
        arm_nn_write_q15x2_ia(&dst, out_q15x2_2);
@@ -102,7 +102,7 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size,
    while (block_cnt > 0)
    {
-        *dst++ = (q15_t)*src++ + offset;
+        *dst++ = (int16_t)*src++ + offset;
        /* Decrement the loop counter */
        block_cnt--;
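The DSP path above widens four s8 values per iteration: PKHBT packs the offset into both halfwords, SXTAB16 (with a ROR for the odd bytes) sign-extends and adds it, and PKHTB/PKHBT reorder the halfwords for the two q15x2 stores. A scalar sketch of the same transform, with a hypothetical helper name:

```c
#include <stdint.h>

/* Scalar equivalent of arm_q7_to_q15_with_offset(): widen each s8 element
 * to s16 and add a constant offset. The SXTAB16/PKHBT path above computes
 * exactly this, four elements per loop iteration. */
static void q7_to_q15_with_offset_ref(const int8_t *src, int16_t *dst,
                                      uint32_t block_size, int16_t offset)
{
    for (uint32_t i = 0; i < block_size; i++)
    {
        dst[i] = (int16_t)(src[i] + offset);
    }
}
```

diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s16.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s16.c
new file mode 100644
index 00000000..fae40824
--- /dev/null
+++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s16.c
@@ -0,0 +1,71 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_avgpool_get_buffer_sizes_s16.c
+ * Description: Collection of get buffer size functions for avgpool s16 layer function.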
+ *
+ * $Date: 13 January 2023
+ * $Revision: V.1.0.0
+ *
+ * Target : Arm(R) M-Profile Architecture
+ *
+ * -------------------------------------------------------------------- */
+
+#include "third_party/cmsis_nn/Include/arm_nnfunctions.h"
+
+/**
+ * @ingroup Pooling
+ */
+
+/**
+ * @addtogroup GetBufferSizePooling
+ * @{
+ */
+
+int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src)
+{
+#if defined(ARM_MATH_MVEI)
+    return arm_avgpool_s16_get_buffer_size_mve(output_x, ch_src);
+#elif defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+    return arm_avgpool_s16_get_buffer_size_dsp(output_x, ch_src);
+#else
+    (void)output_x;
+    (void)ch_src;
+    return 0;
+#endif
+}
+
+int32_t arm_avgpool_s16_get_buffer_size_dsp(const int output_x, const int ch_src)
+{
+    (void)output_x;
+    return (ch_src * sizeof(int32_t));
+}
+
+int32_t arm_avgpool_s16_get_buffer_size_mve(const int output_x, const int ch_src)
+{
+    (void)output_x;
+    (void)ch_src;
+
+    return 0;
+}
+
+/**
+ * @} end of GetBufferSizePooling group
+ */
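These new dispatcher files move the buffer-size query out of the pooling kernels, so a caller can size the scratch buffer without knowing whether the MVE, DSP, or pure-C path was compiled in (the s8 variant that follows has the same shape). A hypothetical caller sketch, not part of the library, using only the signatures shown in this diff:

```c
#include "third_party/cmsis_nn/Include/arm_nnfunctions.h"

/* Hypothetical usage sketch: query the scratch size first, then hand the
 * buffer to the kernel through cmsis_nn_context. Per the dispatcher above,
 * the query returns 0 on MVE and pure-C builds, ch_src * 4 bytes on DSP. */
arm_cmsis_nn_status run_avgpool_s16(const cmsis_nn_pool_params *pool_params,
                                    const cmsis_nn_dims *input_dims,
                                    const int16_t *input,
                                    const cmsis_nn_dims *filter_dims,
                                    const cmsis_nn_dims *output_dims,
                                    int16_t *output,
                                    void *scratch_buf /* sized by the caller */)
{
    cmsis_nn_context ctx;
    ctx.size = arm_avgpool_s16_get_buffer_size(output_dims->w, input_dims->c);
    ctx.buf = scratch_buf; /* unused when ctx.size is 0 */

    return arm_avgpool_s16(&ctx, pool_params, input_dims, input,
                           filter_dims, output_dims, output);
}
```

diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s8.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s8.c
new file mode 100644
index 00000000..62b75f32
--- /dev/null
+++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s8.c
@@ -0,0 +1,71 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_avgpool_get_buffer_sizes_s8.c
+ * Description: Collection of get buffer size functions for avgpool s8 layer function.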
+ * + * $Date: 25 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup Pooling + */ + +/** + * @addtogroup GetBufferSizePooling + * @{ + */ + +int32_t arm_avgpool_s8_get_buffer_size(const int output_x, const int ch_src) +{ +#if defined(ARM_MATH_MVEI) + return arm_avgpool_s8_get_buffer_size_mve(output_x, ch_src); +#elif defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return arm_avgpool_s8_get_buffer_size_dsp(output_x, ch_src); +#else + (void)output_x; + (void)ch_src; + return 0; +#endif +} + +int32_t arm_avgpool_s8_get_buffer_size_dsp(const int output_x, const int ch_src) +{ + (void)output_x; + return (ch_src * sizeof(int32_t)); +} + +int32_t arm_avgpool_s8_get_buffer_size_mve(const int output_x, const int ch_src) +{ + (void)output_x; + (void)ch_src; + + return 0; +} + +/** + * @} end of GetBufferSizePooling group + */ diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s16.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s16.c index 0178d15f..3c38a515 100644 --- a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s16.c +++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s16.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_avgpool_s16.c * Description: Pooling function implementations * - * $Date: 27 July 2022 - * $Revision: V.2.2.0 + * $Date: 30 January 2023 + * $Revision: V.2.4.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -33,8 +33,8 @@ #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) -static void scale_q31_to_q15_and_clamp(const q31_t *buffer, - q15_t *target, +static void scale_q31_to_q15_and_clamp(const int32_t *buffer, + int16_t *target, int32_t length, const int32_t count, const int act_min, @@ -49,7 +49,7 @@ static void scale_q31_to_q15_and_clamp(const q31_t *buffer, sum = MAX(sum, act_min); sum = MIN(sum, act_max); - target[i] = (q15_t)sum; + target[i] = (int16_t)sum; } } #endif @@ -73,10 +73,10 @@ static void scale_q31_to_q15_and_clamp(const q31_t *buffer, arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q15_t *src, + const int16_t *src, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q15_t *dst) + int16_t *dst) { const int32_t input_y = input_dims->h; const int32_t input_x = input_dims->w; @@ -180,7 +180,7 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, } #elif defined(ARM_MATH_DSP) - q31_t *buffer = (q31_t *)ctx->buf; + int32_t *buffer = (int32_t *)ctx->buf; if (buffer == NULL) { @@ -209,7 +209,7 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, { for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++) { - const q15_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); + const int16_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); if (count == 0) { @@ -222,7 +222,7 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, { for (int i = 0; i < ch_src; i++) { - buffer[i] = __QADD(start[i], buffer[i]); + 
buffer[i] = QADD(start[i], buffer[i]); } } count++; @@ -291,17 +291,6 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src) -{ - (void)output_x; -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - return (ch_src * (int32_t)sizeof(int32_t)); -#else - (void)ch_src; -#endif - return 0; -} - /** * @} end of Pooling group */ diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s8.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s8.c index 1a61f376..0001b0ee 100644 --- a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s8.c +++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_avgpool_s8.c * Description: Pooling function implementations * - * $Date: 7 July 2022 - * $Revision: V.3.0.2 + * $Date: 30 January 2023 + * $Revision: V.3.2.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -32,8 +32,8 @@ #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) -static void scale_q31_to_q7_and_clamp(const q31_t *buffer, - q7_t *target, +static void scale_q31_to_q7_and_clamp(const int32_t *buffer, + int8_t *target, int32_t length, const int32_t count, const int act_min, @@ -48,7 +48,7 @@ static void scale_q31_to_q7_and_clamp(const q31_t *buffer, sum = MAX(sum, act_min); sum = MIN(sum, act_max); - target[i] = (q7_t)sum; + target[i] = (int8_t)sum; } } #endif @@ -74,10 +74,10 @@ static void scale_q31_to_q7_and_clamp(const q31_t *buffer, arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q7_t *src, + const int8_t *src, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q7_t *dst) + int8_t *dst) { (void)ctx; const int32_t input_y = input_dims->h; @@ -220,10 +220,10 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q7_t *src, + const int8_t *src, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q7_t *dst) + int8_t *dst) { const int32_t input_y = input_dims->h; const int32_t input_x = input_dims->w; @@ -243,9 +243,9 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer = (q31_t *)ctx->buf; + int32_t *buffer = (int32_t *)ctx->buf; -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) /* Run the following code for CPU's with DSP extension */ @@ -269,7 +269,7 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, { for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++) { - const q7_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); + const int8_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); if (count == 0) { @@ -282,7 +282,7 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, { for (int i = 0; i < ch_src; i++) { - buffer[i] = __QADD(start[i], buffer[i]); + buffer[i] = QADD(start[i], 
buffer[i]); } } count++; @@ -299,7 +299,7 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, dst += ch_src; } } -#else + #else /* Reference C code adapted from CMSIS-NN arm_avepool_q7_HWC. */ @@ -340,23 +340,12 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, } } -#endif + #endif return ARM_CMSIS_NN_SUCCESS; } #endif /* ARM_MATH_MVEI */ -int32_t arm_avgpool_s8_get_buffer_size(const int output_x, const int ch_src) -{ - (void)output_x; - -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - return (ch_src * sizeof(int32_t)); -#else - (void)ch_src; - return 0; -#endif -} /** * @} end of Pooling group */ diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s16.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s16.c index 8a0a01df..a306d05c 100644 --- a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s16.c +++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s16.c @@ -21,8 +21,8 @@ * Title: arm_max_pool_s16.c * Description: Pooling function implementations * - * $Date: 16 August 2022 - * $Revision: V.2.1.1 + * $Date: 26 October 2022 + * $Revision: V.2.1.2 * * Target Processor: Cortex-M CPUs * @@ -47,15 +47,15 @@ static void compare_and_replace_if_larger(int16_t *base, const int16_t *target, length -= 8; } #else - q15_t *dst = base; - const q15_t *src = target; + int16_t *dst = base; + const int16_t *src = target; union arm_nnword ref_max; union arm_nnword comp_max; int32_t cnt = length >> 1; while (cnt > 0l) { - ref_max.word = arm_nn_read_q15x2(dst); + ref_max.word = arm_nn_read_s16x2(dst); comp_max.word = arm_nn_read_q15x2_ia(&src); if (comp_max.half_words[0] > ref_max.half_words[0]) @@ -105,7 +105,7 @@ static void clamp_output(int16_t *source, int32_t length, const int16_t act_min, while (cnt > 0l) { - in.word = arm_nn_read_q15x2(source); + in.word = arm_nn_read_s16x2(source); in.half_words[0] = MAX(in.half_words[0], act_min); in.half_words[0] = MIN(in.half_words[0], act_max); diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s8.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s8.c index 3fcf64da..2afb704a 100644 --- a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s8.c +++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s8.c @@ -21,8 +21,8 @@ * Title: arm_max_pool_s8.c * Description: Pooling function implementations * - * $Date: 16 August 2022 - * $Revision: V.3.0.1 + * $Date: 26 October 2022 + * $Revision: V.3.0.2 * * Target Processor: Cortex-M CPUs * @@ -31,7 +31,7 @@ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" -static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int32_t length) +static void compare_and_replace_if_larger_q7(int8_t *base, const int8_t *target, int32_t length) { #if defined(ARM_MATH_MVEI) int32_t loop_count = (length + 15) / 16; @@ -47,16 +47,16 @@ static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int length -= 16; } #else - q7_t *dst = base; - const q7_t *src = target; + int8_t *dst = base; + const int8_t *src = target; union arm_nnword ref_max; union arm_nnword comp_max; int32_t cnt = length >> 2; while (cnt > 0l) { - ref_max.word = arm_nn_read_q7x4(dst); - comp_max.word = arm_nn_read_q7x4_ia(&src); + ref_max.word = arm_nn_read_s8x4(dst); + comp_max.word = arm_nn_read_s8x4_ia(&src); if (comp_max.bytes[0] > ref_max.bytes[0]) { @@ -75,7 +75,7 @@ static void 
compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int ref_max.bytes[3] = comp_max.bytes[3]; } - arm_nn_write_q7x4_ia(&dst, ref_max.word); + arm_nn_write_s8x4_ia(&dst, ref_max.word); cnt--; } @@ -94,7 +94,7 @@ static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int #endif } -static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, const int32_t act_max) +static void clamp_output(int8_t *source, int32_t length, const int32_t act_min, const int32_t act_max) { #if defined(ARM_MATH_MVEI) int32_t loop_count = (length + 15) / 16; @@ -117,7 +117,7 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co while (cnt > 0l) { - in.word = arm_nn_read_q7x4(source); + in.word = arm_nn_read_s8x4(source); in.bytes[0] = MAX(in.bytes[0], act_min); in.bytes[0] = MIN(in.bytes[0], act_max); @@ -128,7 +128,7 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co in.bytes[3] = MAX(in.bytes[3], act_min); in.bytes[3] = MIN(in.bytes[3], act_max); - arm_nn_write_q7x4_ia(&source, in.word); + arm_nn_write_s8x4_ia(&source, in.word); cnt--; } @@ -163,10 +163,10 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q7_t *src, + const int8_t *src, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q7_t *dst) + int8_t *dst) { const int32_t input_y = input_dims->h; const int32_t input_x = input_dims->w; @@ -182,7 +182,7 @@ arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx, const int32_t act_max = pool_params->activation.max; const int32_t channel_in = input_dims->c; (void)ctx; - q7_t *dst_base = dst; + int8_t *dst_base = dst; for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++) { @@ -202,11 +202,11 @@ arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx, { for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++) { - const q7_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x); + const int8_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x); if (count == 0) { - arm_memcpy_q7(dst, start, channel_in); + arm_memcpy_s8(dst, start, channel_in); count++; } else diff --git a/src/third_party/cmsis_nn/Source/ReshapeFunctions/arm_reshape_s8.c b/src/third_party/cmsis_nn/Source/ReshapeFunctions/arm_reshape_s8.c index 11aaf349..8d4ff3eb 100644 --- a/src/third_party/cmsis_nn/Source/ReshapeFunctions/arm_reshape_s8.c +++ b/src/third_party/cmsis_nn/Source/ReshapeFunctions/arm_reshape_s8.c @@ -21,8 +21,8 @@ * Title: arm_reshape_s8.c * Description: Reshape a s8 vector * - * $Date: 4 Aug 2022 - * $Revision: V.1.0.1 + * $Date: 26 October 2022 + * $Revision: V.1.0.2 * * Target Processor: Cortex-M cores * @@ -49,7 +49,7 @@ void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size) { - arm_memcpy_q7(output, input, total_size); + arm_memcpy_s8(output, input, total_size); } /** diff --git a/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_s8.c b/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_s8.c index 9b3457c4..e689f13a 100644 --- a/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_s8.c +++ b/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_svdf_s8.c * Description: S8 basic SVDF layer function * - * $Date: 4 May 2022 - * $Revision: V.4.0.1 + * $Date: 5 January 2023 + * $Revision: V.5.1.0 * - * Target Processor: Cortex-M processors + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -53,26 +53,26 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, const cmsis_nn_per_tensor_quant_params *input_quant_params, const cmsis_nn_per_tensor_quant_params *output_quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *state_dims, - q7_t *state_data, + int8_t *state_data, const cmsis_nn_dims *weights_feature_dims, - const q7_t *weights_feature_data, + const int8_t *weights_feature_data, const cmsis_nn_dims *weights_time_dims, - const q7_t *weights_time_data, + const int8_t *weights_time_data, const cmsis_nn_dims *bias_dims, - const q31_t *bias_data, + const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { (void)bias_dims; (void)state_dims; (void)output_dims; - const q31_t multiplier_in = input_quant_params->multiplier; - const q31_t shift_in = input_quant_params->shift; - const q31_t multiplier_out = output_quant_params->multiplier; - const q31_t shift_2 = output_quant_params->shift; + const int32_t multiplier_in = input_quant_params->multiplier; + const int32_t shift_in = input_quant_params->shift; + const int32_t multiplier_out = output_quant_params->multiplier; + const int32_t shift_2 = output_quant_params->shift; const int32_t zp_in = svdf_params->input_offset; const int32_t zp_out = svdf_params->output_offset; const int32_t in_activation_min = svdf_params->input_activation.min; @@ -91,13 +91,13 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer_a = (q31_t *)input_ctx->buf; + int32_t *buffer_a = (int32_t *)input_ctx->buf; if (output_ctx->buf == NULL) { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer_b = (q31_t *)output_ctx->buf; + int32_t *buffer_b = (int32_t *)output_ctx->buf; // Left shift state memmove((int8_t *)state_data, @@ -107,9 +107,9 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, // Matrix multiplication input * feature weight for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q7_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); - const q7_t *weight = weights_feature_data; - const q7_t *input = input_data + i_batch * input_height; + int8_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); + const int8_t *weight = weights_feature_data; + const int8_t *input = input_data + i_batch * input_height; arm_cmsis_nn_status res = arm_nn_vec_mat_mult_t_s8(input, weight, @@ -117,7 +117,6 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, res_ptr, -zp_in, 0, - 0, multiplier_in, shift_in, input_height, @@ -134,7 +133,7 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, // Matrix multiplicate time weight * state tensors { - q31_t *ptr_a = buffer_a; + int32_t *ptr_a = buffer_a; const int8_t *v2 = state_data; for (int i_batch = 0; i_batch < input_batches; i_batch++) { @@ -152,11 +151,11 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { j += 4; - 
q31_t r1_1, r1_2, r2_1, r2_2; + int32_t r1_1, r1_2, r2_1, r2_2; v1 = read_and_pad_reordered(v1, &r1_1, &r1_2); v2 = read_and_pad_reordered(v2, &r2_1, &r2_2); - sum = __SMLAD(r1_1, r2_1, sum); - sum = __SMLAD(r1_2, r2_2, sum); + sum = SMLAD(r1_1, r2_1, sum); + sum = SMLAD(r1_2, r2_2, sum); } // Process the remaining data @@ -187,8 +186,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { for (int i = 0; i < input_batches; i++) { - q31_t *output_temp = buffer_b + i * feature_batches; - const q31_t *ptr_a = buffer_a + i * feature_batches; + int32_t *output_temp = buffer_b + i * feature_batches; + const int32_t *ptr_a = buffer_a + i * feature_batches; const int32_t *bi = bias_data; for (int j = 0; j < feature_batches; j++) @@ -201,8 +200,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q31_t *output_data_temp = buffer_b + i_batch * unit_count; - q31_t *ptr_a = buffer_a + i_batch * feature_batches; + int32_t *output_data_temp = buffer_b + i_batch * unit_count; + int32_t *ptr_a = buffer_a + i_batch * feature_batches; for (int i = 0; i < unit_count; i++) { @@ -221,8 +220,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q31_t *output_data_temp = buffer_b + i_batch * unit_count; - q31_t *ptr_a = buffer_a + i_batch * feature_batches; + int32_t *output_data_temp = buffer_b + i_batch * unit_count; + int32_t *ptr_a = buffer_a + i_batch * feature_batches; for (int i = 0; i < unit_count; i++) { @@ -258,7 +257,7 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, #else for (int i = 0; i < input_batches * unit_count; i++) { - output_data[i] = (q7_t)CLAMP( + output_data[i] = (int8_t)CLAMP( arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min); } #endif diff --git a/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_state_s16_s8.c b/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_state_s16_s8.c index fc5d19e6..73b8fa13 100644 --- a/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_state_s16_s8.c +++ b/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_state_s16_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_svdf_s8.c * Description: S8 basic SVDF layer function with s16 state tensor * - * $Date: 4 May 2022 - * $Revision: V.2.0.1 + * $Date: 5 January 2023 + * $Revision: V.3.1.0 * - * Target Processor: Cortex-M processors + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -53,26 +53,26 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, const cmsis_nn_per_tensor_quant_params *input_quant_params, const cmsis_nn_per_tensor_quant_params *output_quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *state_dims, - q15_t *state_data, + int16_t *state_data, const cmsis_nn_dims *weights_feature_dims, - const q7_t *weights_feature_data, + const int8_t *weights_feature_data, const cmsis_nn_dims *weights_time_dims, - const q15_t *weights_time_data, + const int16_t *weights_time_data, const cmsis_nn_dims *bias_dims, - const q31_t *bias_data, + const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { (void)bias_dims; (void)state_dims; (void)output_dims; - const q31_t multiplier_in = input_quant_params->multiplier; - const q31_t shift_in = input_quant_params->shift; - const q31_t multiplier_out = output_quant_params->multiplier; - const q31_t shift_2 = output_quant_params->shift; + const int32_t multiplier_in = input_quant_params->multiplier; + const int32_t shift_in = input_quant_params->shift; + const int32_t multiplier_out = output_quant_params->multiplier; + const int32_t shift_2 = output_quant_params->shift; const int32_t zp_in = svdf_params->input_offset; const int32_t zp_out = svdf_params->output_offset; const int32_t in_activation_min = svdf_params->input_activation.min; @@ -91,31 +91,30 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer_a = (q31_t *)input_ctx->buf; + int32_t *buffer_a = (int32_t *)input_ctx->buf; if (output_ctx->buf == NULL) { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer_b = (q31_t *)output_ctx->buf; + int32_t *buffer_b = (int32_t *)output_ctx->buf; // Left shift state - memmove((q15_t *)state_data, - (q15_t *)state_data + 1, + memmove((int16_t *)state_data, + (int16_t *)state_data + 1, (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int16_t))); // Matrix multiplication input * feature weight for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); - const q7_t *weight = weights_feature_data; - const q7_t *input = input_data + i_batch * input_height; + int16_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); + const int8_t *weight = weights_feature_data; + const int8_t *input = input_data + i_batch * input_height; arm_cmsis_nn_status res = arm_nn_vec_mat_mult_t_svdf_s8(input, weight, res_ptr, -zp_in, - 0, time_batches, multiplier_in, shift_in, @@ -132,11 +131,11 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { // Matrix multiplication time weight * state tensors - q31_t *ptr_a = buffer_a; - const q15_t *v2 = state_data; + int32_t *ptr_a = buffer_a; + const int16_t *v2 = state_data; for (int i_batch = 0; i_batch < input_batches; 
i_batch++) { - const q15_t *v1 = weights_time_data; + const int16_t *v1 = weights_time_data; for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++) { @@ -149,10 +148,10 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, for (int i = 0; i < block_count; i++) { j += 2; - q31_t r1 = arm_nn_read_q15x2_ia(&v1); - q31_t r2 = arm_nn_read_q15x2_ia(&v2); + int32_t r1 = arm_nn_read_q15x2_ia(&v1); + int32_t r2 = arm_nn_read_q15x2_ia(&v2); - sum = __SMLAD(r1, r2, sum); + sum = SMLAD(r1, r2, sum); } // Process the remaining data @@ -183,8 +182,8 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { for (int i = 0; i < input_batches; i++) { - q31_t *output_temp = buffer_b + i * feature_batches; - const q31_t *ptr_a = buffer_a + i * feature_batches; + int32_t *output_temp = buffer_b + i * feature_batches; + const int32_t *ptr_a = buffer_a + i * feature_batches; const int32_t *bi = bias_data; for (int j = 0; j < feature_batches; j++) @@ -197,8 +196,8 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q31_t *output_data_temp = buffer_b + i_batch * unit_count; - q31_t *ptr_a = buffer_a + i_batch * feature_batches; + int32_t *output_data_temp = buffer_b + i_batch * unit_count; + int32_t *ptr_a = buffer_a + i_batch * feature_batches; for (int i = 0; i < unit_count; i++) { @@ -217,8 +216,8 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q31_t *output_data_temp = buffer_b + i_batch * unit_count; - q31_t *ptr_a = buffer_a + i_batch * feature_batches; + int32_t *output_data_temp = buffer_b + i_batch * unit_count; + int32_t *ptr_a = buffer_a + i_batch * feature_batches; for (int i = 0; i < unit_count; i++) { @@ -254,7 +253,7 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, #else for (int i = 0; i < input_batches * unit_count; i++) { - output_data[i] = (q7_t)CLAMP( + output_data[i] = (int8_t)CLAMP( arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min); } #endif diff --git a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c index 22b9a1d3..6d73402d 100644 --- a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c +++ b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_softmax_common_s8.c * Description: Softmax with s8 input and output of s8 or s16. * - * $Date: 17 March 2022 - * $Revision: V.1.0.1 + * $Date: 5 January 2023 + * $Revision: V.1.1.0 * - * Target Processor: Cortex-M processors + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" @@ -89,7 +89,7 @@ void arm_nn_softmax_common_s8(const int8_t *input, } } - const int32_t headroom = __CLZ(sum); + const int32_t headroom = CLZ(sum); const int32_t shifted_scale = ONE_OVER1((sum > 0 ? 
sum << headroom : 0) - (1 << 31)); int32_t bits_over_unit; diff --git a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s16.c b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s16.c index 31e27e81..a132e96c 100644 --- a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s16.c +++ b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_softmax_s16.c * Description: S16 softmax function * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 5 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -84,7 +84,7 @@ arm_cmsis_nn_status arm_softmax_s16(const int16_t *input, sum += cached_exp_results[col]; } - const int32_t headroom = __CLZ(sum); + const int32_t headroom = CLZ(sum); // Compute the reciprocal 1/sum const int32_t shifted_sum = (((sum) << (headroom - 1)) + (1 << 13)) >> 14; diff --git a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s8.c b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s8.c index 671bb893..d49e0dc0 100644 --- a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s8.c +++ b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_softmax_s8.c * Description: S8 softmax function * - * $Date: 9 March 2022 - * $Revision: V.2.1.0 + * $Date: 5 January 2023 + * $Revision: V.2.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -33,10 +33,10 @@ #define ACCUM_BITS 12 -#ifdef ARM_MATH_MVEI +#if defined(ARM_MATH_MVEI) && !defined(ARM_GCC_12_2_ICE) static int32x4_t arm_exp_on_negative_values_mve_32x4(int32x4_t val) { -#define SHIFT_START (24) + #define SHIFT_START (24) int32_t shift = SHIFT_START; int32x4_t mask; @@ -49,12 +49,12 @@ static int32x4_t arm_exp_on_negative_values_mve_32x4(int32x4_t val) const int32x4_t op_2 = x + DIV_POW2_MVE(MUL_SAT_MVE(op_1, vdupq_n_s32(715827883)) + x2, 1); int32x4_t result = vdupq_n_s32(1895147668) + MUL_SAT_MVE(vdupq_n_s32(1895147668), op_2); -#define SELECT_IF_NON_ZERO(x) \ - { \ - mve_pred16_t p = vcmpneq_n_s32(remainder & vdupq_n_s32(1 << shift++), 0); \ - mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); \ - result = SELECT_USING_MASK(mask, MUL_SAT_MVE(result, vdupq_n_s32(x)), result); \ - } + #define SELECT_IF_NON_ZERO(x) \ + { \ + mve_pred16_t p = vcmpneq_n_s32(remainder & vdupq_n_s32(1 << shift++), 0); \ + mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); \ + result = SELECT_USING_MASK(mask, MUL_SAT_MVE(result, vdupq_n_s32(x)), result); \ + } SELECT_IF_NON_ZERO(1672461947) SELECT_IF_NON_ZERO(1302514674) @@ -64,7 +64,7 @@ static int32x4_t arm_exp_on_negative_values_mve_32x4(int32x4_t val) SELECT_IF_NON_ZERO(720401) SELECT_IF_NON_ZERO(242) -#undef SELECT_IF_NON_ZERO + #undef SELECT_IF_NON_ZERO mve_pred16_t p = vcmpeqq_n_s32(val, 0); mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); @@ -91,10 +91,10 @@ 
void arm_softmax_s8(const int8_t *input, const int32_t diff_min, int8_t *output) { -#ifdef ARM_MATH_MVEI +#if defined(ARM_MATH_MVEI) && !defined(ARM_GCC_12_2_ICE) -#define ACT_MIN ((int8_t)NN_Q7_MIN) -#define ACT_MAX ((int8_t)NN_Q7_MAX) + #define ACT_MIN ((int8_t)NN_Q7_MIN) + #define ACT_MAX ((int8_t)NN_Q7_MAX) const int32_t mask = (1 << shift); @@ -147,7 +147,7 @@ void arm_softmax_s8(const int8_t *input, } } - const int32_t headroom = __CLZ((uint32_t)sum); + const int32_t headroom = CLZ((uint32_t)sum); const int32_t bits_over_unit = ACCUM_BITS - headroom + 23; const int32_t shifted_scale = ONE_OVER1((sum > 0 ? sum << headroom : 0) - (1 << 31)); diff --git a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_u8.c b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_u8.c index 4a88930f..a7c1bff8 100644 --- a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_u8.c +++ b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_u8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2020, 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_softmax_u8.c * Description: U8 softmax function * - * $Date: 09. October 2020 - * $Revision: V.1.0.2 + * $Date: 5 January 2023 + * $Revision: V.1.1.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -76,7 +76,7 @@ void arm_softmax_u8(const uint8_t *input, } } - const int32_t headroom = __CLZ((uint32_t)sum); + const int32_t headroom = CLZ((uint32_t)sum); const int32_t bits_over_unit = ACCUM_BITS - headroom + 23; const int32_t shifted_scale = ONE_OVER1((sum << headroom) - (1 << 31));
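Throughout the softmax kernels this sync swaps the raw __CLZ intrinsic for the CLZ wrapper; the leading-zero count ("headroom") of the accumulated exponent sum drives the fixed-point normalization. A plain-C illustration of where headroom and bits_over_unit come from, using a portable clz stand-in; the reciprocal itself is computed by the library's ONE_OVER1() macro and is not reproduced here:

```c
#include <stdint.h>

#define ACCUM_BITS 12 /* same accumulator format as the kernels above */

/* Portable stand-in for the CLZ macro used in the hunks above. */
static int32_t clz_u32(uint32_t x)
{
    int32_t n = 0;
    if (x == 0)
    {
        return 32;
    }
    while ((x & 0x80000000u) == 0)
    {
        x <<= 1;
        n++;
    }
    return n;
}

/* Mirrors the normalization step in arm_softmax_s8/arm_softmax_u8:
 * headroom rescales the exp() sum so its reciprocal can be taken at full
 * precision, and bits_over_unit = ACCUM_BITS - headroom + 23 is the shift
 * that maps each scaled probability back to the 8-bit output range. */
static void softmax_norm_terms(int32_t sum, int32_t *headroom, int32_t *bits_over_unit)
{
    *headroom = clz_u32((uint32_t)sum);
    *bits_over_unit = ACCUM_BITS - *headroom + 23;
}
```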