diff --git a/.github/mergify.yml b/.github/mergify.yml
deleted file mode 100644
index 9e715eb3..00000000
--- a/.github/mergify.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-queue_rules:
-  - name: default
-    conditions:
-      - label=ci:mergify
-
-
-pull_request_rules:
-  - name: push to default merge queue
-    conditions:
-      - base=main
-      - label=ci:mergify
-      - check-success=cla/google
-    actions:
-      queue:
-        name: default
-        require_branch_protection: true
-        method: squash
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
deleted file mode 100644
index d5f5ebc7..00000000
--- a/.github/workflows/ci.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-# YAML schema for GitHub Actions:
-# https://help.github.com/en/actions/automating-your-workflow-with-github-actions/workflow-syntax-for-github-actions
-#
-# Helpful YAML parser to clarify YAML syntax:
-# https://yaml-online-parser.appspot.com/
-#
-#
-# This file contains the workflows that are run prior to merging a pull request.
-
-name: CI
-
-on:
-  pull_request:
-    types: [labeled]
-    branches:
-      - main
-
-#  schedule:
-    # 10am UTC is 3am or 4am PT depending on daylight savings.
-#    - cron: '0 10 * * *'
-
-  # Allow manually triggering of the workflow.
-  workflow_dispatch: {}
-
-jobs:
-  arduino:
-    runs-on: ubuntu-latest
-
-    if: |
-      github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ci:run')) ||
-      (github.event_name == 'schedule' && github.repository == 'tensorflow/tflite-micro-arduino-examples')
-
-    name: Arduino CLI Build
-    steps:
-      - uses: actions/checkout@v2
-      - name: Test
-        run: |
-          ./scripts/install_arduino_cli.sh
-          ./scripts/test_arduino_library.sh "${PWD}"
diff --git a/.github/workflows/remove-labels.yml b/.github/workflows/remove-labels.yml
deleted file mode 100644
index 7e7b6849..00000000
--- a/.github/workflows/remove-labels.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: Remove Labels
-
-on:
-  pull_request_target:
-    types: [labeled]
-
-jobs:
-  label_cleanup:
-    runs-on: ubuntu-latest
-
-    name: remove CI runner labels
-    steps:
-      - name: remove tags
-        uses: actions/github-script@a3e7071a34d7e1f219a8a4de9a5e0a34d1ee1293
-        with:
-          github-token: ${{secrets.GITHUB_TOKEN}}
-          script: |
-            github.issues.removeLabel({
-              issue_number: context.issue.number,
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              name: 'ci:run'
-            })
-            github.issues.removeLabel({
-              issue_number: context.issue.number,
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              name: 'ci:test'
-            })
-        # Prevent erroring out if label doesn't exist
-        continue-on-error: true
diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml
index 3f3fce13..e44ebff5 100644
--- a/.github/workflows/sync.yml
+++ b/.github/workflows/sync.yml
@@ -5,7 +5,7 @@
 # https://yaml-online-parser.appspot.com/
 #
 
-name: (Arduino) Sync from tflite-micro
+name: (RTduino) Sync from tflite-micro
 
 on:
 #  schedule:
@@ -21,7 +21,7 @@ jobs:
     if: |
       github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'tensorflow/tflite-micro-arduino-examples')
+      (github.event_name == 'schedule' && github.repository == 'RTduino-libraries/TensorFlow-Lite-Micro')
 
     steps:
       - uses: actions/setup-python@v2
@@ -31,7 +31,7 @@ jobs:
       - name: Sync the code
         run: |
-          pip3 install six Pillow Wave
+          pip3 install six Pillow Wave numpy
           ./scripts/sync_from_tflite_micro.sh
           git config --local user.name "TFLM-bot"
          git config --local user.email "tflm-github-bot@google.com"
@@ -56,4 +56,4 @@ jobs:
           author: TFLM-bot
"(Arduino) Automated sync from github.com/tensorflow/tflite-micro" labels: bot:sync-tf, ci:run - reviewers: advaitjain + reviewers: mysterywolf diff --git a/scripts/sync_from_tflite_micro.sh b/scripts/sync_from_tflite_micro.sh index c696c9d8..df77d91c 100755 --- a/scripts/sync_from_tflite_micro.sh +++ b/scripts/sync_from_tflite_micro.sh @@ -27,7 +27,7 @@ TEMP_DIR=$(mktemp -d) cd "${TEMP_DIR}" echo Cloning tflite-micro repo to "${TEMP_DIR}" -git clone --depth 1 --single-branch "https://github.com/tensorflow/tflite-micro.git" +git clone --depth 1 --single-branch "https://github.com/RTduino-libraries/tflite-micro.git" -b sync-baseline cd tflite-micro make -f tensorflow/lite/micro/tools/make/Makefile clean_downloads diff --git a/src/tensorflow/lite/builtin_ops.h b/src/tensorflow/lite/builtin_ops.h index 33707308..f9871add 100644 --- a/src/tensorflow/lite/builtin_ops.h +++ b/src/tensorflow/lite/builtin_ops.h @@ -186,6 +186,9 @@ typedef enum { kTfLiteBuiltinAtan2 = 156, kTfLiteBuiltinUnsortedSegmentMin = 157, kTfLiteBuiltinSign = 158, + kTfLiteBuiltinBitcast = 159, + kTfLiteBuiltinBitwiseXor = 160, + kTfLiteBuiltinRightShift = 161, } TfLiteBuiltinOperator; #ifdef __cplusplus diff --git a/src/tensorflow/lite/c/common.h b/src/tensorflow/lite/c/common.h index 0e485812..e3e8001c 100644 --- a/src/tensorflow/lite/c/common.h +++ b/src/tensorflow/lite/c/common.h @@ -38,10 +38,4 @@ limitations under the License. #include "tensorflow/lite/core/c/common.h" -// TfLiteOpaqueDelegate: allows delegation of nodes to alternative backends. -// TfLiteOpaqueDelegate is an abstract type that is intended to have the same -// role as TfLiteDelegate, but without necessarily exposing the implementation -// details of how delegates are implemented. -typedef TfLiteDelegate TfLiteOpaqueDelegate; - #endif // TENSORFLOW_LITE_C_COMMON_H_ diff --git a/src/tensorflow/lite/core/api/flatbuffer_conversions.cpp b/src/tensorflow/lite/core/api/flatbuffer_conversions.cpp index 117bc75e..68b94d95 100644 --- a/src/tensorflow/lite/core/api/flatbuffer_conversions.cpp +++ b/src/tensorflow/lite/core/api/flatbuffer_conversions.cpp @@ -256,6 +256,10 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseElu(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_EMBEDDING_LOOKUP: { + return ParseEmbeddingLookup(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_EXP: { return ParseExp(op, error_reporter, allocator, builtin_data); } @@ -542,6 +546,14 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseZerosLike(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_BITWISE_XOR: { + return ParseBitwiseXor(op, error_reporter, allocator, builtin_data); + } + + case BuiltinOperator_RIGHT_SHIFT: { + return ParseRightShift(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_CAST: { return ParseCast(op, error_reporter, allocator, builtin_data); } @@ -845,6 +857,7 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } + // Below are the ops with no builtin_data structure. // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are // ok for now, since there is no call implementation either. 
@@ -855,7 +868,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_CUSTOM:
     case BuiltinOperator_DENSIFY:
     case BuiltinOperator_DYNAMIC_UPDATE_SLICE:
-    case BuiltinOperator_EMBEDDING_LOOKUP:
     case BuiltinOperator_EQUAL:
     case BuiltinOperator_HASHTABLE_FIND:
     case BuiltinOperator_HASHTABLE_IMPORT:
@@ -885,6 +897,7 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_UNSORTED_SEGMENT_SUM:
     case BuiltinOperator_ATAN2:
     case BuiltinOperator_SIGN:
+    case BuiltinOperator_BITCAST:
     case BuiltinOperator_WHERE:
       return kTfLiteOk;
     case BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES:
@@ -1335,6 +1348,14 @@ TfLiteStatus ParseElu(const Operator*, ErrorReporter*, BuiltinDataAllocator*,
   return kTfLiteOk;
 }
 
+// We have this parse function instead of directly returning kTfLiteOk from the
+// switch-case in ParseOpData because this function is used as part of the
+// selective registration for the OpResolver implementation in micro.
+TfLiteStatus ParseEmbeddingLookup(const Operator*, ErrorReporter*,
+                                  BuiltinDataAllocator*, void**) {
+  return kTfLiteOk;
+}
+
 // We have this parse function instead of directly returning kTfLiteOk from the
 // switch-case in ParseOpData because this function is used as part of the
 // selective registration for the OpResolver implementation in micro.
@@ -2441,6 +2462,22 @@ TfLiteStatus ParseZerosLike(const Operator*, ErrorReporter*,
   return kTfLiteOk;
 }
 
+// We have this parse function instead of directly returning kTfLiteOk from the
+// switch-case in ParseOpData because this function is used as part of the
+// selective registration for the OpResolver implementation in micro.
+TfLiteStatus ParseBitwiseXor(const Operator*, ErrorReporter*,
+                             BuiltinDataAllocator*, void**) {
+  return kTfLiteOk;
+}
+
+// We have this parse function instead of directly returning kTfLiteOk from the
+// switch-case in ParseOpData because this function is used as part of the
+// selective registration for the OpResolver implementation in micro.
+TfLiteStatus ParseRightShift(const Operator*, ErrorReporter*,
+                             BuiltinDataAllocator*, void**) {
+  return kTfLiteOk;
+}
+
 TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
                          ErrorReporter* error_reporter,
                          BuiltinDataAllocator* allocator, void** builtin_data) {
diff --git a/src/tensorflow/lite/core/api/flatbuffer_conversions.h b/src/tensorflow/lite/core/api/flatbuffer_conversions.h
index 4df83d5e..9ffe3971 100644
--- a/src/tensorflow/lite/core/api/flatbuffer_conversions.h
+++ b/src/tensorflow/lite/core/api/flatbuffer_conversions.h
@@ -151,6 +151,11 @@ TfLiteStatus ParseDiv(const Operator* op, ErrorReporter* error_reporter,
 TfLiteStatus ParseElu(const Operator* op, ErrorReporter* error_reporter,
                       BuiltinDataAllocator* allocator, void** builtin_data);
 
+TfLiteStatus ParseEmbeddingLookup(const Operator* op,
+                                  ErrorReporter* error_reporter,
+                                  BuiltinDataAllocator* allocator,
+                                  void** builtin_data);
+
 TfLiteStatus ParseEqual(const Operator* op, ErrorReporter* error_reporter,
                         BuiltinDataAllocator* allocator, void** builtin_data);
 
@@ -407,6 +412,14 @@ TfLiteStatus ParseZerosLike(const Operator* op, ErrorReporter* error_reporter,
                             BuiltinDataAllocator* allocator,
                             void** builtin_data);
 
+TfLiteStatus ParseBitwiseXor(const Operator* op, ErrorReporter* error_reporter,
+                             BuiltinDataAllocator* allocator,
+                             void** builtin_data);
+
+TfLiteStatus ParseRightShift(const Operator* op, ErrorReporter* error_reporter,
+                             BuiltinDataAllocator* allocator,
+                             void** builtin_data);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
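Editor's note on the per-op parse functions above: each builtin gets its own `ParseXxx` symbol, even when there is nothing to parse, so that TFLM's selective registration can reference exactly the parsers a model needs and the linker can discard the rest. A minimal toy sketch of that pattern (the `Registration` table and simplified signatures here are invented for illustration and are not the real TfLite types):

```cpp
#include <cstdio>

// Stand-ins for the real per-op parsers (tflite::ParseBitwiseXor etc.), which
// take flatbuffer, error-reporter, and allocator arguments; trivial here.
static int ParseBitwiseXor(void** builtin_data) { *builtin_data = nullptr; return 0; }
static int ParseRightShift(void** builtin_data) { *builtin_data = nullptr; return 0; }

struct Registration {
  const char* name;
  int (*parse)(void**);  // one distinct symbol per op
};

// Only parsers named in a table like this get linked into the final binary;
// ops handled inline inside one big switch could not be stripped individually.
static const Registration kOps[] = {
    {"BITWISE_XOR", ParseBitwiseXor},
    {"RIGHT_SHIFT", ParseRightShift},
};

int main() {
  for (const Registration& op : kOps) {
    void* builtin_data = nullptr;
    if (op.parse(&builtin_data) == 0) std::printf("registered %s\n", op.name);
  }
  return 0;
}
```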
diff --git a/src/tensorflow/lite/core/c/c_api_types.h b/src/tensorflow/lite/core/c/c_api_types.h
index 3aab43f4..670ec1ee 100644
--- a/src/tensorflow/lite/core/c/c_api_types.h
+++ b/src/tensorflow/lite/core/c/c_api_types.h
@@ -21,6 +21,7 @@ limitations under the License.
 /// "third_party/tensorflow/lite/c/c_api_types.h".
 /// Only the TensorFlow Lite implementation itself should include this
 /// file directly.
+// IWYU pragma: private, include "third_party/tensorflow/lite/c/c_api_types.h"
 
 #ifndef TENSORFLOW_LITE_CORE_C_C_API_TYPES_H_
 #define TENSORFLOW_LITE_CORE_C_C_API_TYPES_H_
diff --git a/src/tensorflow/lite/core/c/common.cpp b/src/tensorflow/lite/core/c/common.cpp
index 00bbcde2..a25abcfb 100644
--- a/src/tensorflow/lite/core/c/common.cpp
+++ b/src/tensorflow/lite/core/c/common.cpp
@@ -98,11 +98,22 @@ TfLiteFloatArray* TfLiteFloatArrayCreate(int size) {
   return ret;
 }
 
+TfLiteFloatArray* TfLiteFloatArrayCopy(const TfLiteFloatArray* src) {
+  if (!src) return nullptr;
+  TfLiteFloatArray* ret = TfLiteFloatArrayCreate(src->size);
+  if (ret) {
+    memcpy(ret->data, src->data, src->size * sizeof(float));
+  }
+  return ret;
+}
+
 void TfLiteFloatArrayFree(TfLiteFloatArray* a) { free(a); }
 
 void TfLiteTensorDataFree(TfLiteTensor* t) {
-  if (t->allocation_type == kTfLiteDynamic ||
-      t->allocation_type == kTfLitePersistentRo) {
+  if (t->allocation_type == kTfLiteVariantObject) {
+    delete reinterpret_cast<VariantData*>(t->data.data);
+  } else if (t->allocation_type == kTfLiteDynamic ||
+             t->allocation_type == kTfLitePersistentRo) {
     if (t->data.raw) {
 #ifdef TF_LITE_TENSORFLOW_PROFILER
       tflite::PauseHeapMonitoring(/*pause=*/true);
@@ -207,11 +218,16 @@ TfLiteStatus TfLiteTensorCopy(const TfLiteTensor* src, TfLiteTensor* dst) {
   if (!src || !dst) return kTfLiteOk;
   if (src->bytes != dst->bytes) return kTfLiteError;
   if (src == dst) return kTfLiteOk;
 
-  dst->type = src->type;
   if (dst->dims) TfLiteIntArrayFree(dst->dims);
   dst->dims = TfLiteIntArrayCopy(src->dims);
-  memcpy(dst->data.raw, src->data.raw, src->bytes);
+  if (src->allocation_type == kTfLiteVariantObject) {
+    if (dst->allocation_type != kTfLiteVariantObject) return kTfLiteError;
+    dst->data.data =
+        reinterpret_cast<VariantData*>(src->data.data)->Clone(dst->data.raw);
+  } else {
+    memcpy(dst->data.raw, src->data.raw, src->bytes);
+  }
   dst->buffer_handle = src->buffer_handle;
   dst->data_is_stale = src->data_is_stale;
   dst->delegate = src->delegate;
diff --git a/src/tensorflow/lite/core/c/common.h b/src/tensorflow/lite/core/c/common.h
index 8ca987d2..9bf98971 100644
--- a/src/tensorflow/lite/core/c/common.h
+++ b/src/tensorflow/lite/core/c/common.h
@@ -38,6 +38,7 @@ limitations under the License.
 /// "third_party/tensorflow/lite/c/common.h".
 /// Only the TensorFlow Lite implementation itself should include this
 /// file directly.
+// IWYU pragma: private, include "third_party/tensorflow/lite/c/common.h"
 
 #ifndef TENSORFLOW_LITE_CORE_C_COMMON_H_
 #define TENSORFLOW_LITE_CORE_C_COMMON_H_
@@ -157,6 +158,10 @@ int TfLiteFloatArrayGetSizeInBytes(int size);
 // This returns a pointer, that you must free using TfLiteFloatArrayFree().
 TfLiteFloatArray* TfLiteFloatArrayCreate(int size);
 
+// Create a copy of an array passed as `src`.
+// You are expected to free memory with TfLiteFloatArrayFree.
+TfLiteFloatArray* TfLiteFloatArrayCopy(const TfLiteFloatArray* src);
+
 // Free memory of array `a`.
 void TfLiteFloatArrayFree(TfLiteFloatArray* a);
 #endif  // TF_LITE_STATIC_MEMORY
@@ -345,6 +350,8 @@ typedef union TfLitePtrUnion {
 //        as constant inputs for downstream ops (also in prepare).
 //  * kTfLiteCustom: Custom memory allocation provided by the user. See
 //    TfLiteCustomAllocation below.
+//  * kTfLiteVariantObject: Allocation is an arbitrary type-erased C++ object.
+//    Allocation and deallocation are done through `new` and `delete`.
 typedef enum TfLiteAllocationType {
   kTfLiteMemNone = 0,
   kTfLiteMmapRo,
@@ -353,6 +360,7 @@
   kTfLiteDynamic,
   kTfLitePersistentRo,
   kTfLiteCustom,
+  kTfLiteVariantObject,
 } TfLiteAllocationType;
 
 // The delegates should use zero or positive integers to represent handles.
@@ -959,12 +967,53 @@ typedef struct TfLiteRegistration {
   // ops. We keep it inside of `TfLiteRegistration` and use it to route
   // callbacks properly.
   TfLiteRegistrationExternal* registration_external;
+
+  // Retrieves asynchronous kernel.
+  //
+  // If the `async_kernel` field is nullptr, it means the operation described
+  // by this TfLiteRegistration object does not support asynchronous execution.
+  // Otherwise, the function that the field points to should only be called for
+  // delegate kernel nodes, i.e. `node` should be a delegate kernel node
+  // created by applying a delegate.
+  // If the function returns nullptr, that means that the underlying delegate
+  // does not support asynchronous execution for this `node`.
+  struct TfLiteAsyncKernel* (*async_kernel)(TfLiteContext* context,
+                                            TfLiteNode* node);
 } TfLiteRegistration;
 
+/// \private
 // Old version of `TfLiteRegistration` to maintain binary backward
 // compatibility.
-// WARNING: This structure is deprecated / not an official part of the API.
-// It should be only used for binary backward compatibility.
+// The legacy registration type must be a POD struct type whose field types
+// must be a prefix of the field types in TfLiteRegistration, and the offset
+// of the first field in TfLiteRegistration that is not present in the legacy
+// registration type must be greater than or equal to the size of the legacy
+// registration type.
+// WARNING: This structure is deprecated / not an official part of the
+// API. It should be only used for binary backward compatibility.
+typedef struct TfLiteRegistration_V2 {
+  void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
+  void (*free)(TfLiteContext* context, void* buffer);
+  TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node);
+  TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node);
+  const char* (*profiling_string)(const TfLiteContext* context,
+                                  const TfLiteNode* node);
+  int32_t builtin_code;
+  const char* custom_name;
+  int version;
+  TfLiteRegistrationExternal* registration_external;
+} TfLiteRegistration_V2;
+
+/// \private
+// Old version of `TfLiteRegistration` to maintain binary backward
+// compatibility.
+// The legacy registration type must be a POD struct type whose field types
+// must be a prefix of the field types in TfLiteRegistration, and the offset
+// of the first field in TfLiteRegistration that is not present in the legacy
+// registration type must be greater than or equal to the size of the legacy
+// registration type.
+// WARNING: This structure is deprecated / not an official part of the
+// API. It should be only used for binary backward compatibility.
 typedef struct TfLiteRegistration_V1 {
   void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
   void (*free)(TfLiteContext* context, void* buffer);
@@ -1155,5 +1204,74 @@ void* TfLiteOpaqueDelegateGetData(const TfLiteOpaqueDelegate* delegate);
 
 #ifdef __cplusplus
 }  // extern "C"
+
+#include <utility>
+
+// `kTfLiteVariant` type tensors encode arbitrary C++ objects behind their
+// `data.data : void*` member. This is the type-erased interface for
+// interacting with such objects at runtime.
+// Deleting or Cloning any `VariantData` will call the destructor and copy
+// constructor of the erased type automatically. For example usage, see
+// `common_test.cc`.
+class VariantData {
+ public:
+  // All variant objects must be able to be destroyed and copied.
+  virtual ~VariantData() = default;
+  // This allows for a "virtual copy-constructor" like pattern.
+  // In most cases, we will be copying from an input to an output tensor.
+  // Often, the output tensor is already allocated so we can pass
+  // a pointer to its buffer for reuse.
+  virtual VariantData* Clone(char* maybe_alloc) const = 0;
+};
+
+// An abstract base class for variant objects. The template parameter
+// is the type we are erasing.
+template <class ErasedDerived>
+class AbstractVariantData : public VariantData {
+ public:
+  VariantData* Clone(char* maybe_alloc) const override {
+    if (maybe_alloc) {
+      // We assume that the output tensor is already a variant of the same
+      // derived type. If the output is still allocated, then it still may
+      // have state that was not destroyed, so we must call the destructor
+      // before using the buffer.
+      // This may actually have a non-negligible effect on performance if the
+      // destructor is complex. In a future optimization we would want to
+      // introduce something like "move to" semantics, allowing for the
+      // underlying implementation to optimize for this case.
+      reinterpret_cast<VariantData*>(maybe_alloc)->~VariantData();
+      return new (maybe_alloc)
+          ErasedDerived(static_cast<const ErasedDerived&>(*this));
+    }
+    return new ErasedDerived(static_cast<const ErasedDerived&>(*this));
+  }
+
+ protected:
+  AbstractVariantData() = default;
+  AbstractVariantData(const AbstractVariantData&) = default;
+  AbstractVariantData(AbstractVariantData&&) = delete;
+};
+
+// Analogous to `TfLiteTensorRealloc` for allocation of tensors whose
+// data member points to an arbitrary C++ object. `VariantType` refers
+// to the erased type of said object and `VariantArgs` refers to
+// a list of argument types with which to construct a new `VariantType`.
+// `VariantArgs` must match a constructor in `VariantType`.
+template <class VariantType, class... VariantArgs>
+TfLiteStatus TfLiteTensorVariantRealloc(TfLiteTensor* t,
+                                        VariantArgs&&... args) {
+  if (t->type != kTfLiteVariant) return kTfLiteError;
+  if (t->data.raw) {
+    reinterpret_cast<VariantData*>(t->data.data)->~VariantData();
+    // For now we assume if `t` is already allocated then it was allocated
+    // with the same `VariantType` as templated.
+    t->data.data =
+        new (t->data.raw) VariantType(std::forward<VariantArgs>(args)...);
+  } else {
+    t->data.data = new VariantType(std::forward<VariantArgs>(args)...);
+  }
+  t->allocation_type = kTfLiteVariantObject;
+  return kTfLiteOk;
+}
+
 #endif  // __cplusplus
 #endif  // TENSORFLOW_LITE_CORE_C_COMMON_H_
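The variant machinery above is easiest to see with a concrete erased type. A minimal sketch: the `IntList` type and the usage comments are invented for illustration; only `VariantData`, `AbstractVariantData`, and `TfLiteTensorVariantRealloc` come from the header being patched.

```cpp
#include <utility>
#include <vector>
#include "tensorflow/lite/core/c/common.h"  // VariantData et al.

// A user-defined payload for a kTfLiteVariant tensor. Deriving from
// AbstractVariantData<IntList> supplies the Clone() that VariantData requires,
// implemented in terms of IntList's copy constructor.
class IntList : public AbstractVariantData<IntList> {
 public:
  explicit IntList(std::vector<int> values) : values_(std::move(values)) {}
  const std::vector<int>& values() const { return values_; }

 private:
  std::vector<int> values_;
};

// Hypothetical usage, assuming `t` is a TfLiteTensor of type kTfLiteVariant:
//   TfLiteTensorVariantRealloc<IntList>(t, std::vector<int>{1, 2, 3});
//   auto* list = static_cast<IntList*>(static_cast<VariantData*>(t->data.data));
//   // TfLiteTensorDataFree(t) later destroys the object via ~VariantData.
```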
diff --git a/src/tensorflow/lite/core/macros.h b/src/tensorflow/lite/core/macros.h
new file mode 100644
index 00000000..8ebc8db2
--- /dev/null
+++ b/src/tensorflow/lite/core/macros.h
@@ -0,0 +1,68 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This provides utility macros and functions that are inherently platform
+// specific.
+#ifndef TENSORFLOW_LITE_CORE_MACROS_H_
+#define TENSORFLOW_LITE_CORE_MACROS_H_
+
+#ifdef __has_builtin
+#define TFLITE_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define TFLITE_HAS_BUILTIN(x) 0
+#endif
+
+#if (!defined(__NVCC__)) && (TFLITE_HAS_BUILTIN(__builtin_expect) || \
+                             (defined(__GNUC__) && __GNUC__ >= 3))
+#define TFLITE_EXPECT_FALSE(cond) __builtin_expect(cond, false)
+#define TFLITE_EXPECT_TRUE(cond) __builtin_expect(!!(cond), true)
+#else
+#define TFLITE_EXPECT_FALSE(cond) (cond)
+#define TFLITE_EXPECT_TRUE(cond) (cond)
+#endif
+
+#ifdef _WIN32
+#define TFLITE_NOINLINE __declspec(noinline)
+#else
+#ifdef __has_attribute
+#if __has_attribute(noinline)
+#define TFLITE_NOINLINE __attribute__((noinline))
+#else
+#define TFLITE_NOINLINE
+#endif  // __has_attribute(noinline)
+#else
+#define TFLITE_NOINLINE
+#endif  // __has_attribute
+#endif  // _WIN32
+
+// Normally we'd use ABSL_HAVE_ATTRIBUTE_WEAK and ABSL_ATTRIBUTE_WEAK, but
+// we avoid the absl dependency for binary size reasons.
+#ifdef __has_attribute
+#define TFLITE_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define TFLITE_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if (TFLITE_HAS_ATTRIBUTE(weak) ||                  \
+     (defined(__GNUC__) && !defined(__clang__))) && \
+    !(defined(__llvm__) && defined(_WIN32)) && !defined(__MINGW32__)
+#undef TFLITE_ATTRIBUTE_WEAK
+#define TFLITE_ATTRIBUTE_WEAK __attribute__((weak))
+#define TFLITE_HAS_ATTRIBUTE_WEAK 1
+#else
+#define TFLITE_ATTRIBUTE_WEAK
+#define TFLITE_HAS_ATTRIBUTE_WEAK 0
+#endif
+
+#endif  // TENSORFLOW_LITE_CORE_MACROS_H_
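A small usage sketch for the new macros; the checked-add function and values are invented for illustration, only the `TFLITE_*` macros come from the file above.

```cpp
#include <cstdio>
#include "tensorflow/lite/core/macros.h"

// Keep the cold error path out of the caller's inline budget.
static TFLITE_NOINLINE void ReportOverflow() {
  std::fprintf(stderr, "overflow\n");
}

static int AddChecked(int a, int b) {
  const long long sum = static_cast<long long>(a) + b;
  // Marking the overflow branch unlikely lets the compiler lay out the
  // non-overflow path as the fall-through hot path.
  if (TFLITE_EXPECT_FALSE(sum > 2147483647LL || sum < -2147483648LL)) {
    ReportOverflow();
    return 0;
  }
  return static_cast<int>(sum);
}

int main() { return AddChecked(1, 2) == 3 ? 0 : 1; }
```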
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/bits.h b/src/tensorflow/lite/experimental/microfrontend/lib/bits.h
deleted file mode 100644
index 04b3ba6f..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/bits.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_BITS_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_BITS_H_
-
-#ifdef __cplusplus
-#include <cstdint>
-
-extern "C" {
-#endif
-
-static inline int CountLeadingZeros32Slow(uint64_t n) {
-  int zeroes = 28;
-  if (n >> 16) zeroes -= 16, n >>= 16;
-  if (n >> 8) zeroes -= 8, n >>= 8;
-  if (n >> 4) zeroes -= 4, n >>= 4;
-  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
-}
-
-static inline int CountLeadingZeros32(uint32_t n) {
-#if defined(_MSC_VER)
-  unsigned long result = 0;  // NOLINT(runtime/int)
-  if (_BitScanReverse(&result, n)) {
-    return 31 - result;
-  }
-  return 32;
-#elif defined(__GNUC__)
-
-  // Handle 0 as a special case because __builtin_clz(0) is undefined.
-  if (n == 0) {
-    return 32;
-  }
-  return __builtin_clz(n);
-#else
-  return CountLeadingZeros32Slow(n);
-#endif
-}
-
-static inline int MostSignificantBit32(uint32_t n) {
-  return 32 - CountLeadingZeros32(n);
-}
-
-static inline int CountLeadingZeros64Slow(uint64_t n) {
-  int zeroes = 60;
-  if (n >> 32) zeroes -= 32, n >>= 32;
-  if (n >> 16) zeroes -= 16, n >>= 16;
-  if (n >> 8) zeroes -= 8, n >>= 8;
-  if (n >> 4) zeroes -= 4, n >>= 4;
-  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
-}
-
-static inline int CountLeadingZeros64(uint64_t n) {
-#if defined(_MSC_VER) && defined(_M_X64)
-  // MSVC does not have __builtin_clzll. Use _BitScanReverse64.
-  unsigned long result = 0;  // NOLINT(runtime/int)
-  if (_BitScanReverse64(&result, n)) {
-    return 63 - result;
-  }
-  return 64;
-#elif defined(_MSC_VER)
-  // MSVC does not have __builtin_clzll. Compose two calls to _BitScanReverse
-  unsigned long result = 0;  // NOLINT(runtime/int)
-  if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
-    return 31 - result;
-  }
-  if (_BitScanReverse(&result, n)) {
-    return 63 - result;
-  }
-  return 64;
-#elif defined(__GNUC__)
-
-  // Handle 0 as a special case because __builtin_clzll(0) is undefined.
-  if (n == 0) {
-    return 64;
-  }
-  return __builtin_clzll(n);
-#else
-  return CountLeadingZeros64Slow(n);
-#endif
-}
-
-static inline int MostSignificantBit64(uint64_t n) {
-  return 64 - CountLeadingZeros64(n);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_BITS_H_
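For readers tracking this removal: the helpers return the 1-based index of the highest set bit (0 for zero input), which the frontend uses to pick fixed-point shifts. A quick self-contained check against the definitions above (re-declaring only the GCC/Clang builtin path for brevity):

```cpp
#include <cassert>
#include <cstdint>

// Same definitions as the removed MostSignificantBit32 helper (GCC path).
static int CountLeadingZeros32(uint32_t n) { return n == 0 ? 32 : __builtin_clz(n); }
static int MostSignificantBit32(uint32_t n) { return 32 - CountLeadingZeros32(n); }

int main() {
  assert(MostSignificantBit32(0u) == 0);
  assert(MostSignificantBit32(1u) == 1);     // 0b1     -> bit 1
  assert(MostSignificantBit32(0x10u) == 5);  // 0b10000 -> bit 5
  assert(MostSignificantBit32(0xFFFFFFFFu) == 32);
  return 0;
}
```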
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/fft.cpp b/src/tensorflow/lite/experimental/microfrontend/lib/fft.cpp
deleted file mode 100644
index bcdd9cc0..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/fft.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
-
-#include <string.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h"
-
-void FftCompute(struct FftState* state, const int16_t* input,
-                int input_scale_shift) {
-  const size_t input_size = state->input_size;
-  const size_t fft_size = state->fft_size;
-
-  int16_t* fft_input = state->input;
-  // First, scale the input by the given shift.
-  size_t i;
-  for (i = 0; i < input_size; ++i) {
-    fft_input[i] = static_cast<int16_t>(static_cast<uint16_t>(input[i])
-                                        << input_scale_shift);
-  }
-  // Zero out whatever else remains in the top part of the input.
-  for (; i < fft_size; ++i) {
-    fft_input[i] = 0;
-  }
-
-  // Apply the FFT.
-  kissfft_fixed16::kiss_fftr(
-      reinterpret_cast<kissfft_fixed16::kiss_fftr_cfg>(state->scratch),
-      state->input,
-      reinterpret_cast<kissfft_fixed16::kiss_fft_cpx*>(state->output));
-}
-
-void FftInit(struct FftState* state) {
-  // All the initialization is done in FftPopulateState()
-}
-
-void FftReset(struct FftState* state) {
-  memset(state->input, 0, state->fft_size * sizeof(*state->input));
-  memset(state->output, 0, (state->fft_size / 2 + 1) * sizeof(*state->output));
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/fft.h b/src/tensorflow/lite/experimental/microfrontend/lib/fft.h
deleted file mode 100644
index aaffa69d..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/fft.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct complex_int16_t {
-  int16_t real;
-  int16_t imag;
-};
-
-struct FftState {
-  int16_t* input;
-  struct complex_int16_t* output;
-  size_t fft_size;
-  size_t input_size;
-  void* scratch;
-  size_t scratch_size;
-};
-
-void FftCompute(struct FftState* state, const int16_t* input,
-                int input_scale_shift);
-
-void FftInit(struct FftState* state);
-
-void FftReset(struct FftState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.cpp b/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.cpp
deleted file mode 100644
index ed3dc8fb..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
-
-#include <stdio.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h"
-
-int FftPopulateState(struct FftState* state, size_t input_size) {
-  state->input_size = input_size;
-  state->fft_size = 1;
-  while (state->fft_size < state->input_size) {
-    state->fft_size <<= 1;
-  }
-
-  state->input = reinterpret_cast<int16_t*>(
-      malloc(state->fft_size * sizeof(*state->input)));
-  if (state->input == nullptr) {
-    fprintf(stderr, "Failed to alloc fft input buffer\n");
-    return 0;
-  }
-
-  state->output = reinterpret_cast<struct complex_int16_t*>(
-      malloc((state->fft_size / 2 + 1) * sizeof(*state->output) * 2));
-  if (state->output == nullptr) {
-    fprintf(stderr, "Failed to alloc fft output buffer\n");
-    return 0;
-  }
-
-  // Ask kissfft how much memory it wants.
-  size_t scratch_size = 0;
-  kissfft_fixed16::kiss_fftr_cfg kfft_cfg = kissfft_fixed16::kiss_fftr_alloc(
-      state->fft_size, 0, nullptr, &scratch_size);
-  if (kfft_cfg != nullptr) {
-    fprintf(stderr, "Kiss memory sizing failed.\n");
-    return 0;
-  }
-  state->scratch = malloc(scratch_size);
-  if (state->scratch == nullptr) {
-    fprintf(stderr, "Failed to alloc fft scratch buffer\n");
-    return 0;
-  }
-  state->scratch_size = scratch_size;
-  // Let kissfft configure the scratch space we just allocated
-  kfft_cfg = kissfft_fixed16::kiss_fftr_alloc(state->fft_size, 0,
-                                              state->scratch, &scratch_size);
-  if (kfft_cfg != state->scratch) {
-    fprintf(stderr, "Kiss memory preallocation strategy failed.\n");
-    return 0;
-  }
-  return 1;
-}
-
-void FftFreeStateContents(struct FftState* state) {
-  free(state->input);
-  free(state->output);
-  free(state->scratch);
-}
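Taken together, the two removed FFT files were used roughly like this; a condensed sketch with error handling and the scale-shift heuristic from frontend.c trimmed.

```cpp
#include <cstddef>
#include <cstdint>
#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
#include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"

void FftRoundTrip(const int16_t* samples, size_t num_samples) {
  struct FftState fft;
  // Rounds num_samples up to the next power of two internally.
  if (!FftPopulateState(&fft, num_samples)) return;
  FftInit(&fft);
  // frontend.c derives the shift from the window's max absolute value to
  // maximize fixed-point resolution; 0 means no extra scaling.
  FftCompute(&fft, samples, /*input_scale_shift=*/0);
  // fft.output now holds fft.fft_size / 2 + 1 complex int16 bins.
  FftFreeStateContents(&fft);
}
```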
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.c b/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.c
deleted file mode 100644
index 80f8738f..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
-
-#include <string.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
-
-void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
-                                         struct complex_int16_t* fft_output,
-                                         int32_t* energy) {
-  const int end_index = state->end_index;
-  int i;
-  energy += state->start_index;
-  fft_output += state->start_index;
-  for (i = state->start_index; i < end_index; ++i) {
-    const int32_t real = fft_output->real;
-    const int32_t imag = fft_output->imag;
-    fft_output++;
-    const uint32_t mag_squared = (real * real) + (imag * imag);
-    *energy++ = mag_squared;
-  }
-}
-
-void FilterbankAccumulateChannels(struct FilterbankState* state,
-                                  const int32_t* energy) {
-  uint64_t* work = state->work;
-  uint64_t weight_accumulator = 0;
-  uint64_t unweight_accumulator = 0;
-
-  const int16_t* channel_frequency_starts = state->channel_frequency_starts;
-  const int16_t* channel_weight_starts = state->channel_weight_starts;
-  const int16_t* channel_widths = state->channel_widths;
-
-  int num_channels_plus_1 = state->num_channels + 1;
-  int i;
-  for (i = 0; i < num_channels_plus_1; ++i) {
-    const int32_t* magnitudes = energy + *channel_frequency_starts++;
-    const int16_t* weights = state->weights + *channel_weight_starts;
-    const int16_t* unweights = state->unweights + *channel_weight_starts++;
-    const int width = *channel_widths++;
-    int j;
-    for (j = 0; j < width; ++j) {
-      weight_accumulator += *weights++ * ((uint64_t)*magnitudes);
-      unweight_accumulator += *unweights++ * ((uint64_t)*magnitudes);
-      ++magnitudes;
-    }
-    *work++ = weight_accumulator;
-    weight_accumulator = unweight_accumulator;
-    unweight_accumulator = 0;
-  }
-}
-
-static uint16_t Sqrt32(uint32_t num) {
-  if (num == 0) {
-    return 0;
-  }
-  uint32_t res = 0;
-  int max_bit_number = 32 - MostSignificantBit32(num);
-  max_bit_number |= 1;
-  uint32_t bit = 1U << (31 - max_bit_number);
-  int iterations = (31 - max_bit_number) / 2 + 1;
-  while (iterations--) {
-    if (num >= res + bit) {
-      num -= res + bit;
-      res = (res >> 1U) + bit;
-    } else {
-      res >>= 1U;
-    }
-    bit >>= 2U;
-  }
-  // Do rounding - if we have the bits.
-  if (num > res && res != 0xFFFF) {
-    ++res;
-  }
-  return res;
-}
-
-static uint32_t Sqrt64(uint64_t num) {
-  // Take a shortcut and just use 32 bit operations if the upper word is all
-  // clear. This will cause a slight off by one issue for numbers close to
-  // 2^32, but it probably isn't going to matter (and gives us a big
-  // performance win).
-  if ((num >> 32) == 0) {
-    return Sqrt32((uint32_t)num);
-  }
-  uint64_t res = 0;
-  int max_bit_number = 64 - MostSignificantBit64(num);
-  max_bit_number |= 1;
-  uint64_t bit = 1ULL << (63 - max_bit_number);
-  int iterations = (63 - max_bit_number) / 2 + 1;
-  while (iterations--) {
-    if (num >= res + bit) {
-      num -= res + bit;
-      res = (res >> 1U) + bit;
-    } else {
-      res >>= 1U;
-    }
-    bit >>= 2U;
-  }
-  // Do rounding - if we have the bits.
-  if (num > res && res != 0xFFFFFFFFLL) {
-    ++res;
-  }
-  return res;
-}
-
-uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) {
-  const int num_channels = state->num_channels;
-  const uint64_t* work = state->work + 1;
-  // Reuse the work buffer since we're fine clobbering it at this point to
-  // hold the output.
-  uint32_t* output = (uint32_t*)state->work;
-  int i;
-  for (i = 0; i < num_channels; ++i) {
-    *output++ = Sqrt64(*work++) >> scale_down_shift;
-  }
-  return (uint32_t*)state->work;
-}
-
-void FilterbankReset(struct FilterbankState* state) {
-  memset(state->work, 0, (state->num_channels + 1) * sizeof(*state->work));
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.h b/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.h
deleted file mode 100644
index 1e6d3885..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_H_
-
-#include <stdint.h>
-#include <stdlib.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
-
-#define kFilterbankBits 12
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct FilterbankState {
-  int num_channels;
-  int start_index;
-  int end_index;
-  int16_t* channel_frequency_starts;
-  int16_t* channel_weight_starts;
-  int16_t* channel_widths;
-  int16_t* weights;
-  int16_t* unweights;
-  uint64_t* work;
-};
-
-// Converts the relevant complex values of an FFT output into energy (the
-// square magnitude).
-void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
-                                         struct complex_int16_t* fft_output,
-                                         int32_t* energy);
-
-// Computes the mel-scale filterbank on the given energy array. Output is
-// cached internally - to fetch it, you need to call FilterbankSqrt.
-void FilterbankAccumulateChannels(struct FilterbankState* state,
-                                  const int32_t* energy);
-
-// Applies an integer square root to the 64 bit intermediate values of the
-// filterbank, and returns a pointer to them. Memory will be invalidated the
-// next time FilterbankAccumulateChannels is called.
-uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift);
-
-void FilterbankReset(struct FilterbankState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_H_
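The three filterbank stages above are meant to run in a fixed order per frame, and the sqrt output aliases the internal work buffer, as the header warns. A condensed sketch of that call order (the wrapper function is invented; the API is from the removed header):

```cpp
#include <cstdint>
#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"

// One frame through the removed filterbank: FFT bins -> energy -> mel
// channels -> integer sqrt. `state` is assumed to have been populated via
// FilterbankPopulateState.
uint32_t* FilterbankFrame(struct FilterbankState* state,
                          struct complex_int16_t* fft_output,
                          int32_t* energy, int scale_down_shift) {
  FilterbankConvertFftComplexToEnergy(state, fft_output, energy);
  FilterbankAccumulateChannels(state, energy);
  // The returned buffer aliases state->work and is invalidated by the next
  // FilterbankAccumulateChannels call, so consume it immediately.
  return FilterbankSqrt(state, scale_down_shift);
}
```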
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c
deleted file mode 100644
index f18ebf54..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h"
-
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-
-#define kFilterbankIndexAlignment 4
-#define kFilterbankChannelBlockSize 4
-
-void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config) {
-  config->num_channels = 32;
-  config->lower_band_limit = 125.0f;
-  config->upper_band_limit = 7500.0f;
-  config->output_scale_shift = 7;
-}
-
-static float FreqToMel(float freq) { return 1127.0 * log1p(freq / 700.0); }
-
-static void CalculateCenterFrequencies(const int num_channels,
-                                       const float lower_frequency_limit,
-                                       const float upper_frequency_limit,
-                                       float* center_frequencies) {
-  assert(lower_frequency_limit >= 0.0f);
-  assert(upper_frequency_limit > lower_frequency_limit);
-
-  const float mel_low = FreqToMel(lower_frequency_limit);
-  const float mel_hi = FreqToMel(upper_frequency_limit);
-  const float mel_span = mel_hi - mel_low;
-  const float mel_spacing = mel_span / ((float)num_channels);
-  int i;
-  for (i = 0; i < num_channels; ++i) {
-    center_frequencies[i] = mel_low + (mel_spacing * (i + 1));
-  }
-}
-
-static void QuantizeFilterbankWeights(const float float_weight,
-                                      int16_t* weight, int16_t* unweight) {
-  *weight = floor(float_weight * (1 << kFilterbankBits) + 0.5);
-  *unweight = floor((1.0 - float_weight) * (1 << kFilterbankBits) + 0.5);
-}
-
-int FilterbankPopulateState(const struct FilterbankConfig* config,
-                            struct FilterbankState* state, int sample_rate,
-                            int spectrum_size) {
-  state->num_channels = config->num_channels;
-  const int num_channels_plus_1 = config->num_channels + 1;
-
-  // How should we align things to index counts given the byte alignment?
-  const int index_alignment =
-      (kFilterbankIndexAlignment < sizeof(int16_t)
-           ? 1
-           : kFilterbankIndexAlignment / sizeof(int16_t));
-
-  state->channel_frequency_starts =
-      malloc(num_channels_plus_1 * sizeof(*state->channel_frequency_starts));
-  state->channel_weight_starts =
-      malloc(num_channels_plus_1 * sizeof(*state->channel_weight_starts));
-  state->channel_widths =
-      malloc(num_channels_plus_1 * sizeof(*state->channel_widths));
-  state->work = malloc(num_channels_plus_1 * sizeof(*state->work));
-
-  float* center_mel_freqs =
-      malloc(num_channels_plus_1 * sizeof(*center_mel_freqs));
-  int16_t* actual_channel_starts =
-      malloc(num_channels_plus_1 * sizeof(*actual_channel_starts));
-  int16_t* actual_channel_widths =
-      malloc(num_channels_plus_1 * sizeof(*actual_channel_widths));
-
-  if (state->channel_frequency_starts == NULL ||
-      state->channel_weight_starts == NULL || state->channel_widths == NULL ||
-      center_mel_freqs == NULL || actual_channel_starts == NULL ||
-      actual_channel_widths == NULL) {
-    free(center_mel_freqs);
-    free(actual_channel_starts);
-    free(actual_channel_widths);
-    fprintf(stderr, "Failed to allocate channel buffers\n");
-    return 0;
-  }
-
-  CalculateCenterFrequencies(num_channels_plus_1, config->lower_band_limit,
-                             config->upper_band_limit, center_mel_freqs);
-
-  // Always exclude DC.
-  const float hz_per_sbin = 0.5 * sample_rate / ((float)spectrum_size - 1);
-  state->start_index = 1.5 + config->lower_band_limit / hz_per_sbin;
-  state->end_index = 0;  // Initialized to zero here, but actually set below.
-
-  // For each channel, we need to figure out what frequencies belong to it, and
-  // how much padding we need to add so that we can efficiently multiply the
-  // weights and unweights for accumulation. To simplify the multiplication
-  // logic, all channels will have some multiplication to do (even if there are
-  // no frequencies that accumulate to that channel) - they will be directed to
-  // a set of zero weights.
-  int chan_freq_index_start = state->start_index;
-  int weight_index_start = 0;
-  int needs_zeros = 0;
-
-  int chan;
-  for (chan = 0; chan < num_channels_plus_1; ++chan) {
-    // Keep jumping frequencies until we overshoot the bound on this channel.
-    int freq_index = chan_freq_index_start;
-    while (FreqToMel((freq_index)*hz_per_sbin) <= center_mel_freqs[chan]) {
-      ++freq_index;
-    }
-
-    const int width = freq_index - chan_freq_index_start;
-    actual_channel_starts[chan] = chan_freq_index_start;
-    actual_channel_widths[chan] = width;
-
-    if (width == 0) {
-      // This channel doesn't actually get anything from the frequencies, it's
-      // always zero. We need then to insert some 'zero' weights into the
-      // output, and just redirect this channel to do a single multiplication
-      // at this point. For simplicity, the zeros are placed at the beginning
-      // of the weights arrays, so we have to go and update all the other
-      // weight_starts to reflect this shift (but only once).
-      state->channel_frequency_starts[chan] = 0;
-      state->channel_weight_starts[chan] = 0;
-      state->channel_widths[chan] = kFilterbankChannelBlockSize;
-      if (!needs_zeros) {
-        needs_zeros = 1;
-        int j;
-        for (j = 0; j < chan; ++j) {
-          state->channel_weight_starts[j] += kFilterbankChannelBlockSize;
-        }
-        weight_index_start += kFilterbankChannelBlockSize;
-      }
-    } else {
-      // How far back do we need to go to ensure that we have the proper
-      // alignment?
-      const int aligned_start =
-          (chan_freq_index_start / index_alignment) * index_alignment;
-      const int aligned_width = (chan_freq_index_start - aligned_start + width);
-      const int padded_width =
-          (((aligned_width - 1) / kFilterbankChannelBlockSize) + 1) *
-          kFilterbankChannelBlockSize;
-
-      state->channel_frequency_starts[chan] = aligned_start;
-      state->channel_weight_starts[chan] = weight_index_start;
-      state->channel_widths[chan] = padded_width;
-      weight_index_start += padded_width;
-    }
-    chan_freq_index_start = freq_index;
-  }
-
-  // Allocate the two arrays to store the weights - weight_index_start
-  // contains the index of what would be the next set of weights that we
-  // would need to add, so that's how many weights we need to allocate.
-  state->weights = calloc(weight_index_start, sizeof(*state->weights));
-  state->unweights = calloc(weight_index_start, sizeof(*state->unweights));
-
-  // If the alloc failed, we also need to nuke the arrays.
-  if (state->weights == NULL || state->unweights == NULL) {
-    free(center_mel_freqs);
-    free(actual_channel_starts);
-    free(actual_channel_widths);
-    fprintf(stderr, "Failed to allocate weights or unweights\n");
-    return 0;
-  }
-
-  // Next pass, compute all the weights. Since everything has been memset to
-  // zero, we only need to fill in the weights that correspond to some
-  // frequency for a channel.
-  const float mel_low = FreqToMel(config->lower_band_limit);
-  for (chan = 0; chan < num_channels_plus_1; ++chan) {
-    int frequency = actual_channel_starts[chan];
-    const int num_frequencies = actual_channel_widths[chan];
-    const int frequency_offset =
-        frequency - state->channel_frequency_starts[chan];
-    const int weight_start = state->channel_weight_starts[chan];
-    const float denom_val = (chan == 0) ? mel_low : center_mel_freqs[chan - 1];
-
-    int j;
-    for (j = 0; j < num_frequencies; ++j, ++frequency) {
-      const float weight =
-          (center_mel_freqs[chan] - FreqToMel(frequency * hz_per_sbin)) /
-          (center_mel_freqs[chan] - denom_val);
-
-      // Make the float into an integer for the weights (and unweights).
-      const int weight_index = weight_start + frequency_offset + j;
-      QuantizeFilterbankWeights(weight, state->weights + weight_index,
-                                state->unweights + weight_index);
-    }
-    if (frequency > state->end_index) {
-      state->end_index = frequency;
-    }
-  }
-
-  free(center_mel_freqs);
-  free(actual_channel_starts);
-  free(actual_channel_widths);
-  if (state->end_index >= spectrum_size) {
-    fprintf(stderr, "Filterbank end_index is above spectrum size.\n");
-    return 0;
-  }
-  return 1;
-}
-
-void FilterbankFreeStateContents(struct FilterbankState* state) {
-  free(state->channel_frequency_starts);
-  free(state->channel_weight_starts);
-  free(state->channel_widths);
-  free(state->weights);
-  free(state->unweights);
-  free(state->work);
-}
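The center-frequency math in FilterbankPopulateState is compact enough to check standalone: channels are spaced evenly on the mel scale between the band limits, mel(f) = 1127 * ln(1 + f/700). A self-contained sketch using the same formula and the default config values from FilterbankFillConfigWithDefaults:

```cpp
#include <cmath>
#include <cstdio>

static float FreqToMel(float freq) {
  return static_cast<float>(1127.0 * std::log1p(freq / 700.0));
}

int main() {
  const int num_channels = 32;              // default num_channels
  const float mel_low = FreqToMel(125.0f);  // default lower_band_limit
  const float mel_hi = FreqToMel(7500.0f);  // default upper_band_limit
  const float mel_spacing = (mel_hi - mel_low) / static_cast<float>(num_channels);
  // Channel i's upper edge sits at mel_low + mel_spacing * (i + 1), matching
  // CalculateCenterFrequencies in the removed file.
  for (int i = 0; i < num_channels; ++i) {
    std::printf("channel %2d: %.1f mel\n", i, mel_low + mel_spacing * (i + 1));
  }
  return 0;
}
```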
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h
deleted file mode 100644
index 781d1024..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct FilterbankConfig {
-  // number of frequency channel buckets for filterbank
-  int num_channels;
-  // maximum frequency to include
-  float upper_band_limit;
-  // minimum frequency to include
-  float lower_band_limit;
-  // unused
-  int output_scale_shift;
-};
-
-// Fills the FilterbankConfig with "sane" defaults.
-void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config);
-
-// Allocates any buffers.
-int FilterbankPopulateState(const struct FilterbankConfig* config,
-                            struct FilterbankState* state, int sample_rate,
-                            int spectrum_size);
-
-// Frees any allocated buffers.
-void FilterbankFreeStateContents(struct FilterbankState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_UTIL_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/frontend.c b/src/tensorflow/lite/experimental/microfrontend/lib/frontend.c
deleted file mode 100644
index 9de2a879..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/frontend.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
-
-#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
-
-struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
-                                             const int16_t* samples,
-                                             size_t num_samples,
-                                             size_t* num_samples_read) {
-  struct FrontendOutput output;
-  output.values = NULL;
-  output.size = 0;
-
-  // Try to apply the window - if it fails, return and wait for more data.
-  if (!WindowProcessSamples(&state->window, samples, num_samples,
-                            num_samples_read)) {
-    return output;
-  }
-
-  // Apply the FFT to the window's output (and scale it so that the fixed
-  // point FFT can have as much resolution as possible).
-  int input_shift =
-      15 - MostSignificantBit32(state->window.max_abs_output_value);
-  FftCompute(&state->fft, state->window.output, input_shift);
-
-  // We can re-use the fft's output buffer to hold the energy.
-  int32_t* energy = (int32_t*)state->fft.output;
-
-  FilterbankConvertFftComplexToEnergy(&state->filterbank, state->fft.output,
-                                      energy);
-
-  FilterbankAccumulateChannels(&state->filterbank, energy);
-  uint32_t* scaled_filterbank = FilterbankSqrt(&state->filterbank, input_shift);
-
-  // Apply noise reduction.
-  NoiseReductionApply(&state->noise_reduction, scaled_filterbank);
-
-  if (state->pcan_gain_control.enable_pcan) {
-    PcanGainControlApply(&state->pcan_gain_control, scaled_filterbank);
-  }
-
-  // Apply the log and scale.
-  int correction_bits =
-      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
-  uint16_t* logged_filterbank =
-      LogScaleApply(&state->log_scale, scaled_filterbank,
-                    state->filterbank.num_channels, correction_bits);
-
-  output.size = state->filterbank.num_channels;
-  output.values = logged_filterbank;
-  return output;
-}
-
-void FrontendReset(struct FrontendState* state) {
-  WindowReset(&state->window);
-  FftReset(&state->fft);
-  FilterbankReset(&state->filterbank);
-  NoiseReductionReset(&state->noise_reduction);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/frontend.h b/src/tensorflow/lite/experimental/microfrontend/lib/frontend.h
deleted file mode 100644
index 883df5fd..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/frontend.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct FrontendState {
-  struct WindowState window;
-  struct FftState fft;
-  struct FilterbankState filterbank;
-  struct NoiseReductionState noise_reduction;
-  struct PcanGainControlState pcan_gain_control;
-  struct LogScaleState log_scale;
-};
-
-struct FrontendOutput {
-  const uint16_t* values;
-  size_t size;
-};
-
-// Main entry point to processing frontend samples. Updates num_samples_read
-// to contain the number of samples that have been consumed from the input
-// array. Returns a struct containing the generated output. If not enough
-// samples were added to generate a feature vector, the returned size will be
-// 0 and the values pointer will be NULL. Note that the output pointer will be
-// invalidated as soon as FrontendProcessSamples is called again, so copy the
-// contents elsewhere if you need to use them later.
-struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
-                                             const int16_t* samples,
-                                             size_t num_samples,
-                                             size_t* num_samples_read);
-
-void FrontendReset(struct FrontendState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_H_
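Before this removal, driving the frontend looked roughly like the sketch below. The loop matters because FrontendProcessSamples may consume only part of the input and returns an empty output until a full window has accumulated; the wrapper function here is invented, the API is from the removed headers.

```cpp
#include <cstddef>
#include <cstdint>
#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"

// `state` is assumed populated via FrontendFillConfigWithDefaults +
// FrontendPopulateState (see frontend_util.c below).
void ExtractFeatures(struct FrontendState* state, const int16_t* audio,
                     size_t num_samples) {
  while (num_samples > 0) {
    size_t num_read = 0;
    struct FrontendOutput out =
        FrontendProcessSamples(state, audio, num_samples, &num_read);
    audio += num_read;
    num_samples -= num_read;
    if (out.size > 0) {
      // out.values is invalidated by the next call, so copy the feature
      // vector here if it is needed later.
    }
  }
}
```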
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
-
-#include <stdio.h>
-#include <string.h>
-
-#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
-
-void FrontendFillConfigWithDefaults(struct FrontendConfig* config) {
-  WindowFillConfigWithDefaults(&config->window);
-  FilterbankFillConfigWithDefaults(&config->filterbank);
-  NoiseReductionFillConfigWithDefaults(&config->noise_reduction);
-  PcanGainControlFillConfigWithDefaults(&config->pcan_gain_control);
-  LogScaleFillConfigWithDefaults(&config->log_scale);
-}
-
-int FrontendPopulateState(const struct FrontendConfig* config,
-                          struct FrontendState* state, int sample_rate) {
-  memset(state, 0, sizeof(*state));
-
-  if (!WindowPopulateState(&config->window, &state->window, sample_rate)) {
-    fprintf(stderr, "Failed to populate window state\n");
-    return 0;
-  }
-
-  if (!FftPopulateState(&state->fft, state->window.size)) {
-    fprintf(stderr, "Failed to populate fft state\n");
-    return 0;
-  }
-  FftInit(&state->fft);
-
-  if (!FilterbankPopulateState(&config->filterbank, &state->filterbank,
-                               sample_rate, state->fft.fft_size / 2 + 1)) {
-    fprintf(stderr, "Failed to populate filterbank state\n");
-    return 0;
-  }
-
-  if (!NoiseReductionPopulateState(&config->noise_reduction,
-                                   &state->noise_reduction,
-                                   state->filterbank.num_channels)) {
-    fprintf(stderr, "Failed to populate noise reduction state\n");
-    return 0;
-  }
-
-  int input_correction_bits =
-      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
-  if (!PcanGainControlPopulateState(
-          &config->pcan_gain_control, &state->pcan_gain_control,
-          state->noise_reduction.estimate, state->filterbank.num_channels,
-          state->noise_reduction.smoothing_bits, input_correction_bits)) {
-    fprintf(stderr, "Failed to populate pcan gain control state\n");
-    return 0;
-  }
-
-  if (!LogScalePopulateState(&config->log_scale, &state->log_scale)) {
-    fprintf(stderr, "Failed to populate log scale state\n");
-    return 0;
-  }
-
-  FrontendReset(state);
-
-  // All good, return a true value.
-  return 1;
-}
-
-void FrontendFreeStateContents(struct FrontendState* state) {
-  WindowFreeStateContents(&state->window);
-  FftFreeStateContents(&state->fft);
-  FilterbankFreeStateContents(&state->filterbank);
-  NoiseReductionFreeStateContents(&state->noise_reduction);
-  PcanGainControlFreeStateContents(&state->pcan_gain_control);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h
deleted file mode 100644
index 895ce6cd..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
-#include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct FrontendConfig {
-  struct WindowConfig window;
-  struct FilterbankConfig filterbank;
-  struct NoiseReductionConfig noise_reduction;
-  struct PcanGainControlConfig pcan_gain_control;
-  struct LogScaleConfig log_scale;
-};
-
-// Fills the FrontendConfig with "sane" defaults.
-void FrontendFillConfigWithDefaults(struct FrontendConfig* config);
-
-// Allocates any buffers.
-int FrontendPopulateState(const struct FrontendConfig* config,
-                          struct FrontendState* state, int sample_rate);
-
-// Frees any allocated buffers.
-void FrontendFreeStateContents(struct FrontendState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_UTIL_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h b/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h
deleted file mode 100644
index f704677d..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_COMMON_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_COMMON_H_
-
-// This header file should be included in all variants of kiss_fft_$type.{h,cc}
-// so that their sub-included source files do not mistakenly wrap libc header
-// files within their kissfft_$type namespaces.
-// E.g., this header avoids kissfft_int16.h containing:
-//   namespace kiss_fft_int16 {
-//   #include "third_party/kissfft/kiss_fft.h"
-//   }
-// where kiss_fft.h contains:
-//   #include <math.h>
-//
-// TRICK: By including the following header files here, their preprocessor
-// header guards prevent them being re-defined inside of the kiss_fft_$type
-// namespaces declared within the kiss_fft_$type.{h,cc} sources.
-// Note that the original kiss_fft*.h files are untouched since they
-// may be used in libraries that include them directly.
-
-#include <limits.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#ifdef FIXED_POINT
-#include <sys/types.h>
-#endif
-
-#ifdef USE_SIMD
-#include <xmmintrin.h>
-#endif
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_COMMON_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.cpp b/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.cpp
deleted file mode 100644
index 54630661..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include
-
-#include "tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h"
-
-#define FIXED_POINT 16
-namespace kissfft_fixed16 {
-#include "third_party/kissfft/kiss_fft.c"
-#include "third_party/kissfft/tools/kiss_fftr.c"
-}  // namespace kissfft_fixed16
-#undef FIXED_POINT
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h b/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h
deleted file mode 100644
index 380307a4..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/kiss_fft_int16.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_INT16_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_INT16_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/kiss_fft_common.h"
-
-// Wrap 16-bit kiss fft in its own namespace. Enables us to link an application
-// with different kiss fft resolutions (16/32 bit integer, float, double)
-// without getting a linker error.
-#define FIXED_POINT 16
-namespace kissfft_fixed16 {
-#include "third_party/kissfft/kiss_fft.h"
-#include "third_party/kissfft/tools/kiss_fftr.h"
-}  // namespace kissfft_fixed16
-#undef FIXED_POINT
-#undef kiss_fft_scalar
-#undef KISS_FFT_H
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_KISS_FFT_INT16_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.c b/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.c
deleted file mode 100644
index f59618e0..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/log_lut.h"
-const uint16_t kLogLut[]
-#ifndef _MSC_VER
-    __attribute__((aligned(4)))
-#endif  // _MSC_VER
-    = {0,    224,  442,  654,  861,  1063, 1259, 1450, 1636, 1817, 1992, 2163,
-       2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848,
-       3960, 4068, 4172, 4272, 4368, 4460, 4549, 4633, 4714, 4791, 4864, 4934,
-       5001, 5063, 5123, 5178, 5231, 5280, 5326, 5368, 5408, 5444, 5477, 5507,
-       5533, 5557, 5578, 5595, 5610, 5622, 5631, 5637, 5640, 5641, 5638, 5633,
-       5626, 5615, 5602, 5586, 5568, 5547, 5524, 5498, 5470, 5439, 5406, 5370,
-       5332, 5291, 5249, 5203, 5156, 5106, 5054, 5000, 4944, 4885, 4825, 4762,
-       4697, 4630, 4561, 4490, 4416, 4341, 4264, 4184, 4103, 4020, 3935, 3848,
-       3759, 3668, 3575, 3481, 3384, 3286, 3186, 3084, 2981, 2875, 2768, 2659,
-       2549, 2437, 2323, 2207, 2090, 1971, 1851, 1729, 1605, 1480, 1353, 1224,
-       1094, 963,  830,  695,  559,  421,  282,  142,  0,    0};
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.h b/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.h
deleted file mode 100644
index b2448a32..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/log_lut.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_LUT_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_LUT_H_
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Number of segments in the log lookup table. The table will be kLogSegments+1
-// in length (with some padding).
-#define kLogSegments 128
-#define kLogSegmentsLog2 7
-
-// Scale used by lookup table.
-#define kLogScale 65536
-#define kLogScaleLog2 16
-#define kLogCoeff 45426
-
-extern const uint16_t kLogLut[];
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_LUT_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.c b/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
deleted file mode 100644
index c27a50a6..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/ -#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h" - -#include "tensorflow/lite/experimental/microfrontend/lib/bits.h" -#include "tensorflow/lite/experimental/microfrontend/lib/log_lut.h" - -#define kuint16max 0x0000FFFF - -// The following functions implement integer logarithms of various sizes. The -// approximation is calculated according to method described in -// www.inti.gob.ar/electronicaeinformatica/instrumentacion/utic/ -// publicaciones/SPL2007/Log10-spl07.pdf -// It first calculates log2 of the input and then converts it to natural -// logarithm. - -static uint32_t Log2FractionPart(const uint32_t x, const uint32_t log2x) { - // Part 1 - int32_t frac = x - (1LL << log2x); - if (log2x < kLogScaleLog2) { - frac <<= kLogScaleLog2 - log2x; - } else { - frac >>= log2x - kLogScaleLog2; - } - // Part 2 - const uint32_t base_seg = frac >> (kLogScaleLog2 - kLogSegmentsLog2); - const uint32_t seg_unit = - (((uint32_t)1) << kLogScaleLog2) >> kLogSegmentsLog2; - - const int32_t c0 = kLogLut[base_seg]; - const int32_t c1 = kLogLut[base_seg + 1]; - const int32_t seg_base = seg_unit * base_seg; - const int32_t rel_pos = ((c1 - c0) * (frac - seg_base)) >> kLogScaleLog2; - return frac + c0 + rel_pos; -} - -static uint32_t Log(const uint32_t x, const uint32_t scale_shift) { - const uint32_t integer = MostSignificantBit32(x) - 1; - const uint32_t fraction = Log2FractionPart(x, integer); - const uint32_t log2 = (integer << kLogScaleLog2) + fraction; - const uint32_t round = kLogScale / 2; - const uint32_t loge = (((uint64_t)kLogCoeff) * log2 + round) >> kLogScaleLog2; - // Finally scale to our output scale - const uint32_t loge_scaled = ((loge << scale_shift) + round) >> kLogScaleLog2; - return loge_scaled; -} - -uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal, - int signal_size, int correction_bits) { - const int scale_shift = state->scale_shift; - uint16_t* output = (uint16_t*)signal; - uint16_t* ret = output; - int i; - for (i = 0; i < signal_size; ++i) { - uint32_t value = *signal++; - if (state->enable_log) { - if (correction_bits < 0) { - value >>= -correction_bits; - } else { - value <<= correction_bits; - } - if (value > 1) { - value = Log(value, scale_shift); - } else { - value = 0; - } - } - *output++ = (value < kuint16max) ? value : kuint16max; - } - return ret; -} diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.h b/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.h deleted file mode 100644 index a383f32f..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
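The deleted Log() above computes a fixed-point log2 and converts it to a natural logarithm via kLogCoeff. A quick standalone check of that constant and of the float-domain identity it relies on (a sketch; the test value 12345 is arbitrary):

    #include <math.h>
    #include <stdio.h>

    // kLogCoeff is ln(2) in Q16: round(log(2) * 65536) == 45426, so
    // Log(x, shift) approximates ln(x) scaled by 2^shift.
    int main(void) {
      const double kLogScale = 65536.0;
      printf("round(ln(2)*65536) = %.0f (kLogCoeff = 45426)\n",
             floor(log(2.0) * kLogScale + 0.5));
      // Float reference for the integer pipeline: log2(x) * ln(2) == ln(x).
      const double x = 12345.0;
      printf("ln(%g) = %f, log2(x)*ln(2) = %f\n", x, log(x),
             log2(x) * log(2.0));
      return 0;
    }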
-==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_H_ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct LogScaleState { - int enable_log; - int scale_shift; -}; - -// Applies a fixed point logarithm to the signal and converts it to 16 bit. Note -// that the signal array will be modified. -uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal, - int signal_size, int correction_bits); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_H_ diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.c deleted file mode 100644 index 0e3dd1d1..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.c +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h" - -void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config) { - config->enable_log = 1; - config->scale_shift = 6; -} - -int LogScalePopulateState(const struct LogScaleConfig* config, - struct LogScaleState* state) { - state->enable_log = config->enable_log; - state->scale_shift = config->scale_shift; - return 1; -} diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h deleted file mode 100644 index 11f7d9ee..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_UTIL_H_
-
-#include
-#include
-
-#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct LogScaleConfig {
-  // set to false (0) to disable this module
-  int enable_log;
-  // scale results by 2^(scale_shift)
-  int scale_shift;
-};
-
-// Populates the LogScaleConfig with "sane" default values.
-void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config);
-
-// Allocates any buffers.
-int LogScalePopulateState(const struct LogScaleConfig* config,
-                          struct LogScaleState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_UTIL_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.c b/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.c
deleted file mode 100644
index 16b30e66..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
-
-#include <string.h>
-
-void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal) {
-  int i;
-  for (i = 0; i < state->num_channels; ++i) {
-    const uint32_t smoothing =
-        ((i & 1) == 0) ? state->even_smoothing : state->odd_smoothing;
-    const uint32_t one_minus_smoothing = (1 << kNoiseReductionBits) - smoothing;
-
-    // Update the estimate of the noise.
-    const uint32_t signal_scaled_up = signal[i] << state->smoothing_bits;
-    uint32_t estimate =
-        (((uint64_t)signal_scaled_up * smoothing) +
-         ((uint64_t)state->estimate[i] * one_minus_smoothing)) >>
-        kNoiseReductionBits;
-    state->estimate[i] = estimate;
-
-    // Make sure that we can't get a negative value for the signal - estimate.
-    if (estimate > signal_scaled_up) {
-      estimate = signal_scaled_up;
-    }
-
-    const uint32_t floor =
-        ((uint64_t)signal[i] * state->min_signal_remaining) >>
-        kNoiseReductionBits;
-    const uint32_t subtracted =
-        (signal_scaled_up - estimate) >> state->smoothing_bits;
-    const uint32_t output = subtracted > floor ? subtracted : floor;
-    signal[i] = output;
-  }
-}
-
-void NoiseReductionReset(struct NoiseReductionState* state) {
-  memset(state->estimate, 0, sizeof(*state->estimate) * state->num_channels);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h b/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h
deleted file mode 100644
index 46d3f52e..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_H_ - -#define kNoiseReductionBits 14 - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct NoiseReductionState { - int smoothing_bits; - uint16_t even_smoothing; - uint16_t odd_smoothing; - uint16_t min_signal_remaining; - int num_channels; - uint32_t* estimate; -}; - -// Removes stationary noise from each channel of the signal using a low pass -// filter. -void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal); - -void NoiseReductionReset(struct NoiseReductionState* state); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_H_ diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.c deleted file mode 100644 index a6c9234e..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.c +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
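The estimate update in NoiseReductionApply() above is a per-channel exponential moving average in Q(kNoiseReductionBits) = Q14. A float-domain sketch of the same update, using the default coefficients set by the util code that follows (the signal values are made up):

    #include <stdio.h>

    // Float reference for the Q14 noise-estimate update used above:
    //   estimate' = signal * smoothing + estimate * (1 - smoothing)
    // with smoothing = 0.025 (even channels) or 0.06 (odd channels).
    int main(void) {
      double estimate = 0.0;
      const double smoothing = 0.025;  // even-channel default
      const double signal[] = {100.0, 100.0, 100.0, 100.0};
      for (int i = 0; i < 4; ++i) {
        estimate = signal[i] * smoothing + estimate * (1.0 - smoothing);
        printf("step %d: estimate = %f\n", i, estimate);
      }
      // The fixed-point code stores smoothing * (1 << 14), truncated: 409 here.
      printf("Q14 coefficient for 0.025: %d\n", (int)(0.025 * (1 << 14)));
      return 0;
    }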
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
-
-#include <stdio.h>
-
-void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config) {
-  config->smoothing_bits = 10;
-  config->even_smoothing = 0.025;
-  config->odd_smoothing = 0.06;
-  config->min_signal_remaining = 0.05;
-}
-
-int NoiseReductionPopulateState(const struct NoiseReductionConfig* config,
-                                struct NoiseReductionState* state,
-                                int num_channels) {
-  state->smoothing_bits = config->smoothing_bits;
-  state->odd_smoothing = config->odd_smoothing * (1 << kNoiseReductionBits);
-  state->even_smoothing = config->even_smoothing * (1 << kNoiseReductionBits);
-  state->min_signal_remaining =
-      config->min_signal_remaining * (1 << kNoiseReductionBits);
-  state->num_channels = num_channels;
-  state->estimate = calloc(state->num_channels, sizeof(*state->estimate));
-  if (state->estimate == NULL) {
-    fprintf(stderr, "Failed to alloc estimate buffer\n");
-    return 0;
-  }
-  return 1;
-}
-
-void NoiseReductionFreeStateContents(struct NoiseReductionState* state) {
-  free(state->estimate);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h
deleted file mode 100644
index fa555391..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct NoiseReductionConfig {
-  // scale the signal up by 2^(smoothing_bits) before reduction
-  int smoothing_bits;
-  // smoothing coefficient for even-numbered channels
-  float even_smoothing;
-  // smoothing coefficient for odd-numbered channels
-  float odd_smoothing;
-  // fraction of signal to preserve (1.0 disables this module)
-  float min_signal_remaining;
-};
-
-// Populates the NoiseReductionConfig with "sane" default values.
-void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config);
-
-// Allocates any buffers.
-int NoiseReductionPopulateState(const struct NoiseReductionConfig* config,
-                                struct NoiseReductionState* state,
-                                int num_channels);
-
-// Frees any allocated buffers.
-void NoiseReductionFreeStateContents(struct NoiseReductionState* state); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_UTIL_H_ diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c b/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c deleted file mode 100644 index 22d58767..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h" - -#include "tensorflow/lite/experimental/microfrontend/lib/bits.h" - -int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut) { - if (x <= 2) { - return lut[x]; - } - - const int16_t interval = MostSignificantBit32(x); - lut += 4 * interval - 6; - - const int16_t frac = - ((interval < 11) ? (x << (11 - interval)) : (x >> (interval - 11))) & - 0x3FF; - - int32_t result = ((int32_t)lut[2] * frac) >> 5; - result += (int32_t)((uint32_t)lut[1] << 5); - result *= frac; - result = (result + (1 << 14)) >> 15; - result += lut[0]; - return (int16_t)result; -} - -uint32_t PcanShrink(const uint32_t x) { - if (x < (2 << kPcanSnrBits)) { - return (x * x) >> (2 + 2 * kPcanSnrBits - kPcanOutputBits); - } else { - return (x >> (kPcanSnrBits - kPcanOutputBits)) - (1 << kPcanOutputBits); - } -} - -void PcanGainControlApply(struct PcanGainControlState* state, - uint32_t* signal) { - int i; - for (i = 0; i < state->num_channels; ++i) { - const uint32_t gain = - WideDynamicFunction(state->noise_estimate[i], state->gain_lut); - const uint32_t snr = ((uint64_t)signal[i] * gain) >> state->snr_shift; - signal[i] = PcanShrink(snr); - } -} diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h b/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h deleted file mode 100644 index 3f6222be..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_H_ - -#include -#include - -#define kPcanSnrBits 12 -#define kPcanOutputBits 6 - -#ifdef __cplusplus -extern "C" { -#endif - -// Details at https://research.google/pubs/pub45911.pdf -struct PcanGainControlState { - int enable_pcan; - uint32_t* noise_estimate; - int num_channels; - int16_t* gain_lut; - int32_t snr_shift; -}; - -int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut); - -uint32_t PcanShrink(const uint32_t x); - -void PcanGainControlApply(struct PcanGainControlState* state, uint32_t* signal); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_H_ diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c deleted file mode 100644 index e850d439..00000000 --- a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
-
-#include <math.h>
-#include <stdio.h>
-
-#define kint16max 0x00007FFF
-
-void PcanGainControlFillConfigWithDefaults(
-    struct PcanGainControlConfig* config) {
-  config->enable_pcan = 0;
-  config->strength = 0.95;
-  config->offset = 80.0;
-  config->gain_bits = 21;
-}
-
-int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
-                               int32_t input_bits, uint32_t x) {
-  const float x_as_float = ((float)x) / ((uint32_t)1 << input_bits);
-  const float gain_as_float =
-      ((uint32_t)1 << config->gain_bits) *
-      powf(x_as_float + config->offset, -config->strength);
-
-  if (gain_as_float > kint16max) {
-    return kint16max;
-  }
-  return (int16_t)(gain_as_float + 0.5f);
-}
-
-int PcanGainControlPopulateState(const struct PcanGainControlConfig* config,
-                                 struct PcanGainControlState* state,
-                                 uint32_t* noise_estimate,
-                                 const int num_channels,
-                                 const uint16_t smoothing_bits,
-                                 const int32_t input_correction_bits) {
-  state->enable_pcan = config->enable_pcan;
-  if (!state->enable_pcan) {
-    return 1;
-  }
-  state->noise_estimate = noise_estimate;
-  state->num_channels = num_channels;
-  state->gain_lut = malloc(kWideDynamicFunctionLUTSize * sizeof(int16_t));
-  if (state->gain_lut == NULL) {
-    fprintf(stderr, "Failed to allocate gain LUT\n");
-    return 0;
-  }
-  state->snr_shift = config->gain_bits - input_correction_bits - kPcanSnrBits;
-
-  const int32_t input_bits = smoothing_bits - input_correction_bits;
-  state->gain_lut[0] = PcanGainLookupFunction(config, input_bits, 0);
-  state->gain_lut[1] = PcanGainLookupFunction(config, input_bits, 1);
-  state->gain_lut -= 6;
-  int interval;
-  for (interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
-    const uint32_t x0 = (uint32_t)1 << (interval - 1);
-    const uint32_t x1 = x0 + (x0 >> 1);
-    const uint32_t x2 =
-        (interval == kWideDynamicFunctionBits) ? x0 + (x0 - 1) : 2 * x0;
-
-    const int16_t y0 = PcanGainLookupFunction(config, input_bits, x0);
-    const int16_t y1 = PcanGainLookupFunction(config, input_bits, x1);
-    const int16_t y2 = PcanGainLookupFunction(config, input_bits, x2);
-
-    const int32_t diff1 = (int32_t)y1 - y0;
-    const int32_t diff2 = (int32_t)y2 - y0;
-    const int32_t a1 = 4 * diff1 - diff2;
-    const int32_t a2 = diff2 - a1;
-
-    state->gain_lut[4 * interval] = y0;
-    state->gain_lut[4 * interval + 1] = (int16_t)a1;
-    state->gain_lut[4 * interval + 2] = (int16_t)a2;
-  }
-  state->gain_lut += 6;
-  return 1;
-}
-
-void PcanGainControlFreeStateContents(struct PcanGainControlState* state) {
-  free(state->gain_lut);
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h
deleted file mode 100644
index d4bfaa2e..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
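PcanGainLookupFunction() above samples the gain curve gain = 2^gain_bits * (x / 2^input_bits + offset)^(-strength), which the LUT then interpolates. A direct float evaluation for a few inputs, using the defaults set above (input_bits = 10 is an assumed value for illustration; the real value depends on the smoothing and correction bits computed by the caller):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    // Float view of the PCAN gain curve with the default config
    // (strength 0.95, offset 80, gain_bits 21).
    int main(void) {
      const float strength = 0.95f;
      const float offset = 80.0f;
      const int gain_bits = 21;
      const int input_bits = 10;  // illustrative assumption
      for (uint32_t x = 0; x <= 4096; x += 1024) {
        const float x_scaled = (float)x / (float)(1u << input_bits);
        const float gain =
            (float)(1u << gain_bits) * powf(x_scaled + offset, -strength);
        printf("x=%u gain=%f\n", x, gain);
      }
      // Values above INT16_MAX are clamped by the real code, as at x=0 here.
      return 0;
    }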
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
-
-#define kWideDynamicFunctionBits 32
-#define kWideDynamicFunctionLUTSize (4 * kWideDynamicFunctionBits - 3)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct PcanGainControlConfig {
-  // set to false (0) to disable this module
-  int enable_pcan;
-  // gain normalization exponent (0.0 disables, 1.0 full strength)
-  float strength;
-  // positive value added in the normalization denominator
-  float offset;
-  // number of fractional bits in the gain
-  int gain_bits;
-};
-
-void PcanGainControlFillConfigWithDefaults(
-    struct PcanGainControlConfig* config);
-
-int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
-                               int32_t input_bits, uint32_t x);
-
-int PcanGainControlPopulateState(const struct PcanGainControlConfig* config,
-                                 struct PcanGainControlState* state,
-                                 uint32_t* noise_estimate,
-                                 const int num_channels,
-                                 const uint16_t smoothing_bits,
-                                 const int32_t input_correction_bits);
-
-void PcanGainControlFreeStateContents(struct PcanGainControlState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_UTIL_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/window.c b/src/tensorflow/lite/experimental/microfrontend/lib/window.c
deleted file mode 100644
index 10da6762..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/window.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
-
-#include <string.h>
-
-int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
-                         size_t num_samples, size_t* num_samples_read) {
-  const int size = state->size;
-
-  // Copy samples from the samples buffer over to our local input.
-  size_t max_samples_to_copy = state->size - state->input_used;
-  if (max_samples_to_copy > num_samples) {
-    max_samples_to_copy = num_samples;
-  }
-  memcpy(state->input + state->input_used, samples,
-         max_samples_to_copy * sizeof(*samples));
-  *num_samples_read = max_samples_to_copy;
-  state->input_used += max_samples_to_copy;
-
-  if (state->input_used < state->size) {
-    // We don't have enough samples to compute a window.
-    return 0;
-  }
-
-  // Apply the window to the input.
-  const int16_t* coefficients = state->coefficients;
-  const int16_t* input = state->input;
-  int16_t* output = state->output;
-  int i;
-  int16_t max_abs_output_value = 0;
-  for (i = 0; i < size; ++i) {
-    int16_t new_value =
-        (((int32_t)*input++) * *coefficients++) >> kFrontendWindowBits;
-    *output++ = new_value;
-    if (new_value < 0) {
-      new_value = -new_value;
-    }
-    if (new_value > max_abs_output_value) {
-      max_abs_output_value = new_value;
-    }
-  }
-  // Shuffle the input down by the step size, and update how much we have used.
-  memmove(state->input, state->input + state->step,
-          sizeof(*state->input) * (state->size - state->step));
-  state->input_used -= state->step;
-  state->max_abs_output_value = max_abs_output_value;
-
-  // Indicate that the output buffer is valid for the next stage.
-  return 1;
-}
-
-void WindowReset(struct WindowState* state) {
-  memset(state->input, 0, state->size * sizeof(*state->input));
-  memset(state->output, 0, state->size * sizeof(*state->output));
-  state->input_used = 0;
-  state->max_abs_output_value = 0;
-}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/window.h b/src/tensorflow/lite/experimental/microfrontend/lib/window.h
deleted file mode 100644
index bad81514..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/window.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#define kFrontendWindowBits 12
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct WindowState {
-  size_t size;
-  int16_t* coefficients;
-  size_t step;
-
-  int16_t* input;
-  size_t input_used;
-  int16_t* output;
-  int16_t max_abs_output_value;
-};
-
-// Applies a window to the samples coming in, stepping forward at the given
-// rate.
-int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
-                         size_t num_samples, size_t* num_samples_read);
-
-void WindowReset(struct WindowState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_H_
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/window_util.c b/src/tensorflow/lite/experimental/microfrontend/lib/window_util.c
deleted file mode 100644
index eee6e7b5..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/window_util.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-// Some platforms don't have M_PI
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-void WindowFillConfigWithDefaults(struct WindowConfig* config) {
-  config->size_ms = 25;
-  config->step_size_ms = 10;
-}
-
-int WindowPopulateState(const struct WindowConfig* config,
-                        struct WindowState* state, int sample_rate) {
-  state->size = config->size_ms * sample_rate / 1000;
-  state->step = config->step_size_ms * sample_rate / 1000;
-
-  state->coefficients = malloc(state->size * sizeof(*state->coefficients));
-  if (state->coefficients == NULL) {
-    fprintf(stderr, "Failed to allocate window coefficients\n");
-    return 0;
-  }
-
-  // Populate the window values.
-  const float arg = M_PI * 2.0 / ((float)state->size);
-  int i;
-  for (i = 0; i < state->size; ++i) {
-    float float_value = 0.5 - (0.5 * cos(arg * (i + 0.5)));
-    // Scale it to fixed point and round it.
-    state->coefficients[i] =
-        floor(float_value * (1 << kFrontendWindowBits) + 0.5);
-  }
-
-  state->input_used = 0;
-  state->input = malloc(state->size * sizeof(*state->input));
-  if (state->input == NULL) {
-    fprintf(stderr, "Failed to allocate window input\n");
-    return 0;
-  }
-
-  state->output = malloc(state->size * sizeof(*state->output));
-  if (state->output == NULL) {
-    fprintf(stderr, "Failed to allocate window output\n");
-    return 0;
-  }
-
-  return 1;
-}
-
-void WindowFreeStateContents(struct WindowState* state) {
-  free(state->coefficients);
-  free(state->input);
-  free(state->output);
}
diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/window_util.h b/src/tensorflow/lite/experimental/microfrontend/lib/window_util.h
deleted file mode 100644
index 68e4de9e..00000000
--- a/src/tensorflow/lite/experimental/microfrontend/lib/window_util.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_UTIL_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_UTIL_H_
-
-#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct WindowConfig {
-  // length of window frame in milliseconds
-  size_t size_ms;
-  // length of step for next frame in milliseconds
-  size_t step_size_ms;
-};
-
-// Populates the WindowConfig with "sane" default values.
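WindowPopulateState() above builds a Hann window in Q(kFrontendWindowBits) = Q12. A float sketch that recomputes a few coefficients the same way (size 400 corresponds to the default 25 ms frame at an assumed 16 kHz sample rate):

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    // coeff[i] = floor((0.5 - 0.5*cos(2*pi*(i + 0.5)/size)) * (1 << 12) + 0.5)
    int main(void) {
      const int size = 400;  // 25 ms at 16 kHz (assumed)
      const float arg = (float)(M_PI * 2.0 / size);
      for (int i = 0; i < size; i += 100) {
        const float w = 0.5f - 0.5f * cosf(arg * (i + 0.5f));
        printf("coeff[%d] = %d\n", i, (int)floorf(w * (1 << 12) + 0.5f));
      }
      return 0;
    }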
-void WindowFillConfigWithDefaults(struct WindowConfig* config);
-
-// Allocates any buffers.
-int WindowPopulateState(const struct WindowConfig* config,
-                        struct WindowState* state, int sample_rate);
-
-// Frees any allocated buffers.
-void WindowFreeStateContents(struct WindowState* state);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_UTIL_H_
diff --git a/src/tensorflow/lite/kernels/internal/common.cpp b/src/tensorflow/lite/kernels/internal/common.cpp
new file mode 100644
index 00000000..1654ab84
--- /dev/null
+++ b/src/tensorflow/lite/kernels/internal/common.cpp
@@ -0,0 +1,55 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+
+int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
+                                      int shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
+                                 x * (1 << left_shift), quantized_multiplier),
+                             right_shift);
+}
+
+int32_t MultiplyByQuantizedMultiplier(int64_t x, int32_t quantized_multiplier,
+                                      int shift) {
+  // Inputs:
+  // - quantized_multiplier has fixed point at bit 31
+  // - shift is -31 to +7 (negative for right shift)
+  //
+  // Assumptions: The following input ranges are assumed
+  // - quantize_scale>=0 (the usual range is (1<<30) to (1<<31)-1)
+  // - scaling is chosen so final scaled result fits in int32_t
+  // - input x is in the range -(1<<47) <= x < (1<<47)
+  assert(quantized_multiplier >= 0);
+  assert(shift >= -31 && shift < 8);
+  assert(x >= -(static_cast<int64_t>(1) << 47) &&
+         x < (static_cast<int64_t>(1) << 47));
+
+  int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
+                                   ? ((quantized_multiplier + (1 << 15)) >> 16)
                                   : 0x7FFF;
+  int total_shift = 15 - shift;
+  x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
+  int32_t result = x >> total_shift;
+  return result;
+}
+
+}  // namespace tflite
diff --git a/src/tensorflow/lite/kernels/internal/common.h b/src/tensorflow/lite/kernels/internal/common.h
index 7d38cd14..a9acb4f2 100644
--- a/src/tensorflow/lite/kernels/internal/common.h
+++ b/src/tensorflow/lite/kernels/internal/common.h
@@ -26,6 +26,7 @@ limitations under the License.
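The 64-bit MultiplyByQuantizedMultiplier() added above computes approximately round(x * m * 2^shift / 2^31), after reducing the Q31 multiplier to 16 bits. A double-precision cross-check (a sketch; the sample inputs are arbitrary but inside the documented ranges):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    // Mirror of the int64_t overload added above, for comparison against a
    // double reference of x * m * 2^shift / 2^31.
    static int32_t MultiplyByQuantizedMultiplier64(int64_t x, int32_t m,
                                                   int shift) {
      int32_t reduced = (m < 0x7FFF0000) ? ((m + (1 << 15)) >> 16) : 0x7FFF;
      int total_shift = 15 - shift;
      x = (x * (int64_t)reduced) + ((int64_t)1 << (total_shift - 1));
      return (int32_t)(x >> total_shift);
    }

    int main(void) {
      const int64_t x = 123456789LL;  // within +/- 2^47
      const int32_t m = 1517882343;   // ~0.7068 in Q31
      const int shift = -7;
      const double ref = (double)x * m * pow(2.0, shift) / 2147483648.0;
      printf("fixed=%d double=%f\n",
             MultiplyByQuantizedMultiplier64(x, m, shift), ref);
      return 0;
    }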
#include
#include "third_party/gemmlowp/fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/core/macros.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/types.h"
@@ -250,42 +251,11 @@ inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
       quantized_multiplier);
 }
-inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  using gemmlowp::RoundingDivideByPOT;
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  int left_shift = shift > 0 ? shift : 0;
-  int right_shift = shift > 0 ? 0 : -shift;
-  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
-                                 x * (1 << left_shift), quantized_multiplier),
-                             right_shift);
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int32_t x, int32_t quantized_multiplier, int shift);
-inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  // Inputs:
-  // - quantized_multiplier has fixed point at bit 31
-  // - shift is -31 to +7 (negative for right shift)
-  //
-  // Assumptions: The following input ranges are assumed
-  // - quantize_scale>=0 (the usual range is (1<<30) to (1<<31)-1)
-  // - scaling is chosen so final scaled result fits in int32_t
-  // - input x is in the range -(1<<47) <= x < (1<<47)
-  assert(quantized_multiplier >= 0);
-  assert(shift >= -31 && shift < 8);
-  assert(x >= -(static_cast<int64_t>(1) << 47) &&
-         x < (static_cast<int64_t>(1) << 47));
-
-  int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
-                                   ? ((quantized_multiplier + (1 << 15)) >> 16)
-                                   : 0x7FFF;
-  int total_shift = 15 - shift;
-  x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
-  int32_t result = x >> total_shift;
-  return result;
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int64_t x, int32_t quantized_multiplier, int shift);
 #ifdef USE_NEON
 // Round uses ARM's rounding shift right.
@@ -328,14 +298,16 @@ template <typename T>
 int CountLeadingZeros(T integer_input) {
   static_assert(std::is_unsigned<T>::value,
                 "Only unsigned integer types handled.");
-#if defined(__GNUC__)
-  return integer_input ? __builtin_clz(integer_input)
-                       : std::numeric_limits<T>::digits;
-#else
   if (integer_input == 0) {
     return std::numeric_limits<T>::digits;
   }
-
+#if defined(__GNUC__)
+  if (std::is_same<T, uint32_t>::value) {
+    return __builtin_clz(integer_input);
+  } else if (std::is_same<T, uint64_t>::value) {
+    return __builtin_clzll(integer_input);
+  }
+#endif
   const T one_in_leading_positive = static_cast<T>(1)
                                     << (std::numeric_limits<T>::digits - 1);
   int leading_zeros = 0;
@@ -344,7 +316,6 @@ int CountLeadingZeros(T integer_input) {
     ++leading_zeros;
   }
   return leading_zeros;
-#endif
 }
 template
@@ -1039,8 +1010,8 @@ inline void NdArrayDescsForElementwiseBroadcast(const Dims<4>& input0_dims,
 // Copies dims to desc, calculating strides.
 template <int N>
-inline void CopyDimsToDesc(const RuntimeShape& input_shape,
-                           NdArrayDesc<N>* desc_out) {
+TFLITE_NOINLINE void CopyDimsToDesc(const RuntimeShape& input_shape,
+                                    NdArrayDesc<N>* desc_out) {
   int desc_stride = 1;
   for (int i = N - 1; i >= 0; --i) {
     desc_out->extents[i] = input_shape.Dims(i);
diff --git a/src/tensorflow/lite/kernels/internal/portable_tensor.h b/src/tensorflow/lite/kernels/internal/portable_tensor.h
index 45135b1f..1eee6217 100644
--- a/src/tensorflow/lite/kernels/internal/portable_tensor.h
+++ b/src/tensorflow/lite/kernels/internal/portable_tensor.h
@@ -23,10 +23,6 @@ limitations under the License.
 namespace tflite {
-inline RuntimeShape GetTensorShape(std::vector<int32_t> data) {
-  return RuntimeShape(data.size(), data.data());
-}
-
 // A list of tensors in a format that can be used by kernels like split and
 // concatenation.
 template <typename T>
diff --git a/src/tensorflow/lite/kernels/internal/portable_tensor_utils.cpp b/src/tensorflow/lite/kernels/internal/portable_tensor_utils.cpp
index a9cfee8e..024043d7 100644
--- a/src/tensorflow/lite/kernels/internal/portable_tensor_utils.cpp
+++ b/src/tensorflow/lite/kernels/internal/portable_tensor_utils.cpp
@@ -70,13 +70,19 @@ void ApplySignbitToVector(const float* __restrict__ vector, int v_size,
 void UnpackDenseInt4IntoInt8(const int8_t* src_buffer, int num_elements,
                              int8_t* dst_buffer) {
-  for (int i = 0; i < num_elements; i += 2) {
+  for (int i = 0; i < num_elements / 2; i++) {
+    int8_t byte = src_buffer[i];
     // Shift left first so that sign is properly extended when shifted right
-    dst_buffer[i] = static_cast<int8_t>(src_buffer[i / 2] << 4) >> 4;
-    // Break early if the tensor has odd length and the higher nibble should be
-    // ignored.
-    if (i + 1 == num_elements) break;
-    dst_buffer[i + 1] = static_cast<int8_t>(src_buffer[i / 2]) >> 4;
+    int8_t lower = static_cast<int8_t>(byte << 4) >> 4;
+    int8_t higher = byte >> 4;
+    dst_buffer[2 * i] = lower;
+    dst_buffer[2 * i + 1] = higher;
+  }
+
+  // If the buffer size is odd, extract the final lower nibble.
+  if (num_elements % 2 != 0) {
+    dst_buffer[num_elements - 1] =
+        static_cast<int8_t>(src_buffer[num_elements / 2] << 4) >> 4;
   }
 }
diff --git a/src/tensorflow/lite/kernels/internal/reference/add.h b/src/tensorflow/lite/kernels/internal/reference/add.h
index ae1f47a8..faffb097 100644
--- a/src/tensorflow/lite/kernels/internal/reference/add.h
+++ b/src/tensorflow/lite/kernels/internal/reference/add.h
@@ -194,18 +194,20 @@ inline void Add(const ArithmeticParams& params,
   }
 }
-template <typename T>
-inline typename std::enable_if<!is_small_integer<T>::value, void>::type
-BroadcastAdd4DSlow(const ArithmeticParams& params,
+template <typename T, bool dummy = false>
+inline typename std::enable_if<!is_small_integer<T>::value || dummy, void>::type
+BroadcastAdd6DSlow(const ArithmeticParams& params,
                    const RuntimeShape& input1_shape, const T* input1_data,
                    const RuntimeShape& input2_shape, const T* input2_data,
                    const RuntimeShape& output_shape, T* output_data) {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
+  NdArrayDesc<6> desc1;
+  NdArrayDesc<6> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                       &desc2);
   const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(4, output_shape);
+      RuntimeShape::ExtendedShape(6, output_shape);
   T activation_min, activation_max;
   GetActivationParams(params, &activation_min, &activation_max);
@@ -221,18 +223,64 @@ BroadcastAdd4DSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
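The rewritten UnpackDenseInt4IntoInt8() above pairs each byte into a low nibble (sign-extended via the shift-left/shift-right trick) and a high nibble, with a tail step for odd lengths. A standalone sketch with a worked example (the packed bytes are made up):

    #include <stdint.h>
    #include <stdio.h>

    // Same nibble-unpacking scheme as above: low nibble first, arithmetic
    // right shift for sign extension, odd tail handled separately.
    static void UnpackInt4(const int8_t* src, int n, int8_t* dst) {
      for (int i = 0; i < n / 2; i++) {
        int8_t byte = src[i];
        dst[2 * i] = (int8_t)(byte << 4) >> 4;  // low nibble, sign-extended
        dst[2 * i + 1] = byte >> 4;             // high nibble
      }
      if (n % 2 != 0) {
        dst[n - 1] = (int8_t)(src[n / 2] << 4) >> 4;
      }
    }

    int main(void) {
      // 0xF1 packs low = 1 and high = -1; 0x07 packs the odd tail value 7.
      const int8_t packed[] = {(int8_t)0xF1, 0x07};
      int8_t out[3];
      UnpackInt4(packed, 3, out);
      printf("%d %d %d\n", out[0], out[1], out[2]);  // prints: 1 -1 7
      return 0;
    }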
- for (int b = 0; b < extended_output_shape.Dims(0); ++b) { - for (int y = 0; y < extended_output_shape.Dims(1); ++y) { - for (int x = 0; x < extended_output_shape.Dims(2); ++x) { - for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, b, y, x, c)] + - input2_data[SubscriptToIndex(desc2, b, y, x, c)], + size_t input1_offset_a = 0; + size_t input2_offset_a = 0; + size_t output_offset_a = 0; + for (int a = 0; a < extended_output_shape.Dims(0); ++a) { + size_t input1_offset_d = input1_offset_a; + size_t input2_offset_d = input2_offset_a; + size_t output_offset_d = output_offset_a; + for (int d = 0; d < extended_output_shape.Dims(1); ++d) { + size_t input1_offset_b = input1_offset_d; + size_t input2_offset_b = input2_offset_d; + size_t output_offset_b = output_offset_d; + for (int b = 0; b < extended_output_shape.Dims(2); ++b) { + size_t input1_offset_y = input1_offset_b; + size_t input2_offset_y = input2_offset_b; + size_t output_offset_y = output_offset_b; + for (int y = 0; y < extended_output_shape.Dims(3); ++y) { + size_t input1_offset_x = input1_offset_y; + size_t input2_offset_x = input2_offset_y; + size_t output_offset_x = output_offset_y; + for (int x = 0; x < extended_output_shape.Dims(4); ++x) { + size_t input1_offset_c = input1_offset_x; + size_t input2_offset_c = input2_offset_x; + size_t output_offset_c = output_offset_x; + for (int c = 0; c < extended_output_shape.Dims(5); ++c) { + output_data[output_offset_c] = ActivationFunctionWithMinMax( + input1_data[input1_offset_c] + input2_data[input2_offset_c], activation_min, activation_max); + input1_offset_c += desc1.strides[5]; + input2_offset_c += desc2.strides[5]; + ++output_offset_c; + } + input1_offset_x += desc1.strides[4]; + input2_offset_x += desc2.strides[4]; + output_offset_x += extended_output_shape.Dims(5); + } + input1_offset_y += desc1.strides[3]; + input2_offset_y += desc2.strides[3]; + output_offset_y += + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_b += desc1.strides[2]; + input2_offset_b += desc2.strides[2]; + output_offset_b += extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } + input1_offset_d += desc1.strides[1]; + input2_offset_d += desc2.strides[1]; + output_offset_d += + extended_output_shape.Dims(2) * extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_a += desc1.strides[0]; + input2_offset_a += desc2.strides[0]; + output_offset_a += + extended_output_shape.Dims(1) * extended_output_shape.Dims(2) * + extended_output_shape.Dims(3) * extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } } @@ -241,16 +289,16 @@ BroadcastAdd4DSlow(const ArithmeticParams& params, // choice of the shift (20 or 15, accordingly - see add.cc for more comments). 
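// The offset-walking loops above trade one SubscriptToIndex computation per
// element for running offsets that advance by the NdArrayDesc strides; a
// broadcast dimension simply carries stride 0. A two-dimensional sketch of
// the invariant (hypothetical shapes, not taken from the patch):

#include <cassert>
#include <cstddef>

int main() {
  // Input shape {1, 4} broadcast against output shape {3, 4}: the broadcast
  // row dimension gets stride 0, so every output row re-reads the same four
  // input elements.
  const int strides[2] = {0, 1};  // {row, col} strides of the input
  const int dims[2] = {3, 4};     // output extents
  size_t offset_row = 0;
  for (int r = 0; r < dims[0]; ++r) {
    size_t offset_col = offset_row;
    for (int c = 0; c < dims[1]; ++c) {
      // Equivalent to SubscriptToIndex(desc, r, c): r * 0 + c * 1.
      assert(offset_col == static_cast<size_t>(c));
      offset_col += strides[1];
    }
    offset_row += strides[0];  // stride 0: stay on the same input row
  }
  return 0;
}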
template <typename T> inline typename std::enable_if<is_small_integer<T>::value, void>::type -BroadcastAdd4DSlow(const ArithmeticParams& params, +BroadcastAdd6DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, const T* input1_data, const RuntimeShape& input2_shape, const T* input2_data, const RuntimeShape& output_shape, T* output_data) { - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; + NdArrayDesc<6> desc1; + NdArrayDesc<6> desc2; NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(4, output_shape); + RuntimeShape::ExtendedShape(6, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -263,44 +311,98 @@ BroadcastAdd4DSlow(const ArithmeticParams& params, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < extended_output_shape.Dims(0); ++b) { - for (int y = 0; y < extended_output_shape.Dims(1); ++y) { - for (int x = 0; x < extended_output_shape.Dims(2); ++x) { - for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - const int32_t input1_val = - params.input1_offset + - input1_data[SubscriptToIndex(desc1, b, y, x, c)]; - const int32_t input2_val = - params.input2_offset + - input2_data[SubscriptToIndex(desc2, b, y, x, c)]; - const int32_t shifted_input1_val = - input1_val * (1 << params.left_shift); - const int32_t shifted_input2_val = - input2_val * (1 << params.left_shift); - const int32_t scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, - params.input1_shift); - const int32_t scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, - params.input2_shift); - const int32_t raw_sum = scaled_input1_val + scaled_input2_val; - const int32_t raw_output = - MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + - params.output_offset; - const int32_t clamped_output = - std::min(params.quantized_activation_max, - std::max(params.quantized_activation_min, raw_output)); - output_data[Offset(extended_output_shape, b, y, x, c)] = - static_cast<T>(clamped_output); + size_t input1_offset_a = 0; + size_t input2_offset_a = 0; + size_t output_offset_a = 0; + for (int a = 0; a < extended_output_shape.Dims(0); ++a) { + size_t input1_offset_d = input1_offset_a; + size_t input2_offset_d = input2_offset_a; + size_t output_offset_d = output_offset_a; + for (int d = 0; d < extended_output_shape.Dims(1); ++d) { + size_t input1_offset_b = input1_offset_d; + size_t input2_offset_b = input2_offset_d; + size_t output_offset_b = output_offset_d; + for (int b = 0; b < extended_output_shape.Dims(2); ++b) { + size_t input1_offset_y = input1_offset_b; + size_t input2_offset_y = input2_offset_b; + size_t output_offset_y = output_offset_b; + for (int y = 0; y < extended_output_shape.Dims(3); ++y) { + size_t input1_offset_x = input1_offset_y; + size_t input2_offset_x = input2_offset_y; + size_t output_offset_x = output_offset_y; + for (int x = 0; x < extended_output_shape.Dims(4); ++x) { + size_t input1_offset_c = input1_offset_x; + size_t input2_offset_c = input2_offset_x; + size_t output_offset_c = output_offset_x; + for (int c = 0; c < extended_output_shape.Dims(5); ++c) { + const int32_t
input1_val = + params.input1_offset + input1_data[input1_offset_c]; + const int32_t input2_val = + params.input2_offset + input2_data[input2_offset_c]; + const int32_t shifted_input1_val = + input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = + input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, + params.input1_shift); + const int32_t scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, + params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min( + params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[output_offset_c] = static_cast<T>(clamped_output); + input1_offset_c += desc1.strides[5]; + input2_offset_c += desc2.strides[5]; + ++output_offset_c; + } + input1_offset_x += desc1.strides[4]; + input2_offset_x += desc2.strides[4]; + output_offset_x += extended_output_shape.Dims(5); + } + input1_offset_y += desc1.strides[3]; + input2_offset_y += desc2.strides[3]; + output_offset_y += + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_b += desc1.strides[2]; + input2_offset_b += desc2.strides[2]; + output_offset_b += extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } + input1_offset_d += desc1.strides[1]; + input2_offset_d += desc2.strides[1]; + output_offset_d += + extended_output_shape.Dims(2) * extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_a += desc1.strides[0]; + input2_offset_a += desc2.strides[0]; + output_offset_a += + extended_output_shape.Dims(1) * extended_output_shape.Dims(2) * + extended_output_shape.Dims(3) * extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } } +template <typename T> +inline void BroadcastAdd4DSlow( + const ArithmeticParams& params, const RuntimeShape& input1_shape, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, T* output_data) { + return BroadcastAdd6DSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); +} + inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, const RuntimeShape& unswitched_input1_shape, const uint8_t* unswitched_input1_data, diff --git a/src/tensorflow/lite/kernels/internal/reference/comparisons.cpp b/src/tensorflow/lite/kernels/internal/reference/comparisons.cpp new file mode 100644 index 00000000..86b4a6af --- /dev/null +++ b/src/tensorflow/lite/kernels/internal/reference/comparisons.cpp @@ -0,0 +1,37 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/kernels/internal/reference/comparisons.h" + +namespace tflite { +namespace reference_ops { + +BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess( + const RuntimeShape& unextended_input1_shape, + const RuntimeShape& unextended_input2_shape, + const RuntimeShape& unextended_output_shape) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1, + desc2}; +} + +} // namespace reference_ops +} // namespace tflite diff --git a/src/tensorflow/lite/kernels/internal/reference/comparisons.h b/src/tensorflow/lite/kernels/internal/reference/comparisons.h index d3b8c115..35583195 100644 --- a/src/tensorflow/lite/kernels/internal/reference/comparisons.h +++ b/src/tensorflow/lite/kernels/internal/reference/comparisons.h @@ -112,20 +112,11 @@ struct BroadcastComparison4DSlowCommon { NdArrayDesc<4> desc2; }; -inline BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess( +TFLITE_NOINLINE +BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess( const RuntimeShape& unextended_input1_shape, const RuntimeShape& unextended_input2_shape, - const RuntimeShape& unextended_output_shape) { - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1, - desc2}; -} + const RuntimeShape& unextended_output_shape); template <typename T, ComparisonFn<T> F> inline void BroadcastComparison4DSlowImpl( diff --git a/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h index 8d9b318c..579964dc 100644 --- a/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +++ b/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h @@ -35,30 +35,31 @@ inline void CheckArithmeticParams(const ArithmeticParams& params) { TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max()); } -inline void ElementWise( - int size, const ArithmeticParams& params, const int8_t* input1_data, - const int8_t* input2_data, int8_t* output_data, - void (*check_arithmetic_params)(const ArithmeticParams&), - int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) { +// TODO(b/270589088): move to a more appropriate file (b/270589088#comment2) +template <typename T> +void ElementWise(int size, const ArithmeticParams& params, const T* input1_data, + const T* input2_data, T* output_data, + void (*check_arithmetic_params)(const ArithmeticParams&), + T (*binary_func)(T, T, const ArithmeticParams&)) { CheckArithmeticParams(params); for (int i = 0; i < size; ++i) { output_data[i] = binary_func(input1_data[i], input2_data[i], params); } } - -inline void BroadcastBinaryFunction4DSlow( +// TODO(b/270589088): move to a more
appropriate file. (b/270589088#comment2) +template <typename T> +void BroadcastBinaryFunction6DSlow( const ArithmeticParams& params, const RuntimeShape& input1_shape, - const int8_t* input1_data, const RuntimeShape& input2_shape, - const int8_t* input2_data, const RuntimeShape& output_shape, - int8_t* output_data, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, T* output_data, void (*check_arithmetic_params)(const ArithmeticParams&), - int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) { - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; + T (*binary_func)(T, T, const ArithmeticParams&)) { + NdArrayDesc<6> desc1; + NdArrayDesc<6> desc2; NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(4, output_shape); + RuntimeShape::ExtendedShape(6, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -71,19 +72,79 @@ inline void BroadcastBinaryFunction4DSlow( // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < extended_output_shape.Dims(0); ++b) { - for (int y = 0; y < extended_output_shape.Dims(1); ++y) { - for (int x = 0; x < extended_output_shape.Dims(2); ++x) { - for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = binary_func( - input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)], params); + size_t input1_offset_a = 0; + size_t input2_offset_a = 0; + size_t output_offset_a = 0; + for (int a = 0; a < extended_output_shape.Dims(0); ++a) { + size_t input1_offset_d = input1_offset_a; + size_t input2_offset_d = input2_offset_a; + size_t output_offset_d = output_offset_a; + for (int d = 0; d < extended_output_shape.Dims(1); ++d) { + size_t input1_offset_b = input1_offset_d; + size_t input2_offset_b = input2_offset_d; + size_t output_offset_b = output_offset_d; + for (int b = 0; b < extended_output_shape.Dims(2); ++b) { + size_t input1_offset_y = input1_offset_b; + size_t input2_offset_y = input2_offset_b; + size_t output_offset_y = output_offset_b; + for (int y = 0; y < extended_output_shape.Dims(3); ++y) { + size_t input1_offset_x = input1_offset_y; + size_t input2_offset_x = input2_offset_y; + size_t output_offset_x = output_offset_y; + for (int x = 0; x < extended_output_shape.Dims(4); ++x) { + size_t input1_offset_c = input1_offset_x; + size_t input2_offset_c = input2_offset_x; + size_t output_offset_c = output_offset_x; + for (int c = 0; c < extended_output_shape.Dims(5); ++c) { + output_data[output_offset_c] = + binary_func(input1_data[input1_offset_c], + input2_data[input2_offset_c], params); + input1_offset_c += desc1.strides[5]; + input2_offset_c += desc2.strides[5]; + ++output_offset_c; + } + input1_offset_x += desc1.strides[4]; + input2_offset_x += desc2.strides[4]; + output_offset_x += extended_output_shape.Dims(5); + } + input1_offset_y += desc1.strides[3]; + input2_offset_y += desc2.strides[3]; + output_offset_y += + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_b += desc1.strides[2]; + input2_offset_b += desc2.strides[2]; + output_offset_b += extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) +
extended_output_shape.Dims(5); } + input1_offset_d += desc1.strides[1]; + input2_offset_d += desc2.strides[1]; + output_offset_d += + extended_output_shape.Dims(2) * extended_output_shape.Dims(3) * + extended_output_shape.Dims(4) * extended_output_shape.Dims(5); } + input1_offset_a += desc1.strides[0]; + input2_offset_a += desc2.strides[0]; + output_offset_a += + extended_output_shape.Dims(1) * extended_output_shape.Dims(2) * + extended_output_shape.Dims(3) * extended_output_shape.Dims(4) * + extended_output_shape.Dims(5); } } +template <typename T> +void BroadcastBinaryFunction4DSlow( + const ArithmeticParams& params, const RuntimeShape& input1_shape, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, T* output_data, + void (*check_arithmetic_params)(const ArithmeticParams&), + T (*binary_func)(T, T, const ArithmeticParams&)) { + BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, + check_arithmetic_params, binary_func); +} + inline int8_t AddFunc(int8_t x, int8_t y, const ArithmeticParams& params) { const int32_t input1_val = params.input1_offset + x; const int32_t input2_val = params.input2_offset + y; @@ -127,6 +188,18 @@ inline void Add(const ArithmeticParams& params, AddElementwise(flat_size, params, input1_data, input2_data, output_data); } +inline void BroadcastAdd6DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int8_t* input1_data, + const RuntimeShape& input2_shape, + const int8_t* input2_data, + const RuntimeShape& output_shape, + int8_t* output_data) { + BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, + CheckArithmeticParams, AddFunc); +} + inline void BroadcastAdd4DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, const int8_t* input1_data, @@ -134,7 +207,7 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params, const int8_t* input2_data, const RuntimeShape& output_shape, int8_t* output_data) { - BroadcastBinaryFunction4DSlow(params, input1_shape, input1_data, input2_shape, + BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, CheckArithmeticParams, AddFunc); } diff --git a/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h b/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h index 09d37b72..7e3f690e 100644 --- a/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h +++ b/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h @@ -1,10 +1,10 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -15,65 +15,4 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_ -#include <algorithm> - -#include "tensorflow/lite/kernels/internal/common.h" - -namespace tflite { -namespace reference_integer_ops { - -template <typename integer_type> -inline void Mean(const tflite::MeanParams& op_params, int32_t multiplier, - int32_t shift, const RuntimeShape& unextended_input_shape, - const integer_type* input_data, int32_t input_zero_point, - const RuntimeShape& unextended_output_shape, - integer_type* output_data, int32_t output_zero_point) { - // Current implementation only supports dimension equals 4 and simultaneous - // reduction over width and height. - TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4); - TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4); - const RuntimeShape input_shape = - RuntimeShape::ExtendedShape(4, unextended_input_shape); - const RuntimeShape output_shape = - RuntimeShape::ExtendedShape(4, unextended_output_shape); - const int output_batch = output_shape.Dims(0); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - const int output_depth = output_shape.Dims(3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int num_elements_in_axis = input_width * input_height; - - TFLITE_CHECK_EQ(op_params.axis_count, 2); - TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) || - (op_params.axis[0] == 2 && op_params.axis[1] == 1)); - TFLITE_CHECK_EQ(output_height, 1); - TFLITE_CHECK_EQ(output_width, 1); - - static constexpr int32_t kMinInt = std::numeric_limits<integer_type>::min(); - static constexpr int32_t kMaxInt = std::numeric_limits<integer_type>::max(); - - for (int out_b = 0; out_b < output_batch; ++out_b) { - for (int out_d = 0; out_d < output_depth; ++out_d) { - int32_t acc = 0; - for (int in_h = 0; in_h < input_height; ++in_h) { - for (int in_w = 0; in_w < input_width; ++in_w) { - acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)] - - input_zero_point; - } - } - acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift); - acc = acc > 0 ? (acc + num_elements_in_axis / 2) / num_elements_in_axis - : (acc - num_elements_in_axis / 2) / num_elements_in_axis; - acc += output_zero_point; - acc = std::min(std::max(acc, kMinInt), kMaxInt); - output_data[Offset(output_shape, out_b, 0, 0, out_d)] = - static_cast<integer_type>(acc); - } - } -} - -} // namespace reference_integer_ops -} // namespace tflite - #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_ diff --git a/src/tensorflow/lite/kernels/internal/reference/mul.h b/src/tensorflow/lite/kernels/internal/reference/mul.h index 53197732..2767fef2 100644 --- a/src/tensorflow/lite/kernels/internal/reference/mul.h +++ b/src/tensorflow/lite/kernels/internal/reference/mul.h @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
@@ -56,7 +56,7 @@ inline void Mul(const ArithmeticParams& params, const int flat_size = MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { - output_data[i] = ActivationFunctionWithMinMax( + output_data[i] = ActivationFunctionWithMinMax<T>( input1_data[i] * input2_data[i], output_activation_min, output_activation_max); } @@ -128,14 +128,18 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params, } } -template <typename T> -void BroadcastMul4DSlow(const ArithmeticParams& params, - const RuntimeShape& unextended_input1_shape, - const T* input1_data, - const RuntimeShape& unextended_input2_shape, - const T* input2_data, - const RuntimeShape& unextended_output_shape, - T* output_data) { +template <typename T, bool enable_for_short_integers = false> +inline typename std::enable_if< + !is_small_integer<T>::value || enable_for_short_integers, void>::type +BroadcastMul4DSlow(const ArithmeticParams& params, + const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const T* input2_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { T output_activation_min; T output_activation_max; GetActivationParams(params, &output_activation_min, &output_activation_max); @@ -167,7 +171,7 @@ void BroadcastMul4DSlow(const ArithmeticParams& params, for (int x = 0; x < output_shape.Dims(2); ++x) { for (int c = 0; c < output_shape.Dims(3); ++c) { output_data[Offset(output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax( + ActivationFunctionWithMinMax<T>( input1_data[SubscriptToIndex(desc1, b, y, x, c)] * input2_data[SubscriptToIndex(desc2, b, y, x, c)], output_activation_min, output_activation_max); diff --git a/src/tensorflow/lite/kernels/internal/reference/reduce.h b/src/tensorflow/lite/kernels/internal/reference/reduce.h index c4d7598b..ab4745fc 100644 --- a/src/tensorflow/lite/kernels/internal/reference/reduce.h +++ b/src/tensorflow/lite/kernels/internal/reference/reduce.h @@ -268,11 +268,11 @@ inline bool Mean(const T* input_data, const int* input_dims, return true; } -template <typename T> inline void Mean(const tflite::MeanParams& op_params, const RuntimeShape& unextended_input_shape, - const T* input_data, - const RuntimeShape& unextended_output_shape, T* output_data) { + const float* input_data, + const RuntimeShape& unextended_output_shape, + float* output_data) { ruy::profiler::ScopeLabel label("Mean4D"); // Current implementation only supports dimension equals 4 and simultaneous @@ -312,78 +312,21 @@ inline void Mean(const tflite::MeanParams& op_params, } } -inline void Mean(const tflite::MeanParams& op_params, - const RuntimeShape& unextended_input_shape, - const uint8_t* input_data, int32_t input_zero_point, - float input_scale, const RuntimeShape& unextended_output_shape, - uint8_t* output_data, int32_t output_zero_point, - float output_scale) { - ruy::profiler::ScopeLabel label("Mean4D/Uint8"); - - // Current implementation only supports dimension equals 4 and simultaneous - // reduction over width and height.
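// BroadcastMul4DSlow above is now SFINAE-gated the same way as
// BroadcastAdd6DSlow: small integer element types are excluded unless the
// caller opts in, and ActivationFunctionWithMinMax gets an explicit <T> so
// mixed-type arguments cannot silently promote. A rough sketch of how such a
// guard behaves (the trait below is a stand-in for illustration, not the
// library's exact definition):

#include <cstdint>
#include <type_traits>

template <typename T>
struct is_small_integer
    : std::integral_constant<bool, std::is_same<T, int8_t>::value ||
                                       std::is_same<T, uint8_t>::value ||
                                       std::is_same<T, int16_t>::value ||
                                       std::is_same<T, uint16_t>::value> {};

template <typename T, bool enable_for_short_integers = false>
typename std::enable_if<
    !is_small_integer<T>::value || enable_for_short_integers, T>::type
MulScalar(T a, T b) {
  return a * b;
}

int main() {
  MulScalar(2.0f, 3.0f);           // float: always enabled
  MulScalar<int16_t, true>(2, 3);  // small integer: explicit opt-in required
  // MulScalar<int16_t>(2, 3);     // would fail to compile without opt-in
  return 0;
}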
- TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4); - TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4); - const RuntimeShape input_shape = - RuntimeShape::ExtendedShape(4, unextended_input_shape); - const RuntimeShape output_shape = - RuntimeShape::ExtendedShape(4, unextended_output_shape); - const int output_batch = output_shape.Dims(0); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - const int output_depth = output_shape.Dims(3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const float num_elements_in_axis = input_width * input_height; - - TFLITE_CHECK_EQ(op_params.axis_count, 2); - TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) || - (op_params.axis[0] == 2 && op_params.axis[1] == 1)); - TFLITE_CHECK_EQ(output_height, 1); - TFLITE_CHECK_EQ(output_width, 1); - - constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min(); - constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max(); - - float temp = input_zero_point * input_scale / output_scale; - temp = temp > 0 ? temp + 0.5f : temp - 0.5f; - int32_t bias = output_zero_point - static_cast<int32_t>(temp); - double real_scale = - static_cast<double>(input_scale / (num_elements_in_axis * output_scale)); - - int32_t multiplier; - int shift; - QuantizeMultiplier(real_scale, &multiplier, &shift); - for (int out_b = 0; out_b < output_batch; ++out_b) { - for (int out_d = 0; out_d < output_depth; ++out_d) { - int32_t acc = 0; - for (int in_h = 0; in_h < input_height; ++in_h) { - for (int in_w = 0; in_w < input_width; ++in_w) { - acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; - } - } - acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift); - acc += bias; - acc = std::min(std::max(acc, kMinValue), kMaxValue); - output_data[Offset(output_shape, out_b, 0, 0, out_d)] = - static_cast<uint8_t>(acc); - } - } -} - // Computes the mean of elements across dimensions given in axis. // It does so in two stages, first calculates the sum of elements along the axis // then divides it by the number of element in axis for quantized values. template <typename T, typename U> inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point, - float input_scale, const int* input_dims, - const int input_num_dims, T* output_data, - int32_t output_zero_point, float output_scale, + const int* input_dims, const int input_num_dims, + T* output_data, int32_t output_multiplier, + int output_shift, int32_t output_zero_point, const int* output_dims, const int output_num_dims, const int* axis, const int num_axis_dimensions, bool keep_dims, int* temp_index, int* resolved_axis, U* temp_sum, bool compute_sum) { + const int32_t kMinValue = std::numeric_limits<T>::min(); + const int32_t kMaxValue = std::numeric_limits<T>::max(); const bool uint8_case = std::is_same<T, uint8_t>::value; const bool int16_case = std::is_same<T, int16_t>::value; if (uint8_case) { @@ -430,40 +373,46 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point, } // Calculate mean by dividing output_data by num of aggregated element. - size_t num_elements_in_axis = 1; + int64_t num_elements_in_axis = 1; for (int idx = 0; idx < num_resolved_axis; ++idx) { size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]); // Overflow prevention.
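// The rewritten QuantizedMeanOrSum below drops the float scale/bias math: for
// a mean, the 1/num_elements_in_axis factor is folded into the integer output
// multiplier. Worked example with illustrative values: averaging over
// 3 x 3 = 9 elements with output_multiplier == 1 << 30 (0.5 in Q0.31) and
// output_shift == 0 gives
//   shift             = 63 - CountLeadingZeros(uint64_t{9}) = 3
//   output_multiplier = ((int64_t{1} << 30) << 3) / 9 = 954437176 (~0.444)
//   output_shift      = 0 - 3 = -3
// so the effective scale is 0.444 * 2^-3 ~= 0.0556 == 0.5 / 9. The guard just
// below first keeps the int64_t product of the reduced axis sizes from
// overflowing while num_elements_in_axis is accumulated.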
- if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) { + if (current > static_cast<size_t>(std::numeric_limits<int64_t>::max() / + num_elements_in_axis)) { return false; } num_elements_in_axis *= current; } - if (num_elements_in_axis > 0) { - const float scale = input_scale / output_scale; - if (compute_sum) { - // TODO(b/116341117): Eliminate float and do this completely in 8bit. - const float bias = -input_zero_point * scale * num_elements_in_axis; - for (size_t idx = 0; idx < num_outputs; ++idx) { - const U value = - static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) + - output_zero_point; - output_data[idx] = static_cast<T>(value); - } - } else { - const float bias = -input_zero_point * scale; - for (size_t idx = 0; idx < num_outputs; ++idx) { - float float_mean = static_cast<float>(temp_sum[idx]) / - static_cast<float>(num_elements_in_axis); - float result = TfLiteMin( - TfLiteRound(float_mean * scale + bias) + output_zero_point, - static_cast<float>(std::numeric_limits<T>::max())); - result = TfLiteMax(result, - static_cast<float>(std::numeric_limits<T>::min())); - output_data[idx] = static_cast<T>(result); - } - } + if (num_elements_in_axis == 0) { + return true; + } + + // Readapt output rescaling when calculating the mean to integrate a + // 1/num_elements_in_axis multiplier. + if (!compute_sum) { + TFLITE_DCHECK_GE(num_elements_in_axis, 0); + int shift = + 63 - CountLeadingZeros(static_cast<uint64_t>(num_elements_in_axis)); + // To avoid any overflow risk 'shift' should be <= 32 and to satisfy + // 'MultiplyByQuantizedMultiplier' pre-conditions 'output_shift - shift' + // should be >= -31. Clamp the value at the price of some precision loss. + shift = std::min(shift, 32); + shift = std::min(shift, 31 + output_shift); + output_multiplier = static_cast<int32_t>( + (static_cast<int64_t>(output_multiplier) << shift) / + num_elements_in_axis); + output_shift = output_shift - shift; + } + + for (size_t idx = 0; idx < num_outputs; ++idx) { + const U shifted_sum = + static_cast<U>(temp_sum[idx] - input_zero_point * num_elements_in_axis); + int32_t output = MultiplyByQuantizedMultiplier( + shifted_sum, output_multiplier, output_shift) + + output_zero_point; + output = std::min(std::max(output, kMinValue), kMaxValue); + output_data[idx] = static_cast<T>(output); } return true; } @@ -478,8 +427,8 @@ inline bool QuantizedMeanOrSumExtraArgs( bool keep_dims, int* temp_index, int* resolved_axis, U* temp_sum, bool compute_sum) { return QuantizedMeanOrSum( - input_data, input_zero_point, input_scale, input_dims, input_num_dims, - output_data, output_zero_point, output_scale, output_dims, + input_data, input_zero_point, input_dims, input_num_dims, output_data, + output_multiplier, output_shift, output_zero_point, output_dims, output_num_dims, axis, num_axis_dimensions, keep_dims, temp_index, resolved_axis, temp_sum, compute_sum); } diff --git a/src/tensorflow/lite/kernels/internal/runtime_shape.h b/src/tensorflow/lite/kernels/internal/runtime_shape.h index c2678b57..0e4df2c3 100644 --- a/src/tensorflow/lite/kernels/internal/runtime_shape.h +++ b/src/tensorflow/lite/kernels/internal/runtime_shape.h @@ -15,6 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_ +#include "tensorflow/lite/kernels/internal/compatibility.h" + namespace tflite { template <int N> @@ -27,16 +29,19 @@ class RuntimeShape { public: RuntimeShape& operator=(RuntimeShape const&) = delete; - // RuntimeShape in TFLM supports up to 5 dimensions. + // RuntimeShape in TFLM supports up to 6 dimensions.
// The name kMaxSmallSize comes from the same file of the upstream // tensorflow lite repo and need to be kept the same for max reuse. - static constexpr int kMaxSmallSize = 5; + static constexpr int kMaxSmallSize = 6; RuntimeShape() : size_(0) {} - explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {} + explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) { + TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize); + } RuntimeShape(int shape_size, int32_t value) : size_(shape_size) { + TFLITE_DCHECK_LE(shape_size, kMaxSmallSize); for (int i = 0; i < shape_size; ++i) { SetDim(i, value); } @@ -44,6 +49,7 @@ class RuntimeShape { RuntimeShape(int dimensions_count, const int32_t* dims_data) : size_(dimensions_count) { + // check of dimensions_count handled by ReplaceWith() ReplaceWith(dimensions_count, dims_data); } @@ -69,6 +75,7 @@ class RuntimeShape { static RuntimeShape ExtendedShape(int new_shape_size, const RuntimeShape& shape) { + TFLITE_DCHECK_LE(new_shape_size, kMaxSmallSize); return RuntimeShape(new_shape_size, shape, 1); } int32_t* DimsData() { return dims_; } @@ -76,6 +83,7 @@ class RuntimeShape { const int32_t* DimsDataUpTo5D() const { return dims_; } void ReplaceWith(int dimensions_count, const int32_t* dims_data) { + TFLITE_DCHECK_LE(dimensions_count, kMaxSmallSize); size_ = dimensions_count; int32_t* dst_dims = DimsData(); std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); diff --git a/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.h b/src/tensorflow/lite/kernels/internal/tensor_ctypes.cpp similarity index 50% rename from src/tensorflow/lite/experimental/microfrontend/lib/fft_util.h rename to src/tensorflow/lite/kernels/internal/tensor_ctypes.cpp index 6a471301..6bd58fc1 100644 --- a/src/tensorflow/lite/experimental/microfrontend/lib/fft_util.h +++ b/src/tensorflow/lite/kernels/internal/tensor_ctypes.cpp @@ -1,4 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,23 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_UTIL_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_UTIL_H_ -#include "tensorflow/lite/experimental/microfrontend/lib/fft.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#ifdef __cplusplus -extern "C" { -#endif +#include <vector> -// Prepares and FFT for the given input size. -int FftPopulateState(struct FftState* state, size_t input_size); +namespace tflite { -// Frees any allocated buffers.
-void FftFreeStateContents(struct FftState* state); +RuntimeShape GetTensorShape(const TfLiteTensor* tensor) { + if (tensor == nullptr) { + return RuntimeShape(); + } -#ifdef __cplusplus -} // extern "C" -#endif + TfLiteIntArray* dims = tensor->dims; + const int dims_size = dims->size; + const int32_t* dims_data = reinterpret_cast<const int32_t*>(dims->data); + return RuntimeShape(dims_size, dims_data); +} -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_UTIL_H_ +RuntimeShape GetTensorShape(std::vector<int32_t> data) { + return RuntimeShape(data.size(), data.data()); +} + +} // namespace tflite diff --git a/src/tensorflow/lite/kernels/internal/tensor_ctypes.h b/src/tensorflow/lite/kernels/internal/tensor_ctypes.h index 7e639b91..9a7205c0 100644 --- a/src/tensorflow/lite/kernels/internal/tensor_ctypes.h +++ b/src/tensorflow/lite/kernels/internal/tensor_ctypes.h @@ -15,7 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_ +#include <vector> + #include "tensorflow/lite/core/c/common.h" +#include "tensorflow/lite/core/macros.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { @@ -31,16 +34,8 @@ inline const T* GetTensorData(const TfLiteTensor* tensor) { : nullptr; } -inline RuntimeShape GetTensorShape(const TfLiteTensor* tensor) { - if (tensor == nullptr) { - return RuntimeShape(); - } - - TfLiteIntArray* dims = tensor->dims; - const int dims_size = dims->size; - const int32_t* dims_data = reinterpret_cast<const int32_t*>(dims->data); - return RuntimeShape(dims_size, dims_data); -} +TFLITE_NOINLINE RuntimeShape GetTensorShape(const TfLiteTensor* tensor); +RuntimeShape GetTensorShape(std::vector<int32_t> data); } // namespace tflite diff --git a/src/tensorflow/lite/kernels/internal/types.h b/src/tensorflow/lite/kernels/internal/types.h index 77f741bb..043a8513 100644 --- a/src/tensorflow/lite/kernels/internal/types.h +++ b/src/tensorflow/lite/kernels/internal/types.h @@ -1,4 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -659,6 +659,9 @@ struct ArithmeticParams { // int64_t activation params. int64_t int64_activation_min; int64_t int64_activation_max; + // int16_t activation params. + int16_t int16_activation_min; + int16_t int16_activation_max; // Processed output dimensions. // Let input "a" be the one that broadcasts in the faster-changing dimension.
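// GetTensorShape and the other helpers marked TFLITE_NOINLINE above are now
// defined out of line: on size-constrained micro targets this trades a call
// for not duplicating the body at every call site. The macro comes from
// tensorflow/lite/core/macros.h and expands to the compiler's noinline
// attribute roughly as in this sketch; treat the exact guards as an
// assumption rather than the verbatim header:

#if defined(__GNUC__) || defined(__clang__)
#define TFLITE_NOINLINE __attribute__((noinline))
#else
#define TFLITE_NOINLINE
#endif

// Usage mirrors the declarations above:
TFLITE_NOINLINE int HeavyHelper(int x) { return x * x; }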
@@ -1022,6 +1025,18 @@ inline void SetActivationParams(int32_t min, int32_t max, P* params) { params->quantized_activation_max = max; } +template <typename P> +inline void SetActivationParams(uint32_t min, uint32_t max, P* params) { + params->quantized_activation_min = min; + params->quantized_activation_max = max; +} + +template <typename P> +inline void SetActivationParams(int16_t min, int16_t max, P* params) { + params->int16_activation_min = min; + params->int16_activation_max = max; +} + template <typename P> inline void SetActivationParams(int64_t min, int64_t max, P* params) { params->int64_activation_min = min; @@ -1034,6 +1049,18 @@ inline void GetActivationParams(const P& params, int32_t* min, int32_t* max) { *max = params.quantized_activation_max; } +template <typename P> +inline void GetActivationParams(const P& params, uint32_t* min, uint32_t* max) { + *min = params.quantized_activation_min; + *max = params.quantized_activation_max; +} + +template <typename P> +inline void GetActivationParams(const P& params, int16_t* min, int16_t* max) { + *min = params.int16_activation_min; + *max = params.int16_activation_max; +} + template <typename P> inline void GetActivationParams(const P& params, float* min, float* max) { *min = params.float_activation_min; diff --git a/src/tensorflow/lite/micro/fake_micro_context.cpp b/src/tensorflow/lite/micro/fake_micro_context.cpp index 81f74ae3..03ea6dfc 100644 --- a/src/tensorflow/lite/micro/fake_micro_context.cpp +++ b/src/tensorflow/lite/micro/fake_micro_context.cpp @@ -39,16 +39,26 @@ FakeMicroContext::FakeMicroContext(TfLiteTensor* tensors, allocator_(allocator) {} TfLiteTensor* FakeMicroContext::AllocateTempTfLiteTensor(int tensor_index) { - allocated_tensor_count_++; + allocated_temp_count_++; return &tensors_[tensor_index]; } void FakeMicroContext::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) { - allocated_tensor_count_--; + allocated_temp_count_--; } bool FakeMicroContext::IsAllTempTfLiteTensorDeallocated() { - return !allocated_tensor_count_; + return !allocated_temp_count_; +} + +uint8_t* FakeMicroContext::AllocateTempBuffer(size_t size, size_t alignment) { + allocated_temp_count_++; + return allocator_->AllocateTemp(size, alignment); +} + +void FakeMicroContext::DeallocateTempBuffer(uint8_t* buffer) { + allocated_temp_count_--; + allocator_->DeallocateTemp(buffer); } TfLiteEvalTensor* FakeMicroContext::GetEvalTensor(int tensor_index) { diff --git a/src/tensorflow/lite/micro/fake_micro_context.h b/src/tensorflow/lite/micro/fake_micro_context.h index 31b39d38..b068f326 100644 --- a/src/tensorflow/lite/micro/fake_micro_context.h +++ b/src/tensorflow/lite/micro/fake_micro_context.h @@ -21,6 +21,10 @@ limitations under the License. namespace tflite { // A fake of MicroContext for kernel util tests. +// TODO(b/272759060): FakeMicroContext currently inherits from MicroContext. +// Which allow tests to use functions from MicroContext that weren't added to +// FakeMicroContext in tests. This should be looked into further.
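// The new SetActivationParams/GetActivationParams overloads above route by
// argument type: int16_t bounds land in the dedicated int16_activation_*
// fields rather than the int32 ones. A minimal sketch of the dispatch (the
// struct below is a stand-in showing only the relevant fields):

#include <cstdint>

struct Params {
  int32_t quantized_activation_min = 0;
  int32_t quantized_activation_max = 0;
  int16_t int16_activation_min = 0;
  int16_t int16_activation_max = 0;
};

template <typename P>
void SetActivationParams(int32_t min, int32_t max, P* params) {
  params->quantized_activation_min = min;
  params->quantized_activation_max = max;
}

template <typename P>
void SetActivationParams(int16_t min, int16_t max, P* params) {
  params->int16_activation_min = min;
  params->int16_activation_max = max;
}

int main() {
  Params p;
  // Exact-match int16_t arguments select the int16 overload ...
  SetActivationParams(int16_t{-100}, int16_t{100}, &p);
  // ... while int32_t arguments still fill the quantized_activation_* fields.
  SetActivationParams(int32_t{-1000}, int32_t{1000}, &p);
  return 0;
}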
+ class FakeMicroContext : public MicroContext { public: FakeMicroContext(TfLiteTensor* tensors, SingleArenaBufferAllocator* allocator, @@ -35,6 +39,9 @@ class FakeMicroContext : public MicroContext { void DeallocateTempTfLiteTensor(TfLiteTensor* tensor) override; bool IsAllTempTfLiteTensorDeallocated(); + uint8_t* AllocateTempBuffer(size_t size, size_t alignment) override; + void DeallocateTempBuffer(uint8_t* buffer) override; + TfLiteEvalTensor* GetEvalTensor(int tensor_index) override; private: @@ -44,7 +51,7 @@ class FakeMicroContext : public MicroContext { uint8_t* scratch_buffers_[kNumScratchBuffers_]; TfLiteTensor* tensors_; - int allocated_tensor_count_ = 0; + int allocated_temp_count_ = 0; SingleArenaBufferAllocator* allocator_; diff --git a/src/tensorflow/lite/micro/kernels/activations.cpp b/src/tensorflow/lite/micro/kernels/activations.cpp index 716dd6fc..3227ffbf 100644 --- a/src/tensorflow/lite/micro/kernels/activations.cpp +++ b/src/tensorflow/lite/micro/kernels/activations.cpp @@ -109,11 +109,11 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_RELU() { +TfLiteRegistration_V1 Register_RELU() { return tflite::micro::RegisterOp(ReluInit, ReluPrepare, ReluEval); } -TfLiteRegistration Register_RELU6() { +TfLiteRegistration_V1 Register_RELU6() { return tflite::micro::RegisterOp(Relu6Init, Relu6Prepare, Relu6Eval); } diff --git a/src/tensorflow/lite/micro/kernels/add.h b/src/tensorflow/lite/micro/kernels/add.h index 5b7be70c..6ec489c0 100644 --- a/src/tensorflow/lite/micro/kernels/add.h +++ b/src/tensorflow/lite/micro/kernels/add.h @@ -60,17 +60,17 @@ TfLiteStatus CalculateOpDataAdd(TfLiteContext* context, TfLiteAddParams* params, TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node); // Generic must define registration function. -TfLiteRegistration Register_ADD(); +TfLiteRegistration_V1 Register_ADD(); #if defined(ARDUINO) -TfLiteRegistration Register_ADD_INT8(); +TfLiteRegistration_V1 Register_ADD_INT8(); -TfLiteRegistration Register_ADD_INT16(); +TfLiteRegistration_V1 Register_ADD_INT16(); #else // Fallback registration -inline TfLiteRegistration Register_ADD_INT8() { return Register_ADD(); } +inline TfLiteRegistration_V1 Register_ADD_INT8() { return Register_ADD(); } -inline TfLiteRegistration Register_ADD_INT16() { return Register_ADD(); } +inline TfLiteRegistration_V1 Register_ADD_INT16() { return Register_ADD(); } #endif } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/add_common.cpp b/src/tensorflow/lite/micro/kernels/add_common.cpp index b285b800..cc945091 100644 --- a/src/tensorflow/lite/micro/kernels/add_common.cpp +++ b/src/tensorflow/lite/micro/kernels/add_common.cpp @@ -39,6 +39,8 @@ TfLiteStatus CalculateOpDataAdd(TfLiteContext* context, TfLiteAddParams* params, data->requires_broadcast = !HaveSameShapes(input1, input2); if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) { + TFLITE_CHECK_NE(output->quantization.type, kTfLiteNoQuantization); + // 8bit -> 8bit general quantized path, with general rescalings data->input1_offset = -input1->params.zero_point; data->input2_offset = -input2->params.zero_point; @@ -97,6 +99,14 @@ TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_STATUS( CalculateOpDataAdd(context, params, input1, input2, output, data)); + if (output->type == kTfLiteInt32) { + // Only support int32 unquantized add for now. 
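// The kTfLiteNoQuantization checks that follow exist because the int32 add
// path is a plain integer addition: the kernels pin the activation range to
// the full int32 span (as in the CMSIS-NN EvalAdd later in this diff) and
// never rescale, so scales or zero points on an int32 tensor would be
// silently ignored rather than honored.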
+ TF_LITE_ENSURE_EQ(context, input1->quantization.type, + kTfLiteNoQuantization); + TF_LITE_ENSURE_EQ(context, input2->quantization.type, + kTfLiteNoQuantization); + } + micro_context->DeallocateTempTfLiteTensor(input1); micro_context->DeallocateTempTfLiteTensor(input2); micro_context->DeallocateTempTfLiteTensor(output); diff --git a/src/tensorflow/lite/micro/kernels/add_n.cpp b/src/tensorflow/lite/micro/kernels/add_n.cpp index 1139e1a9..eea554be 100644 --- a/src/tensorflow/lite/micro/kernels/add_n.cpp +++ b/src/tensorflow/lite/micro/kernels/add_n.cpp @@ -208,7 +208,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_ADD_N() { +TfLiteRegistration_V1 Register_ADD_N() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/arg_min_max.cpp b/src/tensorflow/lite/micro/kernels/arg_min_max.cpp index 7c78e475..c38c19b3 100644 --- a/src/tensorflow/lite/micro/kernels/arg_min_max.cpp +++ b/src/tensorflow/lite/micro/kernels/arg_min_max.cpp @@ -107,11 +107,11 @@ TfLiteStatus ArgMaxEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_ARG_MAX() { +TfLiteRegistration_V1 Register_ARG_MAX() { return tflite::micro::RegisterOp(nullptr, nullptr, ArgMaxEval); } -TfLiteRegistration Register_ARG_MIN() { +TfLiteRegistration_V1 Register_ARG_MIN() { return tflite::micro::RegisterOp(nullptr, nullptr, ArgMinEval); } diff --git a/src/tensorflow/lite/micro/kernels/assign_variable.cpp b/src/tensorflow/lite/micro/kernels/assign_variable.cpp index f3aa12fa..a29fa57b 100644 --- a/src/tensorflow/lite/micro/kernels/assign_variable.cpp +++ b/src/tensorflow/lite/micro/kernels/assign_variable.cpp @@ -60,9 +60,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { MicroGraph& graph_info = micro_context->graph(); MicroResourceVariables* resources = graph_info.GetResourceVariables(); - TF_LITE_ENSURE_OK(context, - resources->Allocate(input_resource_id_tensor->data.i32[0], - context, input_value)); + // If the data field of this tensor is nullptr, we assume that this is a case + // of using resource variables in another subgraph, and the resource_id + // will be valid during Eval time. In case it wasn't valid, this will + // still be caught during Invoke. More info in b/277231654. + if (input_resource_id_tensor->data.i32 != nullptr) { + TF_LITE_ENSURE_OK(context, + resources->Allocate(input_resource_id_tensor->data.i32[0], + context, input_value)); + } micro_context->DeallocateTempTfLiteTensor(input_value); return kTfLiteOk; @@ -94,7 +100,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. -TfLiteRegistration Register_ASSIGN_VARIABLE() { +TfLiteRegistration_V1 Register_ASSIGN_VARIABLE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/batch_to_space_nd.cpp b/src/tensorflow/lite/micro/kernels/batch_to_space_nd.cpp index 83fb3568..29ca2ff9 100644 --- a/src/tensorflow/lite/micro/kernels/batch_to_space_nd.cpp +++ b/src/tensorflow/lite/micro/kernels/batch_to_space_nd.cpp @@ -105,7 +105,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
-TfLiteRegistration Register_BATCH_TO_SPACE_ND() { +TfLiteRegistration_V1 Register_BATCH_TO_SPACE_ND() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/broadcast_args.cpp b/src/tensorflow/lite/micro/kernels/broadcast_args.cpp index be2672ec..a526971c 100644 --- a/src/tensorflow/lite/micro/kernels/broadcast_args.cpp +++ b/src/tensorflow/lite/micro/kernels/broadcast_args.cpp @@ -83,7 +83,7 @@ TfLiteStatus BroadcastArgsEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_BROADCAST_ARGS() { +TfLiteRegistration_V1 Register_BROADCAST_ARGS() { return tflite::micro::RegisterOp(nullptr, BroadcastArgsPrepare, BroadcastArgsEval); } diff --git a/src/tensorflow/lite/micro/kernels/broadcast_to.cpp b/src/tensorflow/lite/micro/kernels/broadcast_to.cpp index 63a14db2..9a32331f 100644 --- a/src/tensorflow/lite/micro/kernels/broadcast_to.cpp +++ b/src/tensorflow/lite/micro/kernels/broadcast_to.cpp @@ -115,7 +115,7 @@ TfLiteStatus BroadcastToEval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_BROADCAST_TO() { +TfLiteRegistration_V1 Register_BROADCAST_TO() { return tflite::micro::RegisterOp(nullptr, BroadcastToPrepare, BroadcastToEval); } diff --git a/src/tensorflow/lite/micro/kernels/call_once.cpp b/src/tensorflow/lite/micro/kernels/call_once.cpp index 200242b2..9fdf7d05 100644 --- a/src/tensorflow/lite/micro/kernels/call_once.cpp +++ b/src/tensorflow/lite/micro/kernels/call_once.cpp @@ -81,7 +81,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. -TfLiteRegistration Register_CALL_ONCE() { +TfLiteRegistration_V1 Register_CALL_ONCE() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/cast.cpp b/src/tensorflow/lite/micro/kernels/cast.cpp index 0a0204d2..6dd20d1f 100644 --- a/src/tensorflow/lite/micro/kernels/cast.cpp +++ b/src/tensorflow/lite/micro/kernels/cast.cpp @@ -107,7 +107,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_CAST() { +TfLiteRegistration_V1 Register_CAST() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/ceil.cpp b/src/tensorflow/lite/micro/kernels/ceil.cpp index dbcd57c2..5716afef 100644 --- a/src/tensorflow/lite/micro/kernels/ceil.cpp +++ b/src/tensorflow/lite/micro/kernels/ceil.cpp @@ -66,7 +66,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_CEIL() { +TfLiteRegistration_V1 Register_CEIL() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/circular_buffer.cpp b/src/tensorflow/lite/micro/kernels/circular_buffer.cpp index 9779c32d..e598fc5a 100644 --- a/src/tensorflow/lite/micro/kernels/circular_buffer.cpp +++ b/src/tensorflow/lite/micro/kernels/circular_buffer.cpp @@ -108,8 +108,8 @@ TfLiteStatus CircularBufferEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration* Register_CIRCULAR_BUFFER() { - static TfLiteRegistration r = tflite::micro::RegisterOp( +TfLiteRegistration_V1* Register_CIRCULAR_BUFFER() { + static TfLiteRegistration_V1 r = tflite::micro::RegisterOp( CircularBufferInit, CircularBufferPrepare, CircularBufferEval); return &r; } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/add.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/add.cpp index 99ad9142..75cd3a52 100644 --- 
a/src/tensorflow/lite/micro/kernels/cmsis_nn/add.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/add.cpp @@ -198,29 +198,60 @@ TfLiteStatus EvalAddQuantizedInt16(TfLiteContext* context, TfLiteNode* node, return kTfLiteOk; } -void EvalAddFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteAddParams* params, const OpData* data, - const TfLiteEvalTensor* input1, - const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) { - tflite::ArithmeticParams op_params; - SetActivationParams(data->output_activation_min_f32, - data->output_activation_max_f32, &op_params); - if (data->requires_broadcast) { - reference_ops::BroadcastAdd4DSlow( - op_params, tflite::micro::GetTensorShape(input1), - tflite::micro::GetTensorData<float>(input1), - tflite::micro::GetTensorShape(input2), - tflite::micro::GetTensorData<float>(input2), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData<float>(output)); - } else { - reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), - tflite::micro::GetTensorData<float>(input1), - tflite::micro::GetTensorShape(input2), - tflite::micro::GetTensorData<float>(input2), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData<float>(output)); +TfLiteStatus EvalAdd(TfLiteContext* context, TfLiteNode* node, + TfLiteAddParams* params, const OpData* data, + const TfLiteEvalTensor* input1, + const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) { + switch (output->type) { + case kTfLiteFloat32: { + tflite::ArithmeticParams op_params; + SetActivationParams(data->output_activation_min_f32, + data->output_activation_max_f32, &op_params); + if (data->requires_broadcast) { + reference_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData<float>(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData<float>(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<float>(output)); + } else { + reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData<float>(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData<float>(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<float>(output)); + } + } break; + case kTfLiteInt32: { + tflite::ArithmeticParams op_params; + SetActivationParams(std::numeric_limits<int32_t>::lowest(), + std::numeric_limits<int32_t>::max(), &op_params); + if (data->requires_broadcast) { + reference_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData<int32_t>(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData<int32_t>(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<int32_t>(output)); + } else { + reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData<int32_t>(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData<int32_t>(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<int32_t>(output)); + } + } break; + default: + MicroPrintf("Type %s (%d) not supported.", + TfLiteTypeGetName(output->type), output->type); + return kTfLiteError; } + + return kTfLiteOk; } TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, @@ -282,6 +313,14 @@ TfLiteStatus PrepareAdd(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_STATUS( CalculateOpData(context, params, input1, input2, output, data)); + if (output->type == kTfLiteInt32) { + // Only support int32 unquantized add for now.
+ TF_LITE_ENSURE_EQ(context, input1->quantization.type, + kTfLiteNoQuantization); + TF_LITE_ENSURE_EQ(context, input2->quantization.type, + kTfLiteNoQuantization); + } + micro_context->DeallocateTempTfLiteTensor(input1); micro_context->DeallocateTempTfLiteTensor(input2); micro_context->DeallocateTempTfLiteTensor(output); @@ -302,8 +341,9 @@ TfLiteStatus EvalAdd(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); const OpData* data = static_cast<const OpData*>(node->user_data); - if (output->type == kTfLiteFloat32) { - EvalAddFloat(context, node, params, data, input1, input2, output); + if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) { + TF_LITE_ENSURE_OK( + context, EvalAdd(context, node, params, data, input1, input2, output)); } else if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) { TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data, input1, input2, output)); @@ -356,15 +396,15 @@ TfLiteStatus EvalAddInt16(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration Register_ADD() { +TfLiteRegistration_V1 Register_ADD() { return tflite::micro::RegisterOp(InitAdd, PrepareAdd, EvalAdd); } -TfLiteRegistration Register_ADD_INT8() { +TfLiteRegistration_V1 Register_ADD_INT8() { return tflite::micro::RegisterOp(InitAdd, PrepareAdd, EvalAddInt8); } -TfLiteRegistration Register_ADD_INT16() { +TfLiteRegistration_V1 Register_ADD_INT16() { return tflite::micro::RegisterOp(InitAdd, PrepareAdd, EvalAddInt16); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp index 504fd1ee..2655918c 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ limitations under the License.
#include "tensorflow/lite/micro/kernels/conv.h" -#include "third_party/cmsis_nn/Include/arm_nn_types.h" #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" @@ -467,15 +466,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_CONV_2D() { +TfLiteRegistration_V1 Register_CONV_2D() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } -TfLiteRegistration Register_CONV_2D_INT8() { +TfLiteRegistration_V1 Register_CONV_2D_INT8() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt8); } -TfLiteRegistration Register_CONV_2D_INT16() { +TfLiteRegistration_V1 Register_CONV_2D_INT16() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt16x8); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/depthwise_conv.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/depthwise_conv.cpp index ebc760c4..5d46447e 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/depthwise_conv.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/depthwise_conv.cpp @@ -433,15 +433,15 @@ TfLiteStatus EvalInt16x8(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_DEPTHWISE_CONV_2D() { +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } -TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT8() { +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT8() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt8); } -TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT16() { +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT16() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt16x8); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cpp index cacdf04a..0381b071 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cpp @@ -421,15 +421,15 @@ TfLiteStatus EvalInt16(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FULLY_CONNECTED() { +TfLiteRegistration_V1 Register_FULLY_CONNECTED() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } -TfLiteRegistration Register_FULLY_CONNECTED_INT8() { +TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT8() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt8); } -TfLiteRegistration Register_FULLY_CONNECTED_INT16() { +TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT16() { return tflite::micro::RegisterOp(Init, Prepare, EvalInt16); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/mul.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/mul.cpp index 5bbbf11f..8cc3027a 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/mul.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/mul.cpp @@ -169,15 +169,15 @@ TfLiteStatus EvalInt16(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration Register_MUL() { +TfLiteRegistration_V1 Register_MUL() { return tflite::micro::RegisterOp(MulInit, MulPrepare, Eval); } -TfLiteRegistration Register_MUL_INT8() { +TfLiteRegistration_V1 Register_MUL_INT8() { return tflite::micro::RegisterOp(MulInit, MulPrepare, EvalInt8); } -TfLiteRegistration Register_MUL_INT16() { +TfLiteRegistration_V1 Register_MUL_INT16() { return tflite::micro::RegisterOp(MulInit, MulPrepare, EvalInt16); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/pooling.cpp 
b/src/tensorflow/lite/micro/kernels/cmsis_nn/pooling.cpp index ce4ba765..e944ba2c 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/pooling.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/pooling.cpp @@ -319,27 +319,27 @@ TfLiteStatus MaxEvalInt16(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_AVERAGE_POOL_2D_INT8() { +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT8() { return tflite::micro::RegisterOp(Init, AveragePrepare, AverageEvalInt8); } -TfLiteRegistration Register_AVERAGE_POOL_2D_INT16() { +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT16() { return tflite::micro::RegisterOp(Init, AveragePrepare, AverageEvalInt16); } -TfLiteRegistration Register_AVERAGE_POOL_2D() { +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D() { return tflite::micro::RegisterOp(Init, AveragePrepare, AverageEval); } -TfLiteRegistration Register_MAX_POOL_2D_INT8() { +TfLiteRegistration_V1 Register_MAX_POOL_2D_INT8() { return tflite::micro::RegisterOp(Init, MaxPrepare, MaxEvalInt8); } -TfLiteRegistration Register_MAX_POOL_2D_INT16() { +TfLiteRegistration_V1 Register_MAX_POOL_2D_INT16() { return tflite::micro::RegisterOp(Init, MaxPrepare, MaxEvalInt16); } -TfLiteRegistration Register_MAX_POOL_2D() { +TfLiteRegistration_V1 Register_MAX_POOL_2D() { return tflite::micro::RegisterOp(Init, MaxPrepare, MaxEval); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/softmax.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/softmax.cpp index 9efe8943..93ae608d 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/softmax.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/softmax.cpp @@ -190,19 +190,19 @@ TfLiteStatus SoftmaxEvalInt16(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SOFTMAX() { +TfLiteRegistration_V1 Register_SOFTMAX() { return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEval); } -TfLiteRegistration Register_SOFTMAX_INT8() { +TfLiteRegistration_V1 Register_SOFTMAX_INT8() { return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEvalInt8); } -TfLiteRegistration Register_SOFTMAX_INT8_INT16() { +TfLiteRegistration_V1 Register_SOFTMAX_INT8_INT16() { return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEvalInt8_Int16); } -TfLiteRegistration Register_SOFTMAX_INT16() { +TfLiteRegistration_V1 Register_SOFTMAX_INT16() { return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEvalInt16); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp index 6941d223..c3ed8095 100644 --- a/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ limitations under the License. 
#include "tensorflow/lite/micro/kernels/svdf.h" -#include "third_party/cmsis_nn/Include/arm_nn_types.h" #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" @@ -213,11 +212,11 @@ TfLiteStatus EvalSvdfInt8(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SVDF() { +TfLiteRegistration_V1 Register_SVDF() { return tflite::micro::RegisterOp(Init, PrepareSvdf, EvalSvdf); } -TfLiteRegistration Register_SVDF_INT8() { +TfLiteRegistration_V1 Register_SVDF_INT8() { return tflite::micro::RegisterOp(Init, PrepareSvdf, EvalSvdfInt8); } diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cpp new file mode 100644 index 00000000..421d7666 --- /dev/null +++ b/src/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cpp @@ -0,0 +1,683 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Integer version of unidirectional sequence LSTM. Only the standard LSTM +// (defined in the keras LSTM layer, e.g., no peephole etc.) is supported here. +// Currently used by the 8 bits activation case only, except for fallbacks. 
+
+#include <cmath>
+#include <limits>
+
+#include "third_party/cmsis_nn/Include/arm_nnfunctions.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/fully_connected.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/lstm_eval.h"
+#include "tensorflow/lite/micro/kernels/lstm_shared.h"
+#include "tensorflow/lite/micro/kernels/micro_tensor_utils.h"
+
+namespace tflite {
+
+namespace {
+
+struct OpData {
+  OpDataLSTM params_ref;
+  cmsis_nn_lstm_params params_cmsis_nn;
+};
+
+/*Helper Functions*/
+TfLiteStatus PrecomputeZeroPointTimesWeightWithBias(
+    TfLiteContext* context, int32_t zero_point,
+    const TfLiteTensor* weight_tensor, const TfLiteTensor* bias_tensor,
+    int32_t** output) {
+  if (weight_tensor == nullptr) {
+    return kTfLiteOk;
+  }
+
+  const RuntimeShape& weight_shape = GetTensorShape(weight_tensor);
+  TF_LITE_ENSURE_EQ(context, weight_shape.DimensionsCount(), 2);
+  const int row = weight_shape.Dims(0);
+  const int col = weight_shape.Dims(1);
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  *output = static_cast<int32_t*>(
+      context->AllocatePersistentBuffer(context, row * sizeof(int32_t)));
+
+  if (bias_tensor == nullptr) {
+    memset(*output, 0, row * sizeof(int32_t));
+  } else {
+    const int32_t* bias = GetTensorData<int32_t>(bias_tensor);
+    memcpy(*output, bias, row * sizeof(int32_t));
+  }
+
+  if (zero_point != 0) {
+    const int8_t* weight = GetTensorData<int8_t>(weight_tensor);
+    tflite::tensor_utils::MatrixScalarMultiplyAccumulate(weight, zero_point,
+                                                         row, col, *output);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
+                             const LstmTensors& lstm_tensors, OpData* op_data) {
+  const TfLiteTensor* input = lstm_tensors.GetInternalTensor(kLstmInputTensor);
+  const TfLiteTensor* output_state =
+      lstm_tensors.GetInternalTensor(tflite::kLstmOutputStateTensor);
+
+  TF_LITE_ENSURE(context, input->type == kTfLiteInt8);
+
+  op_data->params_cmsis_nn.output_state_offset =
+      output_state->params.zero_point;
+
+  const TfLiteTensor* input_to_forget_weights =
+      lstm_tensors.GetInternalTensor(kLstmInputToForgetWeightsTensor);
+  const TfLiteTensor* input_to_input_weights =
+      lstm_tensors.GetInternalTensor(kLstmInputToInputWeightsTensor);
+  const TfLiteTensor* input_to_output_weights =
+      lstm_tensors.GetInternalTensor(kLstmInputToOutputWeightsTensor);
+  const TfLiteTensor* input_to_cell_weights =
+      lstm_tensors.GetInternalTensor(kLstmInputToCellWeightsTensor);
+  const TfLiteTensor* forget_gate_bias =
+      lstm_tensors.GetInternalTensor(kLstmForgetGateBiasTensor);
+  const TfLiteTensor* cell_state =
+      lstm_tensors.GetInternalTensor(kLstmCellStateTensor);
+
+  const TfLiteTensor* cell_gate_bias =
+      lstm_tensors.GetInternalTensor(kLstmCellGateBiasTensor);
+  const TfLiteTensor* output_gate_bias =
+      lstm_tensors.GetInternalTensor(kLstmOutputGateBiasTensor);
+  const TfLiteTensor* input_gate_bias =
+      lstm_tensors.GetInternalTensor(kLstmInputGateBiasTensor);
+  const TfLiteTensor* recurrent_to_forget_weights =
+      lstm_tensors.GetInternalTensor(kLstmRecurrentToForgetWeightsTensor);
+  const TfLiteTensor* recurrent_to_cell_weights =
+      lstm_tensors.GetInternalTensor(kLstmRecurrentToCellWeightsTensor);
+  const TfLiteTensor* recurrent_to_output_weights =
+      lstm_tensors.GetInternalTensor(kLstmRecurrentToOutputWeightsTensor);
+  const TfLiteTensor* recurrent_to_input_weights =
+      lstm_tensors.GetInternalTensor(kLstmRecurrentToInputWeightsTensor);
+
+  const TfLiteTensor* cell_to_output_weights =
+      lstm_tensors.GetInternalTensor(kLstmCellToOutputWeightsTensor);
+  const TfLiteTensor* forget_layer_norm_coefficients =
+      lstm_tensors.GetInternalTensor(kLstmForgetLayerNormCoefficientsTensor);
+  const TfLiteTensor* projection_weights =
+      lstm_tensors.GetInternalTensor(kLstmProjectionWeightsTensor);
+
+  const bool use_layer_norm = (forget_layer_norm_coefficients != nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+  const bool use_projection = (projection_weights != nullptr);
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool lstm_unsupported_config =
+      use_layer_norm || use_peephole || use_projection || use_cifg;
+  TFLITE_DCHECK(!lstm_unsupported_config);
+
+  // Pre-calculate bias + zero_point * weight.
+  int32_t* input_to_forget_effective_bias = nullptr;
+  int32_t* recurrent_to_forget_effective_bias = nullptr;
+  int32_t* input_to_cell_effective_bias = nullptr;
+  int32_t* recurrent_to_cell_effective_bias = nullptr;
+  int32_t* input_to_output_effective_bias = nullptr;
+  int32_t* recurrent_to_output_effective_bias = nullptr;
+  int32_t* input_to_input_effective_bias = nullptr;
+  int32_t* recurrent_to_input_effective_bias = nullptr;
+
+  const int32_t output_state_zero_point =
+      -op_data->params_cmsis_nn.output_state_offset;
+  const int32_t input_zero_point = -input->params.zero_point;
+
+  TF_LITE_ENSURE_OK(context,
+                    PrecomputeZeroPointTimesWeightWithBias(
+                        context, input_zero_point, input_to_forget_weights,
+                        forget_gate_bias, &input_to_forget_effective_bias));
+
+  TF_LITE_ENSURE_OK(context, PrecomputeZeroPointTimesWeightWithBias(
+                                 context, output_state_zero_point,
+                                 recurrent_to_forget_weights, nullptr,
+                                 &recurrent_to_forget_effective_bias));
+
+  // Modulation gate.
+  TF_LITE_ENSURE_OK(context,
+                    PrecomputeZeroPointTimesWeightWithBias(
+                        context, input_zero_point, input_to_cell_weights,
+                        cell_gate_bias, &input_to_cell_effective_bias));
+  TF_LITE_ENSURE_OK(
+      context, PrecomputeZeroPointTimesWeightWithBias(
+                   context, output_state_zero_point, recurrent_to_cell_weights,
+                   nullptr, &recurrent_to_cell_effective_bias));
+
+  // Output gate.
+  TF_LITE_ENSURE_OK(context,
+                    PrecomputeZeroPointTimesWeightWithBias(
+                        context, input_zero_point, input_to_output_weights,
+                        output_gate_bias, &input_to_output_effective_bias));
+
+  TF_LITE_ENSURE_OK(context, PrecomputeZeroPointTimesWeightWithBias(
+                                 context, output_state_zero_point,
+                                 recurrent_to_output_weights, nullptr,
+                                 &recurrent_to_output_effective_bias));
+
+  // Input gate. The calculation is only meaningful for non-cifg case.
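These PrecomputeZeroPointTimesWeightWithBias calls fold the constant zero-point contribution of each quantized matmul into the bias once, at prepare time. With the sign-flipped zero point $z$ passed in (note the negated offsets above), row $r$ of a weight matrix $W$ gets

$$ b^{\text{eff}}_r = b_r + z \sum_c W_{r,c}, $$

which lets the eval-time inner loop run a plain int8 dot product over the raw quantized activations and still land on the correctly offset accumulator. The input-gate pair below completes the same pattern for the fourth gate.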
+  TF_LITE_ENSURE_OK(context,
+                    PrecomputeZeroPointTimesWeightWithBias(
+                        context, input_zero_point, input_to_input_weights,
+                        input_gate_bias, &input_to_input_effective_bias));
+  TF_LITE_ENSURE_OK(
+      context, PrecomputeZeroPointTimesWeightWithBias(
+                   context, output_state_zero_point, recurrent_to_input_weights,
+                   nullptr, &recurrent_to_input_effective_bias));
+
+  op_data->params_cmsis_nn.i2f_effective_bias = input_to_forget_effective_bias;
+  op_data->params_cmsis_nn.r2f_effective_bias =
+      recurrent_to_forget_effective_bias;
+  op_data->params_cmsis_nn.i2c_effective_bias = input_to_cell_effective_bias;
+  op_data->params_cmsis_nn.r2c_effective_bias =
+      recurrent_to_cell_effective_bias;
+  op_data->params_cmsis_nn.i2o_effective_bias = input_to_output_effective_bias;
+  op_data->params_cmsis_nn.r2o_effective_bias =
+      recurrent_to_output_effective_bias;
+  op_data->params_cmsis_nn.i2i_effective_bias = input_to_input_effective_bias;
+  op_data->params_cmsis_nn.r2i_effective_bias =
+      recurrent_to_input_effective_bias;
+
+  // Get intermediate scales and zero points.
+  float intermediate_scale[5];
+  int32_t intermediate_zp[5];
+  for (int i = 0; i < 4; ++i) {
+    // Q3.12 for activation functions.
+    intermediate_scale[i] = std::pow(2.0f, -12.0f);
+    intermediate_zp[i] = 0;
+  }
+
+  MicroContext* micro_context = GetMicroContext(context);
+  // In the absence of projection, hidden becomes output and this intermediate
+  // is ignored.
+  TfLiteTensor* hidden = micro_context->AllocateTempIntermediateTensor(node, 4);
+  TF_LITE_ENSURE(context, hidden->quantization.type != kTfLiteNoQuantization);
+  auto* hidden_params =
+      static_cast<TfLiteAffineQuantization*>(hidden->quantization.params);
+  intermediate_scale[4] = hidden_params->scale->data[0];
+  intermediate_zp[4] = hidden_params->zero_point->data[0];
+  if (hidden != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(hidden);
+  }
+
+  // Scales.
+  const float default_scale = 1.0;
+  float input_scale = default_scale;
+  float input_to_input_weight_scale = default_scale;
+  float recurrent_to_input_weight_scale = default_scale;
+  float input_to_forget_weight_scale = default_scale;
+  float recurrent_to_forget_weight_scale = default_scale;
+  float input_to_cell_weight_scale = default_scale;
+  float recurrent_to_cell_weight_scale = default_scale;
+  float input_to_output_weight_scale = default_scale;
+  float recurrent_to_output_weight_scale = default_scale;
+  float output_state_scale = default_scale;
+  int cell_scale = 1;
+
+  // Effective scales.
+  float effective_input_to_input_scale = default_scale;
+  float effective_recurrent_to_input_scale = default_scale;
+  float effective_cell_to_input_scale = default_scale;
+  float effective_input_to_forget_scale = default_scale;
+  float effective_recurrent_to_forget_scale = default_scale;
+  float effective_cell_to_forget_scale = default_scale;
+  float effective_input_to_cell_scale = default_scale;
+  float effective_recurrent_to_cell_scale = default_scale;
+  float effective_input_to_output_scale = default_scale;
+  float effective_recurrent_to_output_scale = default_scale;
+  float effective_cell_to_output_scale = default_scale;
+  float effective_hidden_scale = default_scale;
+
+  // Populate scales.
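Before the scales are populated, a quick note on the Q3.12 format chosen for the four gate intermediates above: a real value $x$ is stored as the int16 quantity $q = \operatorname{round}(x \cdot 2^{12})$, so the representable range is $[-8, 8)$ at a resolution of $2^{-12} \approx 2.44 \times 10^{-4}$. That comfortably brackets the $[-1, 1]$ outputs of sigmoid and tanh while leaving headroom for the pre-activation sums fed into them.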
+  input_to_input_weight_scale = input_to_input_weights->params.scale;
+  recurrent_to_input_weight_scale = recurrent_to_input_weights->params.scale;
+
+  output_state_scale = output_state->params.scale;
+
+  input_to_forget_weight_scale = input_to_forget_weights->params.scale;
+  input_to_cell_weight_scale = input_to_cell_weights->params.scale;
+  input_to_output_weight_scale = input_to_output_weights->params.scale;
+  recurrent_to_forget_weight_scale = recurrent_to_forget_weights->params.scale;
+  recurrent_to_cell_weight_scale = recurrent_to_cell_weights->params.scale;
+  recurrent_to_output_weight_scale = recurrent_to_output_weights->params.scale;
+
+  // Check cell state (already used above).
+  TF_LITE_ENSURE(context, CheckedLog2(cell_state->params.scale, &cell_scale));
+  TF_LITE_ENSURE(context, cell_scale <= -9);
+
+  op_data->params_cmsis_nn.cell_state_shift = cell_scale;
+  input_scale = input->params.scale;
+
+  // Calculate effective scales.
+  effective_input_to_input_scale =
+      input_to_input_weight_scale * input_scale / intermediate_scale[0];
+  effective_recurrent_to_input_scale = recurrent_to_input_weight_scale *
+                                       output_state_scale /
+                                       intermediate_scale[0];
+
+  effective_input_to_forget_scale =
+      input_to_forget_weight_scale * input_scale / intermediate_scale[1];
+  effective_recurrent_to_forget_scale = recurrent_to_forget_weight_scale *
+                                        output_state_scale /
+                                        intermediate_scale[1];
+
+  effective_input_to_cell_scale =
+      input_to_cell_weight_scale * input_scale / intermediate_scale[2];
+  effective_recurrent_to_cell_scale = recurrent_to_cell_weight_scale *
+                                      output_state_scale /
+                                      intermediate_scale[2];
+
+  effective_input_to_output_scale =
+      input_to_output_weight_scale * input_scale / intermediate_scale[3];
+  effective_recurrent_to_output_scale = recurrent_to_output_weight_scale *
+                                        output_state_scale /
+                                        intermediate_scale[3];
+
+  effective_hidden_scale =
+      std::pow(2.0f, -15.0f) / intermediate_scale[4] * std::pow(2.0f, -15.0f);
+
+  // Decompose scales.
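"Decompose" here means splitting each real-valued effective scale into an int32 fixed-point multiplier plus a power-of-two shift, which is all the integer kernels can apply at run time. Below is a sketch of what QuantizeMultiplier (declared in quantization_util.h) does under the usual Q31 convention; treat it as an illustration rather than the exact library source.

#include <cmath>
#include <cstdint>

// Decompose real_multiplier ~= multiplier * 2^(shift - 31), with
// multiplier an int32 in [2^30, 2^31) for maximum precision.
void DecomposeScale(double real_multiplier, int32_t* multiplier, int* shift) {
  if (real_multiplier == 0.0) {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(real_multiplier, shift);  // q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {  // rounding pushed q up to 1.0: renormalize
    q_fixed /= 2;
    ++*shift;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
}

At eval time the runtime then computes roughly (x * multiplier) >> (31 - shift) with rounding, which is why every scaling struct filled in below stores exactly a multiplier and a shift.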
+  int shift_output;
+  QuantizeMultiplier(
+      static_cast<double>(effective_input_to_input_scale),
+      &op_data->params_cmsis_nn.input_to_input_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.input_to_input_scaling.shift =
+      static_cast<int32_t>(shift_output);
+
+  QuantizeMultiplier(
+      static_cast<double>(effective_recurrent_to_input_scale),
+      &op_data->params_cmsis_nn.recurrent_to_input_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.recurrent_to_input_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_cell_to_input_scale),
+                     &op_data->params_cmsis_nn.cell_to_input_scaling.multiplier,
+                     &shift_output);
+  op_data->params_cmsis_nn.cell_to_input_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_input_to_forget_scale),
+      &op_data->params_cmsis_nn.input_to_forget_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.input_to_forget_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_recurrent_to_forget_scale),
+      &op_data->params_cmsis_nn.recurrent_to_forget_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.recurrent_to_forget_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_cell_to_forget_scale),
+      &op_data->params_cmsis_nn.cell_to_forget_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.cell_to_forget_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_input_to_cell_scale),
+                     &op_data->params_cmsis_nn.input_to_cell_scaling.multiplier,
+                     &shift_output);
+  op_data->params_cmsis_nn.input_to_cell_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_recurrent_to_cell_scale),
+      &op_data->params_cmsis_nn.recurrent_to_cell_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.recurrent_to_cell_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_input_to_output_scale),
+      &op_data->params_cmsis_nn.input_to_output_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.input_to_output_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_recurrent_to_output_scale),
+      &op_data->params_cmsis_nn.recurrent_to_output_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.recurrent_to_output_scaling.shift =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(
+      static_cast<double>(effective_cell_to_output_scale),
+      &op_data->params_cmsis_nn.cell_to_output_scaling.multiplier,
+      &shift_output);
+  op_data->params_cmsis_nn.cell_to_output_scaling.shift =
+      static_cast<int32_t>(shift_output);
+
+  op_data->params_cmsis_nn.projection_scaling.shift =
+      static_cast<int32_t>(shift_output);
+
+  QuantizeMultiplier(static_cast<double>(effective_hidden_scale),
+                     &op_data->params_cmsis_nn.hidden_scaling.multiplier,
+                     &shift_output);
+  op_data->params_cmsis_nn.hidden_scaling.shift =
+      static_cast<int32_t>(shift_output);
+
+  op_data->params_cmsis_nn.hidden_offset = intermediate_zp[4];
+
+  op_data->params_cmsis_nn.activation.min = std::numeric_limits<int16_t>::min();
+  op_data->params_cmsis_nn.activation.max = std::numeric_limits<int16_t>::max();
+
+  return kTfLiteOk;
+}
+
+template <typename CellType>
+TfLiteStatus CMSIS_NN_EvalInteger8x8_16Lstm(
+    const OpData& op_data, const LSTMKernelContents& kernel_content,
+    const LSTMBuffers<CellType>& buffers) {
+  const OpDataLSTM& op_data_lstm = op_data.params_ref;
+  const TfLiteEvalTensor* input =
+      kernel_content.GetInternalTensor(tflite::kLstmInputTensor);
+  const TfLiteEvalTensor* input_gate_bias =
+      kernel_content.GetInternalTensor(tflite::kLstmInputGateBiasTensor);
+  const TfLiteEvalTensor* forget_gate_bias =
+      kernel_content.GetInternalTensor(tflite::kLstmForgetGateBiasTensor);
+  const TfLiteEvalTensor* cell_gate_bias =
+      kernel_content.GetInternalTensor(tflite::kLstmCellGateBiasTensor);
+  const TfLiteEvalTensor* output_gate_bias =
+      kernel_content.GetInternalTensor(tflite::kLstmOutputGateBiasTensor);
+  const TfLiteEvalTensor* input_to_output_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmInputToOutputWeightsTensor);
+  const TfLiteEvalTensor* recurrent_to_output_weights =
+      kernel_content.GetInternalTensor(
+          tflite::kLstmRecurrentToOutputWeightsTensor);
+  const TfLiteEvalTensor* input_to_input_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmInputToInputWeightsTensor);
+  const TfLiteEvalTensor* input_to_forget_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmInputToForgetWeightsTensor);
+  const TfLiteEvalTensor* input_to_cell_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmInputToCellWeightsTensor);
+  const TfLiteEvalTensor* recurrent_to_input_weights =
+      kernel_content.GetInternalTensor(
+          tflite::kLstmRecurrentToInputWeightsTensor);
+  const TfLiteEvalTensor* recurrent_to_forget_weights =
+      kernel_content.GetInternalTensor(
+          tflite::kLstmRecurrentToForgetWeightsTensor);
+  const TfLiteEvalTensor* recurrent_to_cell_weights =
+      kernel_content.GetInternalTensor(
+          tflite::kLstmRecurrentToCellWeightsTensor);
+  const TfLiteEvalTensor* cell_to_input_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmCellToInputWeightsTensor);
+  const TfLiteEvalTensor* cell_to_forget_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmCellToForgetWeightsTensor);
+  const TfLiteEvalTensor* cell_to_output_weights =
+      kernel_content.GetInternalTensor(tflite::kLstmCellToOutputWeightsTensor);
+  const TfLiteEvalTensor* cell_state =
+      kernel_content.GetInternalTensor(tflite::kLstmCellStateTensor);
+  const TfLiteEvalTensor* output_state =
+      kernel_content.GetInternalTensor(tflite::kLstmOutputStateTensor);
+  const TfLiteEvalTensor* output = kernel_content.output_tensor;
+
+  TFLITE_DCHECK(input->dims->size >= 2 && input->dims->size <= 3);
+
+  cmsis_nn_lstm_context scratch_buffers;
+  scratch_buffers.input_gate = reinterpret_cast<int16_t*>(buffers.buffer0);
+  scratch_buffers.forget_gate = reinterpret_cast<int16_t*>(buffers.buffer1);
+  scratch_buffers.cell_gate = reinterpret_cast<int16_t*>(buffers.buffer2);
+  scratch_buffers.output_gate = reinterpret_cast<int16_t*>(buffers.buffer3);
+
+  cmsis_nn_lstm_params cmsis_lstm_params = op_data.params_cmsis_nn;
+  cmsis_lstm_params.time_major = op_data_lstm.size_info.time_major;
+  cmsis_lstm_params.clip.cell =
+      op_data_lstm.cell_state_info.quantized_cell_clip;
+
+  cmsis_lstm_params.input_gate_bias = const_cast<int32_t*>(
+      tflite::micro::GetOptionalTensorData<int32_t>(input_gate_bias));
+  cmsis_lstm_params.forget_gate_bias = const_cast<int32_t*>(
+      tflite::micro::GetOptionalTensorData<int32_t>(forget_gate_bias));
+  cmsis_lstm_params.cell_gate_bias = const_cast<int32_t*>(
+      tflite::micro::GetOptionalTensorData<int32_t>(cell_gate_bias));
+  cmsis_lstm_params.output_gate_bias = const_cast<int32_t*>(
+      tflite::micro::GetOptionalTensorData<int32_t>(output_gate_bias));
+
+  const bool time_major = op_data_lstm.size_info.time_major;
+  const int n_input = input->dims->data[input->dims->size - 1];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  int max_time, n_batch;
+  if (input->dims->size == 2) {
+    max_time = 1;
+    n_batch = input->dims->data[0];
+  } else {
+    max_time = (time_major) ? input->dims->data[0] : input->dims->data[1];
+    n_batch = (time_major) ? input->dims->data[1] : input->dims->data[0];
+  }
+
+  cmsis_nn_lstm_dims lstm_dims;
+  lstm_dims.num_inputs = n_input;
+  lstm_dims.num_outputs = n_output;
+  lstm_dims.num_batches = n_batch;
+  lstm_dims.max_time = max_time;
+
+  arm_lstm_unidirectional_s16_s8(
+      &scratch_buffers,
+      const_cast<int8_t*>(tflite::micro::GetTensorData<int8_t>(input)),
+      &lstm_dims,
+      const_cast<int8_t*>(
+          tflite::micro::GetOptionalTensorData<int8_t>(input_to_input_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          input_to_forget_weights)),
+      const_cast<int8_t*>(
+          tflite::micro::GetOptionalTensorData<int8_t>(input_to_cell_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          input_to_output_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          recurrent_to_input_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          recurrent_to_forget_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          recurrent_to_cell_weights)),
+      const_cast<int8_t*>(tflite::micro::GetOptionalTensorData<int8_t>(
+          recurrent_to_output_weights)),
+      const_cast<int16_t*>(
+          tflite::micro::GetOptionalTensorData<int16_t>(cell_to_input_weights)),
+      const_cast<int16_t*>(tflite::micro::GetOptionalTensorData<int16_t>(
+          cell_to_forget_weights)),
+      const_cast<int16_t*>(tflite::micro::GetOptionalTensorData<int16_t>(
+          cell_to_output_weights)),
+      nullptr, &cmsis_lstm_params,
+      const_cast<int8_t*>(tflite::micro::GetTensorData<int8_t>(output_state)),
+      const_cast<int16_t*>(tflite::micro::GetTensorData<int16_t>(cell_state)),
+      const_cast<int8_t*>(tflite::micro::GetTensorData<int8_t>(output)));
+
+  return kTfLiteOk;
+}
+
+/*Kernel functions*/
+
+void* UnidirectionalSequenceLstmInit(TfLiteContext* context, const char* buffer,
+                                     size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+}
+
+TfLiteStatus UnidirectionalSequenceLstmPrepare(TfLiteContext* context,
+                                               TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 24);
+
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  TFLITE_DCHECK(node->user_data != nullptr);
+
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  OpDataLSTM* op_data_lstm = &op_data->params_ref;
+
+  const auto* builtin_data =
+      static_cast<TfLiteUnidirectionalSequenceLSTMParams*>(node->builtin_data);
+  // All TempTfLiteTensors will be deallocated through the destructor.
+  LstmTensors lstm_tensors(context, node);
+  TF_LITE_ENSURE_OK(context, lstm_tensors.ValidateTensorStatus(context));
+
+  op_data_lstm->cell_gate_nonlinear_type = builtin_data->activation;
+  op_data_lstm->size_info =
+      CreateLstmSizeInfo(builtin_data->time_major,
+                         lstm_tensors.GetInternalTensor(kLstmInputTensor)->dims,
+                         lstm_tensors.HiddenStateTensor()->dims);
+
+  const TfLiteTensor* input = lstm_tensors.GetInternalTensor(kLstmInputTensor);
+  const auto activation_type = input->type;
+
+  if (kTfLiteInt8 == activation_type) {
+    TF_LITE_ENSURE_STATUS(
+        CalculateOpData(context, node, lstm_tensors, op_data));
+  }
+
+  TF_LITE_ENSURE_OK(context, ValidateTensorSize(context, lstm_tensors,
+                                                op_data_lstm->size_info));
+
+  // Create cell state information and gate parameters (Fully Connected and
+  // Mul).
+  auto cell_state_type =
+      lstm_tensors.GetInternalTensor(kLstmCellStateTensor)->type;
+  if (cell_state_type == kTfLiteFloat32) {
+    op_data_lstm->cell_state_info =
+        CreateLstmCellStateInfoFloat(builtin_data->cell_clip);
+    TF_LITE_ENSURE_OK(context, PrepareGateParametersFloat(context, lstm_tensors,
+                                                          op_data_lstm));
+  } else if (cell_state_type == kTfLiteInt16) {
+    op_data_lstm->cell_state_info = CreateLstmCellStateInfo(
+        lstm_tensors.CellStateTensor()->params.scale, builtin_data->cell_clip);
+    TF_LITE_ENSURE_OK(context, PrepareGateParametersInteger(
+                                   context, lstm_tensors, op_data_lstm));
+  } else {
+    MicroPrintf(
+        "Cell state type %s (%d) not supported. The quantized Unidirectional "
+        "Sequence LSTM Op only supports int16 cell state",
+        TfLiteTypeGetName(cell_state_type), cell_state_type);
+    return kTfLiteError;
+  }
+  // Request buffers (four buffers).
+  for (size_t i = 0; i < 4; i++) {
+    TF_LITE_ENSURE_OK(context, context->RequestScratchBufferInArena(
+                                   context,
+                                   op_data_lstm->size_info.batch_size *
+                                       op_data_lstm->size_info.state_dimension *
+                                       TfLiteTypeGetSize(cell_state_type),
+                                   &(op_data_lstm->buffer_indices[i])));
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus UnidirectionalSequenceLstmEval(TfLiteContext* context,
+                                            TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData& op_data = *reinterpret_cast<const OpData*>(node->user_data);
+  const OpDataLSTM& op_data_lstm = op_data.params_ref;
+
+  auto kernel_content = CreateLSTMKernelContent(context, node);
+
+  const auto activation_type =
+      kernel_content.internal_tensors[kLstmInputTensor]->type;
+  const auto weight_type =
+      kernel_content.internal_tensors[kLstmInputToInputWeightsTensor]->type;
+
+  switch (activation_type) {
+    case kTfLiteFloat32: {
+      LSTMBuffers<float> buffers =
+          CreateLSTMBuffers<float>(context, op_data_lstm.buffer_indices);
+      EvalLstm<float, float, float, float>(op_data_lstm, kernel_content,
+                                           buffers);
+      break;
+    }
+    case kTfLiteInt8: {
+      switch (weight_type) {
+        case kTfLiteInt8: {
+          // 8(activation)x8(weight)->16(cell) LSTM with 32 bits bias
+          LSTMBuffers<int16_t> buffers =
+              CreateLSTMBuffers<int16_t>(context, op_data_lstm.buffer_indices);
+          return CMSIS_NN_EvalInteger8x8_16Lstm<int16_t>(
+              op_data, kernel_content, buffers);
+          break;
+        }
+        default: {
+          MicroPrintf("Filter type %s (%d) not supported.",
+                      TfLiteTypeGetName(weight_type), weight_type);
+          return kTfLiteError;
+        }
+      }
+      break;
+    }
+    case kTfLiteInt16: {
+      switch (weight_type) {
+        case kTfLiteInt8: {
+          // 16(activation)x8(weight)->16(cell) LSTM with 64 bits bias
+          LSTMBuffers<int16_t> buffers =
+              CreateLSTMBuffers<int16_t>(context, op_data_lstm.buffer_indices);
+          EvalLstm<int16_t, int8_t, int16_t, int64_t>(op_data_lstm,
+                                                      kernel_content, buffers);
+          break;
+        }
+        default: {
+          MicroPrintf("Filter type %s (%d) not supported.",
+                      TfLiteTypeGetName(weight_type), weight_type);
+          return kTfLiteError;
+        }
+      }
+      break;
+    }
+    default: {
+      MicroPrintf("Input type %s (%d) not supported.",
+                  TfLiteTypeGetName(activation_type), activation_type);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus UnidirectionalSequenceLstmEvalInt8(TfLiteContext* context,
+                                                TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData& op_data = *reinterpret_cast<const OpData*>(node->user_data);
+  const OpDataLSTM& op_data_lstm = op_data.params_ref;
+  auto kernel_content = CreateLSTMKernelContent(context, node);
+  const auto activation_type =
+      kernel_content.internal_tensors[kLstmInputTensor]->type;
+  const auto weight_type =
+      kernel_content.internal_tensors[kLstmInputToInputWeightsTensor]->type;
+
+  TFLITE_DCHECK(weight_type == kTfLiteInt8 &&
+                "Only int8 filter type supported.");
+
+  if (activation_type == kTfLiteInt8) {
+    LSTMBuffers<int16_t> buffers =
+        CreateLSTMBuffers<int16_t>(context, op_data_lstm.buffer_indices);
+
+    return CMSIS_NN_EvalInteger8x8_16Lstm<int16_t>(op_data, kernel_content,
+                                                   buffers);
+  } else {
+    MicroPrintf("Input type %s (%d) not supported.",
+                TfLiteTypeGetName(activation_type), activation_type);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM() {
+  return tflite::micro::RegisterOp(UnidirectionalSequenceLstmInit,
+                                   UnidirectionalSequenceLstmPrepare,
+                                   UnidirectionalSequenceLstmEval);
+}
+
+TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM_INT8() {
+  return tflite::micro::RegisterOp(UnidirectionalSequenceLstmInit,
+                                   UnidirectionalSequenceLstmPrepare,
+                                   UnidirectionalSequenceLstmEvalInt8);
+}
+
+}  // namespace tflite
diff --git a/src/tensorflow/lite/micro/kernels/comparisons.cpp b/src/tensorflow/lite/micro/kernels/comparisons.cpp
index 31ab9259..76a820a8 100644
--- a/src/tensorflow/lite/micro/kernels/comparisons.cpp
+++ b/src/tensorflow/lite/micro/kernels/comparisons.cpp
@@ -579,27 +579,27 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 }  // namespace
 
-TfLiteRegistration Register_EQUAL() {
+TfLiteRegistration_V1 Register_EQUAL() {
   return tflite::micro::RegisterOp(Init, Prepare, EqualEval);
 }
 
-TfLiteRegistration Register_NOT_EQUAL() {
+TfLiteRegistration_V1 Register_NOT_EQUAL() {
   return tflite::micro::RegisterOp(Init, Prepare, NotEqualEval);
 }
 
-TfLiteRegistration Register_GREATER() {
+TfLiteRegistration_V1 Register_GREATER() {
   return tflite::micro::RegisterOp(Init, Prepare, GreaterEval);
 }
 
-TfLiteRegistration Register_GREATER_EQUAL() {
+TfLiteRegistration_V1 Register_GREATER_EQUAL() {
   return tflite::micro::RegisterOp(Init, Prepare, GreaterEqualEval);
 }
 
-TfLiteRegistration Register_LESS() {
+TfLiteRegistration_V1 Register_LESS() {
   return tflite::micro::RegisterOp(Init, Prepare, LessEval);
 }
 
-TfLiteRegistration Register_LESS_EQUAL() {
+TfLiteRegistration_V1 Register_LESS_EQUAL() {
   return tflite::micro::RegisterOp(Init, Prepare, LessEqualEval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/concatenation.cpp b/src/tensorflow/lite/micro/kernels/concatenation.cpp
index 59157564..4e1a7968 100644
--- a/src/tensorflow/lite/micro/kernels/concatenation.cpp
+++ b/src/tensorflow/lite/micro/kernels/concatenation.cpp
@@ -18,7 +18,6 @@ limitations under the License.
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/internal/portable_tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -252,7 +251,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_CONCATENATION() { +TfLiteRegistration_V1 Register_CONCATENATION() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/conv.h b/src/tensorflow/lite/micro/kernels/conv.h index d50ddc6f..5ad35bca 100644 --- a/src/tensorflow/lite/micro/kernels/conv.h +++ b/src/tensorflow/lite/micro/kernels/conv.h @@ -76,37 +76,39 @@ TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node, TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_CONV_2D(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_CONV_2D(); #if defined(XTENSA) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 activations and int8 weights and always calls the reference // implementation. -TfLiteRegistration Register_CONV_2D_INT8REF(); +TfLiteRegistration_V1 Register_CONV_2D_INT8REF(); #else -inline TfLiteRegistration Register_CONV_2D_INT8REF() { +inline TfLiteRegistration_V1 Register_CONV_2D_INT8REF() { return Register_CONV_2D(); } #endif #if defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 activations and int8 weights and uses the latency optimized // implementations. -TfLiteRegistration Register_CONV_2D_INT8(); +TfLiteRegistration_V1 Register_CONV_2D_INT8(); -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int16 activations and int8 weights and uses the latency optimized // implementations. 
-TfLiteRegistration Register_CONV_2D_INT16(); +TfLiteRegistration_V1 Register_CONV_2D_INT16(); #else -inline TfLiteRegistration Register_CONV_2D_INT8() { return Register_CONV_2D(); } +inline TfLiteRegistration_V1 Register_CONV_2D_INT8() { + return Register_CONV_2D(); +} -inline TfLiteRegistration Register_CONV_2D_INT16() { +inline TfLiteRegistration_V1 Register_CONV_2D_INT16() { return Register_CONV_2D(); } #endif diff --git a/src/tensorflow/lite/micro/kernels/conv_test.h b/src/tensorflow/lite/micro/kernels/conv_test.h index aa7ea443..5ea0261e 100644 --- a/src/tensorflow/lite/micro/kernels/conv_test.h +++ b/src/tensorflow/lite/micro/kernels/conv_test.h @@ -28,35 +28,37 @@ namespace testing { TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, float* output_data); + TfLiteRegistration_V1 registration, float* output_data); TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, int8_t* output_data); + TfLiteRegistration_V1 registration, + int8_t* output_data); TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, uint8_t* output_data); + TfLiteRegistration_V1 registration, + uint8_t* output_data); TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, const float* expected_output_data, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, + TfLiteRegistration_V1 registration, float* output_data, float tolerance = 1e-5); TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, const int8_t* expected_output_data, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, + TfLiteRegistration_V1 registration, int8_t* output_data, float tolerance = 1e-5); TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, const uint8_t* expected_output_data, int output_length, TfLiteConvParams* conv_params, - TfLiteRegistration registration, + TfLiteRegistration_V1 registration, uint8_t* output_data, float tolerance = 1e-5); TfLiteStatus TestConvFloat(int* input_dims_data, const float* input_data, @@ -65,7 +67,8 @@ TfLiteStatus TestConvFloat(int* input_dims_data, const float* input_data, int* output_dims_data, const float* expected_output_data, TfLiteConvParams* conv_params, - TfLiteRegistration registration, float* output_data); + TfLiteRegistration_V1 registration, + float* output_data); TfLiteStatus TestConvQuantizedPerLayer( int* input_dims_data, const float* input_data, uint8_t* input_quantized, @@ -74,7 +77,7 @@ TfLiteStatus TestConvQuantizedPerLayer( const float* bias_data, int32_t* bias_quantized, int* output_dims_data, const float* expected_output_data, uint8_t* expected_output_quantized, float output_scale, TfLiteConvParams* conv_params, - TfLiteRegistration registration, uint8_t* output_data); + TfLiteRegistration_V1 registration, uint8_t* output_data); TfLiteStatus TestConvQuantizedPerChannel( int* input_dims_data, const float* input_data, int8_t* input_quantized, @@ -84,7 +87,7 @@ TfLiteStatus TestConvQuantizedPerChannel( float* bias_scales, int* bias_zero_points, int* output_dims_data, const float* expected_output_data, int8_t* expected_output_data_quantized, float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - TfLiteRegistration registration, int8_t* output_data, + 
TfLiteRegistration_V1 registration, int8_t* output_data, TfLiteType tensor_weight_type = kTfLiteNoType); TfLiteStatus TestConvQuantizedPerChannel( @@ -96,7 +99,7 @@ TfLiteStatus TestConvQuantizedPerChannel( int* bias_zero_points, int* output_dims_data, const float* expected_output_data, int16_t* expected_output_data_quantized, float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - TfLiteRegistration registration, int16_t* output_data); + TfLiteRegistration_V1 registration, int16_t* output_data); TfLiteStatus TestConvQuantizedPerChannel( int* input_dims_data, const float* input_data, int16_t* input_quantized, @@ -106,7 +109,7 @@ TfLiteStatus TestConvQuantizedPerChannel( float* bias_scales, int* bias_zero_points, int* output_dims_data, const float* expected_output_data, int16_t* expected_output_data_quantized, float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - TfLiteRegistration registration, int16_t* output_data); + TfLiteRegistration_V1 registration, int16_t* output_data); } // namespace testing } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/cumsum.cpp b/src/tensorflow/lite/micro/kernels/cumsum.cpp index 4f8a9659..1b005e6a 100644 --- a/src/tensorflow/lite/micro/kernels/cumsum.cpp +++ b/src/tensorflow/lite/micro/kernels/cumsum.cpp @@ -168,7 +168,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_CUMSUM() { +TfLiteRegistration_V1 Register_CUMSUM() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/depth_to_space.cpp b/src/tensorflow/lite/micro/kernels/depth_to_space.cpp index 7f229fbf..932e295c 100644 --- a/src/tensorflow/lite/micro/kernels/depth_to_space.cpp +++ b/src/tensorflow/lite/micro/kernels/depth_to_space.cpp @@ -135,7 +135,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_DEPTH_TO_SPACE() { +TfLiteRegistration_V1 Register_DEPTH_TO_SPACE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/depthwise_conv.h b/src/tensorflow/lite/micro/kernels/depthwise_conv.h index 589a02a5..72d4bf02 100644 --- a/src/tensorflow/lite/micro/kernels/depthwise_conv.h +++ b/src/tensorflow/lite/micro/kernels/depthwise_conv.h @@ -49,28 +49,28 @@ TfLiteStatus CalculateOpDataDepthwiseConv( TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_DEPTHWISE_CONV_2D(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D(); #if defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 activations and int8 weights and uses the latency optimized // implementations. 
-TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT8(); +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT8(); -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int16 activations and int8 weights and uses the latency optimized // implementations. -TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT16(); +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT16(); #else -inline TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT8() { +inline TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT8() { return Register_DEPTHWISE_CONV_2D(); } -inline TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT16() { +inline TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D_INT16() { return Register_DEPTHWISE_CONV_2D(); } #endif diff --git a/src/tensorflow/lite/micro/kernels/depthwise_conv_common.cpp b/src/tensorflow/lite/micro/kernels/depthwise_conv_common.cpp index 2a0ae2f4..6d5f6c27 100644 --- a/src/tensorflow/lite/micro/kernels/depthwise_conv_common.cpp +++ b/src/tensorflow/lite/micro/kernels/depthwise_conv_common.cpp @@ -188,6 +188,13 @@ TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node) { affine_quantization->zero_point->size); } + TF_LITE_ENSURE_MSG( + context, + input->type == filter->type || + (input->type == kTfLiteInt8 && + (filter->type == kTfLiteInt4 || filter->type == kTfLiteInt8)), + "Hybrid models are not supported on TFLite Micro."); + if (filter->type == kTfLiteInt4) { int filter_size = RuntimeShape(filter->dims->size, diff --git a/src/tensorflow/lite/micro/kernels/dequantize.cpp b/src/tensorflow/lite/micro/kernels/dequantize.cpp index f51db508..1a62176f 100644 --- a/src/tensorflow/lite/micro/kernels/dequantize.cpp +++ b/src/tensorflow/lite/micro/kernels/dequantize.cpp @@ -80,7 +80,7 @@ TfLiteStatus DequantizeEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration Register_DEQUANTIZE() { +TfLiteRegistration_V1 Register_DEQUANTIZE() { return tflite::micro::RegisterOp(DequantizeInit, DequantizePrepare, DequantizeEval); } diff --git a/src/tensorflow/lite/micro/kernels/detection_postprocess.cpp b/src/tensorflow/lite/micro/kernels/detection_postprocess.cpp index 7aadbbf8..3a750549 100644 --- a/src/tensorflow/lite/micro/kernels/detection_postprocess.cpp +++ b/src/tensorflow/lite/micro/kernels/detection_postprocess.cpp @@ -799,8 +799,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration* Register_DETECTION_POSTPROCESS() { - static TfLiteRegistration r = tflite::micro::RegisterOp(Init, Prepare, Eval); +TfLiteRegistration_V1* Register_DETECTION_POSTPROCESS() { + static TfLiteRegistration_V1 r = + tflite::micro::RegisterOp(Init, Prepare, Eval); return &r; } diff --git a/src/tensorflow/lite/micro/kernels/div.cpp b/src/tensorflow/lite/micro/kernels/div.cpp index 5c986126..8771ebc0 100644 --- a/src/tensorflow/lite/micro/kernels/div.cpp +++ b/src/tensorflow/lite/micro/kernels/div.cpp @@ -201,7 +201,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_DIV() { +TfLiteRegistration_V1 Register_DIV() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/elementwise.cpp b/src/tensorflow/lite/micro/kernels/elementwise.cpp index 81b27039..1f3b5ecb 100644 --- a/src/tensorflow/lite/micro/kernels/elementwise.cpp +++ b/src/tensorflow/lite/micro/kernels/elementwise.cpp @@ -1,4 +1,4 @@ -/* Copyright 
2022 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,9 +25,6 @@ limitations under the License.
 #include "tensorflow/lite/micro/micro_utils.h"
 
 namespace tflite {
-namespace ops {
-namespace micro {
-namespace elementwise {
 namespace {
 
 constexpr int kAbsNameId = 0;
@@ -351,9 +348,11 @@ TfLiteStatus RsqrtEval(TfLiteContext* context, TfLiteNode* node) {
           context, node, [](float f) { return 1.f / std::sqrt(f); },
           /*validate_input_func=*/nullptr, type);
     case kTfLiteInt8:
-      return EvalImplQuantized<int8_t>(context, node,
-                                       elementwise::RsqrtEvalQuantized,
-                                       elementwise::validate_input_func, type);
+      return EvalImplQuantized<int8_t>(context, node, RsqrtEvalQuantized,
+                                       validate_input_func, type);
+    case kTfLiteInt16:
+      return EvalImplQuantized<int16_t>(context, node, RsqrtEvalQuantized,
+                                        validate_input_func, type);
 
     default:
       MicroPrintf("Current data type %s is not supported.",
@@ -371,60 +370,47 @@ TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 }  // namespace
-}  // namespace elementwise
 
-TfLiteRegistration Register_ABS() {
+TfLiteRegistration_V1 Register_ABS() {
   return tflite::micro::RegisterOp(
-      elementwise::ElementWiseAbsRsqrtInit,
-      elementwise::PrepareAbsRsqrt,
-      elementwise::AbsEval);
+      ElementWiseAbsRsqrtInit, PrepareAbsRsqrt,
+      AbsEval);
 }
 
-TfLiteRegistration Register_SIN() {
+TfLiteRegistration_V1 Register_SIN() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::SinEval);
+      nullptr, GenericPrepare, SinEval);
 }
 
-TfLiteRegistration Register_COS() {
+TfLiteRegistration_V1 Register_COS() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::CosEval);
+      nullptr, GenericPrepare, CosEval);
 }
 
-TfLiteRegistration Register_LOG() {
+TfLiteRegistration_V1 Register_LOG() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::LogEval);
+      nullptr, GenericPrepare, LogEval);
 }
 
-TfLiteRegistration Register_SQRT() {
+TfLiteRegistration_V1 Register_SQRT() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::SqrtEval);
+      nullptr, GenericPrepare, SqrtEval);
 }
 
-TfLiteRegistration Register_RSQRT() {
+TfLiteRegistration_V1 Register_RSQRT() {
   return tflite::micro::RegisterOp(
-      elementwise::ElementWiseAbsRsqrtInit,
-      elementwise::PrepareAbsRsqrt,
-      elementwise::RsqrtEval);
+      ElementWiseAbsRsqrtInit,
+      PrepareAbsRsqrt, RsqrtEval);
 }
 
-TfLiteRegistration Register_SQUARE() {
+TfLiteRegistration_V1 Register_SQUARE() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::SquareEval);
+      nullptr, GenericPrepare, SquareEval);
 }
 
-TfLiteRegistration Register_LOGICAL_NOT() {
+TfLiteRegistration_V1 Register_LOGICAL_NOT() {
   return tflite::micro::RegisterOp(
-      nullptr, elementwise::GenericPrepare,
-      elementwise::LogicalNotEval);
+      nullptr, GenericPrepare, LogicalNotEval);
 }
 
-}  // namespace micro
-}  // namespace ops
 }  // namespace tflite
 
diff --git a/src/tensorflow/lite/micro/kernels/elu.cpp b/src/tensorflow/lite/micro/kernels/elu.cpp
index c4786d6f..482baed2 100644
--- a/src/tensorflow/lite/micro/kernels/elu.cpp
+++ b/src/tensorflow/lite/micro/kernels/elu.cpp
@@ -144,7 +144,7 @@ TfLiteStatus EluEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 }  // namespace
 
-TfLiteRegistration Register_ELU() {
+TfLiteRegistration_V1 Register_ELU() {
   return
tflite::micro::RegisterOp(EluInit, EluPrepare, EluEval); } diff --git a/src/tensorflow/lite/micro/kernels/ethosu.cpp b/src/tensorflow/lite/micro/kernels/ethosu.cpp index c305121e..1b792fb8 100644 --- a/src/tensorflow/lite/micro/kernels/ethosu.cpp +++ b/src/tensorflow/lite/micro/kernels/ethosu.cpp @@ -20,7 +20,7 @@ limitations under the License. namespace tflite { -TfLiteRegistration* Register_ETHOSU() { return nullptr; } +TfLiteRegistration_V1* Register_ETHOSU() { return nullptr; } const char* GetString_ETHOSU() { return ""; } diff --git a/src/tensorflow/lite/micro/kernels/ethosu.h b/src/tensorflow/lite/micro/kernels/ethosu.h index cfbb0d3f..93ef1d5b 100644 --- a/src/tensorflow/lite/micro/kernels/ethosu.h +++ b/src/tensorflow/lite/micro/kernels/ethosu.h @@ -19,7 +19,7 @@ limitations under the License. namespace tflite { -TfLiteRegistration* Register_ETHOSU(); +TfLiteRegistration_V1* Register_ETHOSU(); const char* GetString_ETHOSU(); diff --git a/src/tensorflow/lite/micro/kernels/exp.cpp b/src/tensorflow/lite/micro/kernels/exp.cpp index a835ee0a..44a39f45 100644 --- a/src/tensorflow/lite/micro/kernels/exp.cpp +++ b/src/tensorflow/lite/micro/kernels/exp.cpp @@ -72,7 +72,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_EXP() { +TfLiteRegistration_V1 Register_EXP() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/expand_dims.cpp b/src/tensorflow/lite/micro/kernels/expand_dims.cpp index ad45dd88..4c98ef9d 100644 --- a/src/tensorflow/lite/micro/kernels/expand_dims.cpp +++ b/src/tensorflow/lite/micro/kernels/expand_dims.cpp @@ -142,7 +142,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_EXPAND_DIMS() { +TfLiteRegistration_V1 Register_EXPAND_DIMS() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/fill.cpp b/src/tensorflow/lite/micro/kernels/fill.cpp index 6a3f4998..a759a0fe 100644 --- a/src/tensorflow/lite/micro/kernels/fill.cpp +++ b/src/tensorflow/lite/micro/kernels/fill.cpp @@ -133,7 +133,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FILL() { +TfLiteRegistration_V1 Register_FILL() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/floor.cpp b/src/tensorflow/lite/micro/kernels/floor.cpp index 207b5c4b..bf6404c3 100644 --- a/src/tensorflow/lite/micro/kernels/floor.cpp +++ b/src/tensorflow/lite/micro/kernels/floor.cpp @@ -41,7 +41,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FLOOR() { +TfLiteRegistration_V1 Register_FLOOR() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/floor_div.cpp b/src/tensorflow/lite/micro/kernels/floor_div.cpp index f143d28a..d70080e7 100644 --- a/src/tensorflow/lite/micro/kernels/floor_div.cpp +++ b/src/tensorflow/lite/micro/kernels/floor_div.cpp @@ -123,7 +123,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FLOOR_DIV() { +TfLiteRegistration_V1 Register_FLOOR_DIV() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/floor_mod.cpp b/src/tensorflow/lite/micro/kernels/floor_mod.cpp index 939a4dd7..aa53b157 100644 --- a/src/tensorflow/lite/micro/kernels/floor_mod.cpp +++ 
b/src/tensorflow/lite/micro/kernels/floor_mod.cpp @@ -121,7 +121,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_FLOOR_MOD() { +TfLiteRegistration_V1 Register_FLOOR_MOD() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/fully_connected.h b/src/tensorflow/lite/micro/kernels/fully_connected.h index 3ecf27e4..cbeaf4c5 100644 --- a/src/tensorflow/lite/micro/kernels/fully_connected.h +++ b/src/tensorflow/lite/micro/kernels/fully_connected.h @@ -68,15 +68,15 @@ TfLiteStatus CalculateOpDataFullyConnected( TfLiteType data_type, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output, OpDataFullyConnected* data); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_FULLY_CONNECTED(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_FULLY_CONNECTED(); #if defined(ARDUINO) || defined(HEXAGON) || defined(XTENSA) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8. -TfLiteRegistration Register_FULLY_CONNECTED_INT8(); +TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT8(); #else // Note that while this block gets used for both reference and optimized kernels @@ -84,16 +84,16 @@ TfLiteRegistration Register_FULLY_CONNECTED_INT8(); // define fallback implementation that allow reference kernels to still be used // from applications that call a more specific kernel variant. -inline TfLiteRegistration Register_FULLY_CONNECTED_INT8() { +inline TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT8() { return Register_FULLY_CONNECTED(); } #endif #if defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int16. -TfLiteRegistration Register_FULLY_CONNECTED_INT16(); +TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT16(); #else // Note that while this block gets used for both reference and optimized kernels @@ -101,7 +101,7 @@ TfLiteRegistration Register_FULLY_CONNECTED_INT16(); // define fallback implementation that allow reference kernels to still be used // from applications that call a more specific kernel variant. 
-inline TfLiteRegistration Register_FULLY_CONNECTED_INT16() { +inline TfLiteRegistration_V1 Register_FULLY_CONNECTED_INT16() { return Register_FULLY_CONNECTED(); } diff --git a/src/tensorflow/lite/micro/kernels/gather.cpp b/src/tensorflow/lite/micro/kernels/gather.cpp index 4ec53473..9c858957 100644 --- a/src/tensorflow/lite/micro/kernels/gather.cpp +++ b/src/tensorflow/lite/micro/kernels/gather.cpp @@ -217,7 +217,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_GATHER() { +TfLiteRegistration_V1 Register_GATHER() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/gather_nd.cpp b/src/tensorflow/lite/micro/kernels/gather_nd.cpp index 3a02e815..27307d1a 100644 --- a/src/tensorflow/lite/micro/kernels/gather_nd.cpp +++ b/src/tensorflow/lite/micro/kernels/gather_nd.cpp @@ -205,7 +205,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_GATHER_ND() { +TfLiteRegistration_V1 Register_GATHER_ND() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/hard_swish.cpp b/src/tensorflow/lite/micro/kernels/hard_swish.cpp index a0b3f7c6..8e3a9cde 100644 --- a/src/tensorflow/lite/micro/kernels/hard_swish.cpp +++ b/src/tensorflow/lite/micro/kernels/hard_swish.cpp @@ -67,7 +67,7 @@ TfLiteStatus HardSwishEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_HARD_SWISH() { +TfLiteRegistration_V1 Register_HARD_SWISH() { return tflite::micro::RegisterOp(HardSwishInit, tflite::HardSwishPrepare, HardSwishEval); } diff --git a/src/tensorflow/lite/micro/kernels/if.cpp b/src/tensorflow/lite/micro/kernels/if.cpp index 39eca8b4..a23bfc53 100644 --- a/src/tensorflow/lite/micro/kernels/if.cpp +++ b/src/tensorflow/lite/micro/kernels/if.cpp @@ -114,7 +114,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
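Nearly every hunk in this sync rewrites the same registration idiom, so one concrete instance may help: tflite::micro::RegisterOp takes optional init/prepare function pointers and a mandatory invoke, and fills in the remaining TfLiteRegistration_V1 fields with defaults (see the kernel_util.cpp hunk further down). A sketch with a hypothetical pass-through op — all names here are invented for illustration:

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace {
// Hypothetical kernel hooks: no init needed, Prepare and Eval just succeed.
TfLiteStatus PassThroughPrepare(TfLiteContext* context, TfLiteNode* node) {
  return kTfLiteOk;
}
TfLiteStatus PassThroughEval(TfLiteContext* context, TfLiteNode* node) {
  return kTfLiteOk;
}
}  // namespace

// init may be nullptr, exactly as in the Register_EXP/Register_FILL hunks above.
TfLiteRegistration_V1 Register_PASS_THROUGH() {
  return tflite::micro::RegisterOp(nullptr, PassThroughPrepare, PassThroughEval);
}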
-TfLiteRegistration Register_IF() { +TfLiteRegistration_V1 Register_IF() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/kernel_runner.cpp b/src/tensorflow/lite/micro/kernels/kernel_runner.cpp index 070f32a5..6ec2e350 100644 --- a/src/tensorflow/lite/micro/kernels/kernel_runner.cpp +++ b/src/tensorflow/lite/micro/kernels/kernel_runner.cpp @@ -34,7 +34,7 @@ void ClearBufferApi(TfLiteContext* context_) { context_->RequestScratchBufferInArena = nullptr; } -KernelRunner::KernelRunner(const TfLiteRegistration& registration, +KernelRunner::KernelRunner(const TfLiteRegistration_V1& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, TfLiteIntArray* outputs, void* builtin_data, TfLiteIntArray* intermediates) @@ -94,7 +94,7 @@ TfLiteStatus KernelRunner::Invoke() { context_.GetScratchBuffer = MicroContextGetScratchBuffer; if (registration_.invoke == nullptr) { - MicroPrintf("TfLiteRegistration missing invoke function pointer!"); + MicroPrintf("TfLiteRegistration_V1 missing invoke function pointer!"); return kTfLiteError; } @@ -110,7 +110,7 @@ TfLiteStatus KernelRunner::Free() { context_.GetScratchBuffer = MicroContextGetScratchBuffer; if (registration_.free == nullptr) { - MicroPrintf("TfLiteRegistration missing free function pointer!"); + MicroPrintf("TfLiteRegistration_V1 missing free function pointer!"); return kTfLiteError; } diff --git a/src/tensorflow/lite/micro/kernels/kernel_runner.h b/src/tensorflow/lite/micro/kernels/kernel_runner.h index c7d53c3a..64eac8a6 100644 --- a/src/tensorflow/lite/micro/kernels/kernel_runner.h +++ b/src/tensorflow/lite/micro/kernels/kernel_runner.h @@ -25,7 +25,7 @@ limitations under the License. namespace tflite { namespace micro { -// Helper class to perform a simulated kernel (i.e. TfLiteRegistration) +// Helper class to perform a simulated kernel (i.e. TfLiteRegistration_V1) // lifecycle (init, prepare, invoke). All internal allocations are handled by // this class. Simply pass in the registration, list of required tensors, inputs // array, outputs array, and any pre-builtin data. Calling Invoke() will @@ -33,22 +33,22 @@ namespace micro { // output provided during construction. class KernelRunner { public: - KernelRunner(const TfLiteRegistration& registration, TfLiteTensor* tensors, + KernelRunner(const TfLiteRegistration_V1& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, TfLiteIntArray* outputs, void* builtin_data, TfLiteIntArray* intermediates = nullptr); - // Calls init and prepare on the kernel (i.e. TfLiteRegistration) struct. Any - // exceptions will be DebugLog'd and returned as a status code. + // Calls init and prepare on the kernel (i.e. TfLiteRegistration_V1) struct. + // Any exceptions will be DebugLog'd and returned as a status code. TfLiteStatus InitAndPrepare(const char* init_data = nullptr, size_t length = 0); - // Calls init, prepare, and invoke on a given TfLiteRegistration pointer. - // After successful invoke, results will be available in the output tensor as - // passed into the constructor of this class. + // Calls invoke on a given TfLiteRegistration_V1 pointer. After successful + // invoke, results will be available in the output tensor as passed into the + // constructor of this class. TfLiteStatus Invoke(); - // Calls Free on a given TfLiteRegistration pointer(if it's implemented). + // Calls Free on a given TfLiteRegistration_V1 pointer(if it's implemented). // After successful Free, kTfLiteOk status will be returned. 
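The KernelRunner hunks here only touch the registration type, but the lifecycle the class simulates is easy to misread from the diff alone. Assuming tensors and the inputs/outputs arrays are prepared the way the existing kernel tests do (that setup is elided here), usage looks roughly like:

// Sketch in the style of the micro kernel tests: construct the runner around
// a registration, run init+prepare, then invoke; results land in the output
// tensor passed in at construction time.
tflite::micro::KernelRunner runner(registration, tensors, tensors_size,
                                   inputs_array, outputs_array,
                                   /*builtin_data=*/nullptr);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, runner.InitAndPrepare());
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, runner.Invoke());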
If Free is not // implemented for a given kernel kTfLiteError will be returned. TfLiteStatus Free(); @@ -68,7 +68,7 @@ class KernelRunner { TfLiteContext context_ = {}; TfLiteNode node_ = {}; - const TfLiteRegistration& registration_; + const TfLiteRegistration_V1& registration_; SingleArenaBufferAllocator* allocator_; MockMicroGraph mock_micro_graph_; diff --git a/src/tensorflow/lite/micro/kernels/kernel_util.cpp b/src/tensorflow/lite/micro/kernels/kernel_util.cpp index 76031b87..6d766672 100644 --- a/src/tensorflow/lite/micro/kernels/kernel_util.cpp +++ b/src/tensorflow/lite/micro/kernels/kernel_util.cpp @@ -38,7 +38,7 @@ int ValidateTensorIndexing(const TfLiteContext* context, int index, } // namespace -TfLiteRegistration RegisterOp( +TfLiteRegistration_V1 RegisterOp( void* (*init)(TfLiteContext* context, const char* buffer, size_t length), TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node), TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node), @@ -50,8 +50,7 @@ TfLiteRegistration RegisterOp( /*profiling_string=*/nullptr, /*builtin_code=*/0, /*custom_name=*/nullptr, - /*version=*/0, - /*registration_external=*/nullptr}; + /*version=*/0}; } // Returns a mutable tensor for a given input index. is_variable must be checked diff --git a/src/tensorflow/lite/micro/kernels/kernel_util.h b/src/tensorflow/lite/micro/kernels/kernel_util.h index f30ae44c..191ab2db 100644 --- a/src/tensorflow/lite/micro/kernels/kernel_util.h +++ b/src/tensorflow/lite/micro/kernels/kernel_util.h @@ -28,7 +28,7 @@ limitations under the License. namespace tflite { namespace micro { -TfLiteRegistration RegisterOp( +TfLiteRegistration_V1 RegisterOp( void* (*init)(TfLiteContext* context, const char* buffer, size_t length), TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node), TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node), diff --git a/src/tensorflow/lite/micro/kernels/l2_pool_2d.cpp b/src/tensorflow/lite/micro/kernels/l2_pool_2d.cpp index d4225e46..794f2b67 100644 --- a/src/tensorflow/lite/micro/kernels/l2_pool_2d.cpp +++ b/src/tensorflow/lite/micro/kernels/l2_pool_2d.cpp @@ -135,7 +135,7 @@ TfLiteStatus L2Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_L2_POOL_2D() { +TfLiteRegistration_V1 Register_L2_POOL_2D() { return tflite::micro::RegisterOp(nullptr, L2Prepare, L2Eval); } diff --git a/src/tensorflow/lite/micro/kernels/l2norm.cpp b/src/tensorflow/lite/micro/kernels/l2norm.cpp index 97f372aa..6dbf93c7 100644 --- a/src/tensorflow/lite/micro/kernels/l2norm.cpp +++ b/src/tensorflow/lite/micro/kernels/l2norm.cpp @@ -14,7 +14,6 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/internal/portable_tensor.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h" #include "tensorflow/lite/kernels/internal/reference/l2normalization.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -132,10 +131,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_L2NORM_REF() { +TfLiteRegistration_V1 Register_L2NORM_REF() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } -TfLiteRegistration Register_L2_NORMALIZATION() { return Register_L2NORM_REF(); } +TfLiteRegistration_V1 Register_L2_NORMALIZATION() { + return Register_L2NORM_REF(); +} } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/leaky_relu.cpp b/src/tensorflow/lite/micro/kernels/leaky_relu.cpp index 7b51ebcb..1873e3cc 100644 --- a/src/tensorflow/lite/micro/kernels/leaky_relu.cpp +++ b/src/tensorflow/lite/micro/kernels/leaky_relu.cpp @@ -87,7 +87,7 @@ TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } -TfLiteRegistration Register_LEAKY_RELU() { +TfLiteRegistration_V1 Register_LEAKY_RELU() { return tflite::micro::RegisterOp(LeakyReluInit, LeakyReluPrepare, LeakyReluEval); } diff --git a/src/tensorflow/lite/micro/kernels/log_softmax.cpp b/src/tensorflow/lite/micro/kernels/log_softmax.cpp index 0b1838c3..1ce04c65 100644 --- a/src/tensorflow/lite/micro/kernels/log_softmax.cpp +++ b/src/tensorflow/lite/micro/kernels/log_softmax.cpp @@ -141,7 +141,7 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_LOG_SOFTMAX() { +TfLiteRegistration_V1 Register_LOG_SOFTMAX() { return tflite::micro::RegisterOp(nullptr, LogSoftmaxPrepare, LogSoftmaxEval); } diff --git a/src/tensorflow/lite/micro/kernels/logical.cpp b/src/tensorflow/lite/micro/kernels/logical.cpp index c85e0c5b..415c85c5 100644 --- a/src/tensorflow/lite/micro/kernels/logical.cpp +++ b/src/tensorflow/lite/micro/kernels/logical.cpp @@ -33,11 +33,11 @@ TfLiteStatus LogicalAndEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_LOGICAL_OR() { +TfLiteRegistration_V1 Register_LOGICAL_OR() { return tflite::micro::RegisterOp(nullptr, nullptr, LogicalOrEval); } -TfLiteRegistration Register_LOGICAL_AND() { +TfLiteRegistration_V1 Register_LOGICAL_AND() { return tflite::micro::RegisterOp(nullptr, nullptr, LogicalAndEval); } diff --git a/src/tensorflow/lite/micro/kernels/logistic.cpp b/src/tensorflow/lite/micro/kernels/logistic.cpp index 108206ad..f968771c 100644 --- a/src/tensorflow/lite/micro/kernels/logistic.cpp +++ b/src/tensorflow/lite/micro/kernels/logistic.cpp @@ -105,7 +105,7 @@ TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_LOGISTIC() { +TfLiteRegistration_V1 Register_LOGISTIC() { return tflite::micro::RegisterOp(LogisticInit, LogisticPrepare, LogisticEval); } } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/lstm_eval.cpp b/src/tensorflow/lite/micro/kernels/lstm_eval.cpp index 4666e908..93d6bc7e 100644 --- a/src/tensorflow/lite/micro/kernels/lstm_eval.cpp +++ b/src/tensorflow/lite/micro/kernels/lstm_eval.cpp @@ -27,6 +27,81 @@ limitations under the License. 
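Before the LSTM hunks that follow, a compact map of the 24-tensor layout they validate may save a trip to lstm_shared.h. This is a reading of the checks below against the upstream header, not new behavior:

// Index layout assumed by ValidateTensorStatus (per lstm_shared.h):
//   0        input
//   1..8     input-to-gate and recurrent-to-gate weights (one shared type)
//   9..11    peephole weights        -> must be absent (unsupported variant)
//   12..15   the four gate biases    (one shared type)
//   16..17   projection weight/bias  -> must be absent
//   18       output (hidden) state   -> must be a variable tensor
//   19       cell state              -> must be a variable tensor
//   20..23   layer-norm coefficients -> must be absent
constexpr int kLstmInternalTensorCount = 24;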
#include "tensorflow/lite/kernels/internal/types.h" namespace tflite { + +LstmTensors::LstmTensors(TfLiteContext* context, TfLiteNode* node) { + micro_context_ = GetMicroContext(context); + // 24 internal tensors. see lstm_shared.h for tensor names + for (size_t i = 0; i < 24; i++) { + internal_tensors_[i] = micro_context_->AllocateTempInputTensor(node, i); + } + output_tensor_ = + micro_context_->AllocateTempOutputTensor(node, kLstmOutputTensor); +} + +LstmTensors::~LstmTensors() { + for (size_t i = 0; i < 24; i++) { + if (internal_tensors_[i] != nullptr) { + micro_context_->DeallocateTempTfLiteTensor(internal_tensors_[i]); + } + } + micro_context_->DeallocateTempTfLiteTensor(output_tensor_); +} + +// Verify the LSTM internal tensor properties (e.g., type checks) +// Input/output/states/fc weights tensors are required for kernel evaulation. +// The state tensors should be variables. Variants of the standard LSTM +// are not supported here, therefore their corresponding tensors should be +// invalid +TfLiteStatus LstmTensors::ValidateTensorStatus(TfLiteContext* context) const { + // Verify certain tensor properties + // input tensor + TF_LITE_ENSURE(context, internal_tensors_[kLstmInputTensor] != nullptr); + // hidden state + TF_LITE_ENSURE(context, internal_tensors_[kLstmOutputStateTensor] != nullptr); + TF_LITE_ENSURE(context, + internal_tensors_[kLstmOutputStateTensor]->is_variable); + // hidden state becomes input so they must have the same type + TF_LITE_ENSURE_EQ(context, internal_tensors_[kLstmOutputStateTensor]->type, + internal_tensors_[kLstmInputTensor]->type); + // cell state + TF_LITE_ENSURE(context, internal_tensors_[kLstmCellStateTensor] != nullptr); + TF_LITE_ENSURE(context, internal_tensors_[kLstmCellStateTensor]->is_variable); + // output + TF_LITE_ENSURE(context, output_tensor_ != nullptr); + // output type is the same as the input type (activations) + TF_LITE_ENSURE_EQ(context, output_tensor_->type, + internal_tensors_[kLstmInputTensor]->type); + + // weight tensors (1-9, see lstm_shared for index definition) + const auto weight_type = + internal_tensors_[kLstmInputToForgetWeightsTensor]->type; + for (size_t i = 1; i < 9; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] != nullptr); + TF_LITE_ENSURE_EQ(context, internal_tensors_[i]->type, weight_type); + } + + // bias tensors (12-15, see lstm_shared for index definition) + const auto bias_type = internal_tensors_[kLstmForgetGateBiasTensor]->type; + for (size_t i = 12; i < 16; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] != nullptr); + TF_LITE_ENSURE_EQ(context, internal_tensors_[i]->type, bias_type); + } + // Tensors from LSTM variants are invalid + // No peephole + for (size_t i = 9; i < 12; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); + } + // No projection + for (size_t i = 16; i < 18; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); + } + // No internal layer norm + for (size_t i = 20; i < 24; i++) { + TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); + } + return kTfLiteOk; +} + namespace lstm_internal { const int32_t kInt16Max = std::numeric_limits::max(); @@ -70,17 +145,15 @@ void Tanh(int32_t cell_state_scale_power, const RuntimeShape& input_data_shape, int16_t* input_data, const RuntimeShape& output_data_shape, int16_t* output_data) { int32_t tanh_input_left_shift = (15 + cell_state_scale_power) - 3; + int32_t input_multiplier = 0; if (tanh_input_left_shift < 0) /* handling negative shift value */ { - int32_t i; tanh_input_left_shift = 
-tanh_input_left_shift; - for (i = 0; i < input_data_shape.FlatSize(); i++) { - input_data[i] = input_data[i] >> tanh_input_left_shift; - } - tanh_input_left_shift = 0; + input_multiplier = 3; } - reference_integer_ops::Tanh(0, tanh_input_left_shift, input_data_shape, - input_data, output_data_shape, output_data); + reference_integer_ops::Tanh(input_multiplier, tanh_input_left_shift, + input_data_shape, input_data, output_data_shape, + output_data); } void Tanh(int32_t cell_state_scale_power, const RuntimeShape& input_data_shape, diff --git a/src/tensorflow/lite/micro/kernels/lstm_eval.h b/src/tensorflow/lite/micro/kernels/lstm_eval.h index ebede610..62bc6354 100644 --- a/src/tensorflow/lite/micro/kernels/lstm_eval.h +++ b/src/tensorflow/lite/micro/kernels/lstm_eval.h @@ -29,6 +29,130 @@ limitations under the License. #include "tensorflow/lite/micro/micro_log.h" namespace tflite { + +// Interface to access all the TempTfLiteTensors of the LSTM kernel during the +// preparation phase. Can only be constructed through the constructor to avoid +// memory leakage. All TempTfLiteTensors will be deallocated through the +// destructor. +class LstmTensors { + public: + LstmTensors(const LstmTensors& other) = delete; + LstmTensors& operator=(const LstmTensors& other) = delete; + + LstmTensors(TfLiteContext* context, TfLiteNode* node); + ~LstmTensors(); + + // Verify the LSTM internal tensor properties (e.g., type checks) + // Input/output/states/fc weights tensors are required for kernel evaluation. + // The state tensors should be variables. Variants of the standard LSTM + // are not supported here, therefore their corresponding tensors should be + // invalid + TfLiteStatus ValidateTensorStatus(TfLiteContext* context) const; + + // Internal tensors. see lstm_shared.h for tensor names + const TfLiteTensor* GetInternalTensor(const int tensor_index) const { + return internal_tensors_[tensor_index]; + } + + const TfLiteTensor* HiddenStateTensor() const { + return internal_tensors_[kLstmOutputStateTensor]; + } + const TfLiteTensor* CellStateTensor() const { + return internal_tensors_[kLstmCellStateTensor]; + } + const TfLiteTensor* OutputTensor() const { return output_tensor_; } + + private: + // see lstm_shared.h for tensor names + MicroContext* micro_context_; + TfLiteTensor* internal_tensors_[24]; + TfLiteTensor* output_tensor_; +}; + +// Deduce the size information (Batch (B), Time Steps (T), Input dimension (I), +// State dimension (S)) that defines the LSTM using the input and hidden state +// tensor +LstmSizeInfo CreateLstmSizeInfo( + const bool time_major, const TfLiteIntArray* input_tensor_shape, + const TfLiteIntArray* hidden_state_tensor_shape); + +TfLiteStatus ValidateWeightTensorSize(TfLiteContext* context, + const TfLiteTensor* tensor, int dim1_size, + int dim2_size); + +TfLiteStatus ValidateBiasTensorSize(TfLiteContext* context, + const TfLiteTensor* tensor, int size); + +// Go through every tensor and make sure its shape matches the kernel +// configuration +TfLiteStatus ValidateTensorSize(TfLiteContext* context, + const LstmTensors& tensors, + const LstmSizeInfo& size_info); + +// Wrapper function to create gate parameters for the four internal LSTM gates +TfLiteStatus CreateGateParams( + TfLiteContext* context, + /*Input tensors*/ + const TfLiteTensor* input, const TfLiteTensor* input_weight, + const TfLiteTensor* input_bias, + /*Hidden state tensors*/ + const TfLiteTensor* hidden_state, const TfLiteTensor* hidden_state_weight, + const TfLiteTensor* hidden_state_bias, + /*Scale of the
fc output (input to non-linear activation)*/ + const float nonlinear_activation_input_scale, const TfLiteType cell_type, + const tflite::GateParameters& gate_params); + +// Create parameters for element-wise multiplication that happens in a) cell +// state update; b) hidden state update +// Note that all the output of gates are symmetrically quantized so only scales +// are required for input. However, during the hidden state update phase, the +// output is the updated hidden state, which is asymmetrically quantized. Thus +// output may require zero point +tflite::ArithmeticParams CreateInterGateMulParams(const float input1_scale, + const float input2_scale, + const float output_scale, + const TfLiteType output_type, + const int output_zp = 0); + +// Create the additional information about the cell state, which includes: +// cell_state_scale_power: used in integer nonlinear function (e.g., tanh) +// quantized_cell_clip: quantized cell clip range +CellStateInfo CreateLstmCellStateInfo(const float cell_state_scale, + const float cell_clip); + +CellStateInfo CreateLstmCellStateInfoFloat(const float cell_clip); +tflite::FullyConnectedParams CreateFCParamsFloat(); + +tflite::GateParameters CreateGateParamsFloat(); + +tflite::ArithmeticParams CreateInterGateMulParamsFloat(); + +TfLiteStatus PrepareGateParametersFloat(TfLiteContext* context, + const LstmTensors& lstm_tensors, + OpDataLSTM* op_data_lstm); + +TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, + const LstmTensors& lstm_tensors, + OpDataLSTM* op_data_lstm); + +LSTMKernelContents CreateLSTMKernelContent(TfLiteContext* context, + TfLiteNode* node); + +template <typename CellType> +LSTMBuffers<CellType> CreateLSTMBuffers(TfLiteContext* context, + const int* buffer_indices) { + LSTMBuffers<CellType> buffers; + buffers.buffer0 = reinterpret_cast<CellType*>( + context->GetScratchBuffer(context, buffer_indices[0])); + buffers.buffer1 = reinterpret_cast<CellType*>( + context->GetScratchBuffer(context, buffer_indices[1])); + buffers.buffer2 = reinterpret_cast<CellType*>( + context->GetScratchBuffer(context, buffer_indices[2])); + buffers.buffer3 = reinterpret_cast<CellType*>( + context->GetScratchBuffer(context, buffer_indices[3])); + return buffers; +} + // Since LSTM includes multiple intermediate stages, introducing the internal // namespace to expose them for testing namespace lstm_internal { @@ -269,7 +393,7 @@ template <typename ActivationType, typename WeightType, typename CellType, void LstmStep(const LstmStepManager& step_info, const OpDataLSTM& op_data, LSTMKernelContents& kernel_content, - LSTMBuffers<CellType>& buffers) { + const LSTMBuffers<CellType>& buffers) { /*Step1: Calculate gate outputs to prepare cell state update*/ CellType* gate_internal_buffer = buffers.buffer3; CellType* forget_gate_output = buffers.buffer0; @@ -385,7 +509,7 @@ template <typename ActivationType, typename WeightType, typename CellType, TfLiteStatus EvalLstm(const OpDataLSTM& op_data, LSTMKernelContents& kernel_content, - LSTMBuffers<CellType>& buffers) { + const LSTMBuffers<CellType>& buffers) { lstm_internal::LstmStepManager step_info(&op_data.size_info); const auto& size_info = op_data.size_info; // time is the first dimension, enable batch computation diff --git a/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.cpp b/src/tensorflow/lite/micro/kernels/lstm_eval_common.cpp similarity index 53% rename from src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.cpp rename to src/tensorflow/lite/micro/kernels/lstm_eval_common.cpp index e671abec..9631b4c1 100644 --- a/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.cpp +++ b/src/tensorflow/lite/micro/kernels/lstm_eval_common.cpp @@ -13,130 +13,13 @@ See the License for the specific language governing
permissions and limitations under the License. ==============================================================================*/ -// Integer version of unidirectional sequence lstm. Only the standard LSTM -// (defined in the keras LSTM layer, e.g., no peephole etc.) is supported here. -// Currently used by the 16 bits activation case only - -#include -#include - #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/fully_connected.h" -#include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/lstm_eval.h" -#include "tensorflow/lite/micro/kernels/lstm_shared.h" namespace tflite { -namespace { -/*Helper Functions*/ - -// Interface to access all the TempTfLiteTensors of the LSTM kernel during the -// preparation phase. Can only be constructed through the constructor to avoid -// memory leakage. All TempTfLiteTensors will be deallocated through the -// destructor. -class LstmTensors { - public: - LstmTensors(const LstmTensors& other) = delete; - LstmTensors& operator=(const LstmTensors& other) = delete; - - LstmTensors(TfLiteContext* context, TfLiteNode* node) { - micro_context_ = GetMicroContext(context); - // 24 internal tensors. see lstm_shared.h for tensor names - for (size_t i = 0; i < 24; i++) { - internal_tensors_[i] = micro_context_->AllocateTempInputTensor(node, i); - } - output_tensor_ = - micro_context_->AllocateTempOutputTensor(node, kLstmOutputTensor); - } - - ~LstmTensors() { - for (size_t i = 0; i < 24; i++) { - if (internal_tensors_[i] != nullptr) { - micro_context_->DeallocateTempTfLiteTensor(internal_tensors_[i]); - } - } - micro_context_->DeallocateTempTfLiteTensor(output_tensor_); - } - - // Verify the LSTM internal tensor properties (e.g., type checks) - // Input/output/states/fc weights tensors are required for kernel evaulation. - // The state tensors should be variables. 
Variants of the standard LSTM - // are not supported here, therefore their corresponding tensors should be - // invalid - TfLiteStatus ValidateTensorStatus(TfLiteContext* context) const { - // Verify certain tensor properties - // input tensor - TF_LITE_ENSURE(context, internal_tensors_[kLstmInputTensor] != nullptr); - // hidden state - TF_LITE_ENSURE(context, - internal_tensors_[kLstmOutputStateTensor] != nullptr); - TF_LITE_ENSURE(context, - internal_tensors_[kLstmOutputStateTensor]->is_variable); - // hidden state becomes input so they must have the same type - TF_LITE_ENSURE_EQ(context, internal_tensors_[kLstmOutputStateTensor]->type, - internal_tensors_[kLstmInputTensor]->type); - // cell state - TF_LITE_ENSURE(context, internal_tensors_[kLstmCellStateTensor] != nullptr); - TF_LITE_ENSURE(context, - internal_tensors_[kLstmCellStateTensor]->is_variable); - // output - TF_LITE_ENSURE(context, output_tensor_ != nullptr); - // output type is the same as the input type (activations) - TF_LITE_ENSURE_EQ(context, output_tensor_->type, - internal_tensors_[kLstmInputTensor]->type); - - // weight tensors (1-9, see lstm_shared for index definition) - const auto weight_type = - internal_tensors_[kLstmInputToForgetWeightsTensor]->type; - for (size_t i = 1; i < 9; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] != nullptr); - TF_LITE_ENSURE_EQ(context, internal_tensors_[i]->type, weight_type); - } - - // bias tensors (12-15, see lstm_shared for index definition) - const auto bias_type = internal_tensors_[kLstmForgetGateBiasTensor]->type; - for (size_t i = 12; i < 16; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] != nullptr); - TF_LITE_ENSURE_EQ(context, internal_tensors_[i]->type, bias_type); - } - // Tensors from LSTM variants are invalid - // No peephole - for (size_t i = 9; i < 12; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); - } - // No projection - for (size_t i = 16; i < 18; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); - } - // No internal layer norm - for (size_t i = 20; i < 24; i++) { - TF_LITE_ENSURE(context, internal_tensors_[i] == nullptr); - } - return kTfLiteOk; - } - - // Internal tensors. 
see lstm_shared.h for tensor names - const TfLiteTensor* GetInternalTensor(const int tensor_index) const { - return internal_tensors_[tensor_index]; - } - - const TfLiteTensor* HiddenStateTensor() const { - return internal_tensors_[kLstmOutputStateTensor]; - } - const TfLiteTensor* CellStateTensor() const { - return internal_tensors_[kLstmCellStateTensor]; - } - const TfLiteTensor* OutputTensor() const { return output_tensor_; } - - private: - // see lstm_shared.h for tensor names - MicroContext* micro_context_; - TfLiteTensor* internal_tensors_[24]; - TfLiteTensor* output_tensor_; -}; - // Deduce the size information (Batch (B), Time Steps (T), Input dimension (I), // State dimension (S)) that defines the LSTM using the input and hidden state // tensor @@ -269,7 +152,7 @@ tflite::ArithmeticParams CreateInterGateMulParams(const float input1_scale, const float input2_scale, const float output_scale, const TfLiteType output_type, - const int output_zp = 0) { + const int output_zp) { tflite::ArithmeticParams op_params = {}; if (output_type == kTfLiteInt16) { op_params.quantized_activation_min = std::numeric_limits::min(); @@ -310,6 +193,7 @@ CellStateInfo CreateLstmCellStateInfo(const float cell_state_scale, static_cast(cell_state_scale), -32768.0), 32767.0)); + return cell_state_info; } @@ -344,26 +228,26 @@ tflite::ArithmeticParams CreateInterGateMulParamsFloat() { TfLiteStatus PrepareGateParametersFloat(TfLiteContext* context, const LstmTensors& lstm_tensors, - OpDataLSTM* op_data) { + OpDataLSTM* op_data_lstm) { // Gate Parameters - op_data->forget_gate_parameters = CreateGateParamsFloat(); - op_data->input_gate_parameters = CreateGateParamsFloat(); - op_data->cell_gate_parameters = CreateGateParamsFloat(); - op_data->output_gate_parameters = CreateGateParamsFloat(); + op_data_lstm->forget_gate_parameters = CreateGateParamsFloat(); + op_data_lstm->input_gate_parameters = CreateGateParamsFloat(); + op_data_lstm->cell_gate_parameters = CreateGateParamsFloat(); + op_data_lstm->output_gate_parameters = CreateGateParamsFloat(); // Inter gate multiplication parameters - op_data->inter_gate_parameters.forget_cell_mul_params = + op_data_lstm->inter_gate_parameters.forget_cell_mul_params = CreateInterGateMulParamsFloat(); - op_data->inter_gate_parameters.input_mul_params = + op_data_lstm->inter_gate_parameters.input_mul_params = CreateInterGateMulParamsFloat(); - op_data->inter_gate_parameters.output_mul_params = + op_data_lstm->inter_gate_parameters.output_mul_params = CreateInterGateMulParamsFloat(); return kTfLiteOk; } TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, const LstmTensors& lstm_tensors, - OpDataLSTM* op_data) { - float nonlinear_input_scale = 0.00024414062; // 2^-12 Q3.12 -> Q0.15 + OpDataLSTM* op_data_lstm) { + float nonlinear_input_scale = 0.000244140625; // 2^-12 Q3.12 -> Q0.15 TF_LITE_ENSURE_OK( context, CreateGateParams( @@ -373,7 +257,7 @@ TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, lstm_tensors.GetInternalTensor(kLstmOutputStateTensor), lstm_tensors.GetInternalTensor(kLstmRecurrentToForgetWeightsTensor), /*hidden_state_bias=*/nullptr, nonlinear_input_scale, kTfLiteInt16, - op_data->forget_gate_parameters)); + op_data_lstm->forget_gate_parameters)); TF_LITE_ENSURE_OK( context, CreateGateParams( @@ -383,7 +267,7 @@ TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, lstm_tensors.GetInternalTensor(kLstmOutputStateTensor), lstm_tensors.GetInternalTensor(kLstmRecurrentToInputWeightsTensor), /*hidden_state_bias=*/nullptr, 
nonlinear_input_scale, kTfLiteInt16, - op_data->input_gate_parameters)); + op_data_lstm->input_gate_parameters)); TF_LITE_ENSURE_OK( context, CreateGateParams( @@ -393,7 +277,7 @@ TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, lstm_tensors.GetInternalTensor(kLstmOutputStateTensor), lstm_tensors.GetInternalTensor(kLstmRecurrentToCellWeightsTensor), /*hidden_state_bias=*/nullptr, nonlinear_input_scale, kTfLiteInt16, - op_data->cell_gate_parameters)); + op_data_lstm->cell_gate_parameters)); TF_LITE_ENSURE_OK( context, CreateGateParams( @@ -403,25 +287,26 @@ TfLiteStatus PrepareGateParametersInteger(TfLiteContext* context, lstm_tensors.GetInternalTensor(kLstmOutputStateTensor), lstm_tensors.GetInternalTensor(kLstmRecurrentToOutputWeightsTensor), /*hidden_state_bias=*/nullptr, nonlinear_input_scale, kTfLiteInt16, - op_data->output_gate_parameters)); + op_data_lstm->output_gate_parameters)); // Inter gate multiplication parameters - float nonlinear_output_scale = 0.00003051757; // 2^-15 Q3.12 -> Q0.15 + float nonlinear_output_scale = 0.000030517578125; // 2^-15 Q3.12 -> Q0.15 float cell_state_scale = lstm_tensors.CellStateTensor()->params.scale; // forget gate output (nonlinear output) x cell state -> cell state - op_data->inter_gate_parameters.forget_cell_mul_params = + op_data_lstm->inter_gate_parameters.forget_cell_mul_params = CreateInterGateMulParams(nonlinear_output_scale, cell_state_scale, cell_state_scale, kTfLiteInt16); // input gate output x cell gate output -> cell state - op_data->inter_gate_parameters.input_mul_params = + op_data_lstm->inter_gate_parameters.input_mul_params = CreateInterGateMulParams(nonlinear_output_scale, nonlinear_output_scale, cell_state_scale, kTfLiteInt16); // tanh output x output gate output -> hidden state (potentially asymmetric) - op_data->inter_gate_parameters.output_mul_params = CreateInterGateMulParams( - nonlinear_output_scale, nonlinear_output_scale, - lstm_tensors.HiddenStateTensor()->params.scale, - lstm_tensors.HiddenStateTensor()->type, - lstm_tensors.HiddenStateTensor()->params.zero_point); + op_data_lstm->inter_gate_parameters.output_mul_params = + CreateInterGateMulParams( + nonlinear_output_scale, nonlinear_output_scale, + lstm_tensors.HiddenStateTensor()->params.scale, + lstm_tensors.HiddenStateTensor()->type, + lstm_tensors.HiddenStateTensor()->params.zero_point); return kTfLiteOk; } @@ -438,152 +323,4 @@ LSTMKernelContents CreateLSTMKernelContent(TfLiteContext* context, return kernel_content; } -template -LSTMBuffers CreateLSTMBuffers(TfLiteContext* context, - const int* buffer_indices) { - LSTMBuffers buffers; - buffers.buffer0 = reinterpret_cast( - context->GetScratchBuffer(context, buffer_indices[0])); - buffers.buffer1 = reinterpret_cast( - context->GetScratchBuffer(context, buffer_indices[1])); - buffers.buffer2 = reinterpret_cast( - context->GetScratchBuffer(context, buffer_indices[2])); - buffers.buffer3 = reinterpret_cast( - context->GetScratchBuffer(context, buffer_indices[3])); - return buffers; -} - -/*Kernel functions*/ - -void* UnidirectionalSequenceLstmInit(TfLiteContext* context, const char* buffer, - size_t length) { - TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); - return context->AllocatePersistentBuffer(context, sizeof(OpDataLSTM)); -} - -TfLiteStatus UnidirectionalSequenceLstmPrepare(TfLiteContext* context, - TfLiteNode* node) { - TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); - TF_LITE_ENSURE_EQ(context, node->inputs->size, 24); - - TFLITE_DCHECK(node->builtin_data != nullptr); - 
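One small but meaningful fix in the hunks above: the gate-scale literals are replaced by their exact values. 2^-12 and 2^-15 are exactly representable in binary floating point, so spelling them out in full (or computing them) removes the rounding slack of the old truncated decimals. A sketch of the equivalent compile-time form (names here are illustrative):

// Exact binary fractions for the Q3.12 -> Q0.15 rescaling constants.
constexpr float kNonlinearInputScale = 1.0f / (1 << 12);   // 0.000244140625
constexpr float kNonlinearOutputScale = 1.0f / (1 << 15);  // 0.000030517578125
static_assert(kNonlinearInputScale == 0.000244140625f,
              "2^-12 is exact in IEEE-754 float");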
TFLITE_DCHECK(node->user_data != nullptr); - - OpDataLSTM* op_data = reinterpret_cast(node->user_data); - const auto* builtin_data = - static_cast(node->builtin_data); - // All TempTfLiteTensors will be deallocated through the destructor. - LstmTensors lstm_tensors(context, node); - TF_LITE_ENSURE_OK(context, lstm_tensors.ValidateTensorStatus(context)); - - op_data->cell_gate_nonlinear_type = builtin_data->activation; - op_data->size_info = - CreateLstmSizeInfo(builtin_data->time_major, - lstm_tensors.GetInternalTensor(kLstmInputTensor)->dims, - lstm_tensors.HiddenStateTensor()->dims); - TF_LITE_ENSURE_OK( - context, ValidateTensorSize(context, lstm_tensors, op_data->size_info)); - - // Create cell state information and gate parameters (Fully Connected and Mul) - auto cell_state_type = - lstm_tensors.GetInternalTensor(kLstmCellStateTensor)->type; - if (cell_state_type == kTfLiteFloat32) { - op_data->cell_state_info = - CreateLstmCellStateInfoFloat(builtin_data->cell_clip); - TF_LITE_ENSURE_OK( - context, PrepareGateParametersFloat(context, lstm_tensors, op_data)); - } else if (cell_state_type == kTfLiteInt16) { - op_data->cell_state_info = CreateLstmCellStateInfo( - lstm_tensors.CellStateTensor()->params.scale, builtin_data->cell_clip); - TF_LITE_ENSURE_OK( - context, PrepareGateParametersInteger(context, lstm_tensors, op_data)); - } else { - MicroPrintf( - "Cell state type %s (%d) not supported. The quantized Unidirectional " - "Sequence LSTM Op only support int16 cell state", - TfLiteTypeGetName(cell_state_type), cell_state_type); - return kTfLiteError; - } - // request buffers (four buffers) - for (size_t i = 0; i < 4; i++) { - TF_LITE_ENSURE_OK(context, context->RequestScratchBufferInArena( - context, - op_data->size_info.batch_size * - op_data->size_info.state_dimension * - TfLiteTypeGetSize(cell_state_type), - &(op_data->buffer_indices[i]))); - } - return kTfLiteOk; -} - -TfLiteStatus UnidirectionalSequenceLstmEval(TfLiteContext* context, - TfLiteNode* node) { - TFLITE_DCHECK(node->user_data != nullptr); - const OpDataLSTM& op_data = *reinterpret_cast(node->user_data); - auto kernel_content = CreateLSTMKernelContent(context, node); - - const auto activation_type = - kernel_content.internal_tensors[kLstmInputTensor]->type; - const auto weight_type = - kernel_content.internal_tensors[kLstmInputToInputWeightsTensor]->type; - - switch (activation_type) { - case kTfLiteFloat32: { - LSTMBuffers buffers = - CreateLSTMBuffers(context, op_data.buffer_indices); - EvalLstm(op_data, kernel_content, buffers); - break; - } - case kTfLiteInt8: { - switch (weight_type) { - case kTfLiteInt8: { - // 8(activation)x8(weight)->16(cell) LSTM with 32 bits bias - LSTMBuffers buffers = - CreateLSTMBuffers(context, op_data.buffer_indices); - EvalLstm(op_data, kernel_content, - buffers); - break; - } - default: { - MicroPrintf("Filter type %s (%d) not supported.", - TfLiteTypeGetName(weight_type), activation_type); - return kTfLiteError; - } - } - break; - } - case kTfLiteInt16: { - switch (weight_type) { - case kTfLiteInt8: { - // 16(activation)x8(weight)->16(cell) LSTM with 64 bits bias - LSTMBuffers buffers = - CreateLSTMBuffers(context, op_data.buffer_indices); - EvalLstm(op_data, kernel_content, - buffers); - break; - } - default: { - MicroPrintf("Filter type %s (%d) not supported.", - TfLiteTypeGetName(weight_type), weight_type); - return kTfLiteError; - } - } - break; - } - default: { - MicroPrintf("Input type %s (%d) not supported.", - TfLiteTypeGetName(activation_type), activation_type); - return 
kTfLiteError; - } - } - return kTfLiteOk; -} - -} // namespace - -TfLiteRegistration Register_UNIDIRECTIONAL_SEQUENCE_LSTM() { - return tflite::micro::RegisterOp(UnidirectionalSequenceLstmInit, - UnidirectionalSequenceLstmPrepare, - UnidirectionalSequenceLstmEval); -} } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/maximum_minimum.cpp b/src/tensorflow/lite/micro/kernels/maximum_minimum.cpp index b7b9cba8..434e4efa 100644 --- a/src/tensorflow/lite/micro/kernels/maximum_minimum.cpp +++ b/src/tensorflow/lite/micro/kernels/maximum_minimum.cpp @@ -109,12 +109,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_MAXIMUM() { +TfLiteRegistration_V1 Register_MAXIMUM() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } -TfLiteRegistration Register_MINIMUM() { +TfLiteRegistration_V1 Register_MINIMUM() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/micro_ops.h b/src/tensorflow/lite/micro/kernels/micro_ops.h index 252efc62..14b874d0 100644 --- a/src/tensorflow/lite/micro/kernels/micro_ops.h +++ b/src/tensorflow/lite/micro/kernels/micro_ops.h @@ -31,108 +31,107 @@ namespace tflite { // (https://abseil.io/tips/130). Any new ops (or cleanup of existing ops should // have their Register function declarations in the tflite namespace. -TfLiteRegistration Register_ADD(); -TfLiteRegistration Register_ADD_N(); -TfLiteRegistration Register_ARG_MAX(); -TfLiteRegistration Register_ARG_MIN(); -TfLiteRegistration Register_ASSIGN_VARIABLE(); -TfLiteRegistration Register_AVERAGE_POOL_2D(); -TfLiteRegistration Register_BATCH_TO_SPACE_ND(); -TfLiteRegistration Register_BROADCAST_ARGS(); -TfLiteRegistration Register_BROADCAST_TO(); -TfLiteRegistration Register_CALL_ONCE(); -TfLiteRegistration Register_CAST(); -TfLiteRegistration Register_CEIL(); +TfLiteRegistration_V1 Register_ABS(); +TfLiteRegistration_V1 Register_ADD(); +TfLiteRegistration_V1 Register_ADD_N(); +TfLiteRegistration_V1 Register_ARG_MAX(); +TfLiteRegistration_V1 Register_ARG_MIN(); +TfLiteRegistration_V1 Register_ASSIGN_VARIABLE(); +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D(); +TfLiteRegistration_V1 Register_BATCH_TO_SPACE_ND(); +TfLiteRegistration_V1 Register_BROADCAST_ARGS(); +TfLiteRegistration_V1 Register_BROADCAST_TO(); +TfLiteRegistration_V1 Register_CALL_ONCE(); +TfLiteRegistration_V1 Register_CAST(); +TfLiteRegistration_V1 Register_CEIL(); // TODO(b/160234179): Change custom OPs to also return by value. 
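The deleted eval body above (its shared pieces now live in lstm_eval_common.cpp and the header templates) dispatched on activation and weight type. The flattened text dropped the template argument lists, so here is that mapping restored as comments for reference — the parameters are ActivationType, WeightType, CellType, BiasType in order:

// float activations              -> EvalLstm<float, float, float, float>
// int8 act., int8 weights        -> EvalLstm<int8_t, int8_t, int16_t, int32_t>
//   (16-bit cell state, 32-bit bias)
// int16 act., int8 weights       -> EvalLstm<int16_t, int8_t, int16_t, int64_t>
//   (16-bit cell state, 64-bit bias)
// Any other combination returns kTfLiteError with a MicroPrintf diagnostic.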
-TfLiteRegistration* Register_CIRCULAR_BUFFER(); -TfLiteRegistration Register_CONCATENATION(); -TfLiteRegistration Register_CONV_2D(); -TfLiteRegistration Register_CUMSUM(); -TfLiteRegistration Register_DEPTH_TO_SPACE(); -TfLiteRegistration Register_DEPTHWISE_CONV_2D(); -TfLiteRegistration Register_DEQUANTIZE(); -TfLiteRegistration Register_DIV(); -TfLiteRegistration Register_ELU(); -TfLiteRegistration Register_EQUAL(); -TfLiteRegistration* Register_ETHOSU(); -TfLiteRegistration Register_EXP(); -TfLiteRegistration Register_EXPAND_DIMS(); -TfLiteRegistration Register_FILL(); -TfLiteRegistration Register_FLOOR(); -TfLiteRegistration Register_FLOOR_DIV(); -TfLiteRegistration Register_FLOOR_MOD(); -TfLiteRegistration Register_FULLY_CONNECTED(); -TfLiteRegistration Register_GATHER(); -TfLiteRegistration Register_GATHER_ND(); -TfLiteRegistration Register_GREATER(); -TfLiteRegistration Register_GREATER_EQUAL(); -TfLiteRegistration Register_HARD_SWISH(); -TfLiteRegistration Register_IF(); -TfLiteRegistration Register_L2_NORMALIZATION(); -TfLiteRegistration Register_L2_POOL_2D(); -TfLiteRegistration Register_LEAKY_RELU(); -TfLiteRegistration Register_LESS(); -TfLiteRegistration Register_LESS_EQUAL(); -TfLiteRegistration Register_LOG_SOFTMAX(); -TfLiteRegistration Register_LOGICAL_AND(); -TfLiteRegistration Register_LOGICAL_OR(); -TfLiteRegistration Register_LOGISTIC(); -TfLiteRegistration Register_MAX_POOL_2D(); -TfLiteRegistration Register_MAXIMUM(); -TfLiteRegistration Register_MEAN(); -TfLiteRegistration Register_MINIMUM(); -TfLiteRegistration Register_MIRROR_PAD(); -TfLiteRegistration Register_MUL(); -TfLiteRegistration Register_NEG(); -TfLiteRegistration Register_NOT_EQUAL(); -TfLiteRegistration Register_PACK(); -TfLiteRegistration Register_PAD(); -TfLiteRegistration Register_PADV2(); -TfLiteRegistration Register_PRELU(); -TfLiteRegistration Register_QUANTIZE(); -TfLiteRegistration Register_READ_VARIABLE(); -TfLiteRegistration Register_REDUCE_MAX(); -TfLiteRegistration Register_RELU(); -TfLiteRegistration Register_RELU6(); -TfLiteRegistration Register_RESIZE_BILINEAR(); -TfLiteRegistration Register_RESIZE_NEAREST_NEIGHBOR(); -TfLiteRegistration Register_SELECT_V2(); -TfLiteRegistration Register_SHAPE(); -TfLiteRegistration Register_SLICE(); -TfLiteRegistration Register_SOFTMAX(); -TfLiteRegistration Register_SPACE_TO_BATCH_ND(); -TfLiteRegistration Register_SPACE_TO_DEPTH(); -TfLiteRegistration Register_SPLIT(); -TfLiteRegistration Register_SPLIT_V(); -TfLiteRegistration Register_SQUARED_DIFFERENCE(); -TfLiteRegistration Register_SQUEEZE(); -TfLiteRegistration Register_STRIDED_SLICE(); -TfLiteRegistration Register_SUB(); -TfLiteRegistration Register_SUM(); -TfLiteRegistration Register_SVDF(); -TfLiteRegistration Register_TANH(); -TfLiteRegistration Register_TRANSPOSE(); -TfLiteRegistration Register_TRANSPOSE_CONV(); +TfLiteRegistration_V1* Register_CIRCULAR_BUFFER(); +TfLiteRegistration_V1 Register_CONCATENATION(); +TfLiteRegistration_V1 Register_CONV_2D(); +TfLiteRegistration_V1 Register_COS(); +TfLiteRegistration_V1 Register_CUMSUM(); +TfLiteRegistration_V1 Register_DEPTH_TO_SPACE(); +TfLiteRegistration_V1 Register_DEPTHWISE_CONV_2D(); +TfLiteRegistration_V1 Register_DEQUANTIZE(); +TfLiteRegistration_V1 Register_DIV(); +TfLiteRegistration_V1 Register_ELU(); +TfLiteRegistration_V1 Register_EQUAL(); +TfLiteRegistration_V1* Register_ETHOSU(); +TfLiteRegistration_V1 Register_EXP(); +TfLiteRegistration_V1 Register_EXPAND_DIMS(); +TfLiteRegistration_V1 Register_FILL(); +TfLiteRegistration_V1 
Register_FLOOR(); +TfLiteRegistration_V1 Register_FLOOR_DIV(); +TfLiteRegistration_V1 Register_FLOOR_MOD(); +TfLiteRegistration_V1 Register_FULLY_CONNECTED(); +TfLiteRegistration_V1 Register_GATHER(); +TfLiteRegistration_V1 Register_GATHER_ND(); +TfLiteRegistration_V1 Register_GREATER(); +TfLiteRegistration_V1 Register_GREATER_EQUAL(); +TfLiteRegistration_V1 Register_HARD_SWISH(); +TfLiteRegistration_V1 Register_IF(); +TfLiteRegistration_V1 Register_L2_NORMALIZATION(); +TfLiteRegistration_V1 Register_L2_POOL_2D(); +TfLiteRegistration_V1 Register_LEAKY_RELU(); +TfLiteRegistration_V1 Register_LESS(); +TfLiteRegistration_V1 Register_LESS_EQUAL(); +TfLiteRegistration_V1 Register_LOG(); +TfLiteRegistration_V1 Register_LOG_SOFTMAX(); +TfLiteRegistration_V1 Register_LOGICAL_AND(); +TfLiteRegistration_V1 Register_LOGICAL_NOT(); +TfLiteRegistration_V1 Register_LOGICAL_OR(); +TfLiteRegistration_V1 Register_LOGISTIC(); +TfLiteRegistration_V1 Register_MAX_POOL_2D(); +TfLiteRegistration_V1 Register_MAXIMUM(); +TfLiteRegistration_V1 Register_MEAN(); +TfLiteRegistration_V1 Register_MINIMUM(); +TfLiteRegistration_V1 Register_MIRROR_PAD(); +TfLiteRegistration_V1 Register_MUL(); +TfLiteRegistration_V1 Register_NEG(); +TfLiteRegistration_V1 Register_NOT_EQUAL(); +TfLiteRegistration_V1 Register_PACK(); +TfLiteRegistration_V1 Register_PAD(); +TfLiteRegistration_V1 Register_PADV2(); +TfLiteRegistration_V1 Register_PRELU(); +TfLiteRegistration_V1 Register_QUANTIZE(); +TfLiteRegistration_V1 Register_READ_VARIABLE(); +TfLiteRegistration_V1 Register_REDUCE_MAX(); +TfLiteRegistration_V1 Register_RELU(); +TfLiteRegistration_V1 Register_RELU6(); +TfLiteRegistration_V1 Register_RESIZE_BILINEAR(); +TfLiteRegistration_V1 Register_RESIZE_NEAREST_NEIGHBOR(); +TfLiteRegistration_V1 Register_RSQRT(); +TfLiteRegistration_V1 Register_SELECT_V2(); +TfLiteRegistration_V1 Register_SHAPE(); +TfLiteRegistration_V1 Register_SIN(); +TfLiteRegistration_V1 Register_SLICE(); +TfLiteRegistration_V1 Register_SOFTMAX(); +TfLiteRegistration_V1 Register_SPACE_TO_BATCH_ND(); +TfLiteRegistration_V1 Register_SPACE_TO_DEPTH(); +TfLiteRegistration_V1 Register_SPLIT(); +TfLiteRegistration_V1 Register_SPLIT_V(); +TfLiteRegistration_V1 Register_SQRT(); +TfLiteRegistration_V1 Register_SQUARE(); +TfLiteRegistration_V1 Register_SQUARED_DIFFERENCE(); +TfLiteRegistration_V1 Register_SQUEEZE(); +TfLiteRegistration_V1 Register_STRIDED_SLICE(); +TfLiteRegistration_V1 Register_SUB(); +TfLiteRegistration_V1 Register_SUM(); +TfLiteRegistration_V1 Register_SVDF(); +TfLiteRegistration_V1 Register_TANH(); +TfLiteRegistration_V1 Register_TRANSPOSE(); +TfLiteRegistration_V1 Register_TRANSPOSE_CONV(); // TODO(b/230666079): resolve conflict with xtensa implementation -TfLiteRegistration Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); -TfLiteRegistration Register_UNPACK(); -TfLiteRegistration Register_VAR_HANDLE(); -TfLiteRegistration Register_WHILE(); -TfLiteRegistration Register_ZEROS_LIKE(); +TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); +TfLiteRegistration_V1 Register_UNPACK(); +TfLiteRegistration_V1 Register_VAR_HANDLE(); +TfLiteRegistration_V1 Register_WHILE(); +TfLiteRegistration_V1 Register_ZEROS_LIKE(); namespace ops { namespace micro { - -TfLiteRegistration Register_ABS(); -TfLiteRegistration Register_COS(); -TfLiteRegistration Register_LOG(); -TfLiteRegistration Register_LOGICAL_NOT(); -TfLiteRegistration Register_RESHAPE(); -TfLiteRegistration Register_ROUND(); -TfLiteRegistration Register_RSQRT(); -TfLiteRegistration Register_SIN(); 
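Beyond the type rename, this micro_ops.h hunk finishes migrating kernels out of the nested namespace (per the Abseil tip cited above): ABS, COS, LOG, LOGICAL_NOT, RSQRT, SIN, SQRT and SQUARE move from tflite::ops::micro into tflite, leaving only RESHAPE and ROUND behind. Call sites change accordingly; an illustrative before/after:

// Before this sync:
//   TfLiteRegistration r = tflite::ops::micro::Register_ABS();
// After this sync:
TfLiteRegistration_V1 r = tflite::Register_ABS();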
-TfLiteRegistration Register_SQRT(); -TfLiteRegistration Register_SQUARE(); +TfLiteRegistration_V1 Register_RESHAPE(); +TfLiteRegistration_V1 Register_ROUND(); } // namespace micro } // namespace ops } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/mirror_pad.cpp b/src/tensorflow/lite/micro/kernels/mirror_pad.cpp index 90d3bd9e..c6ee1da7 100644 --- a/src/tensorflow/lite/micro/kernels/mirror_pad.cpp +++ b/src/tensorflow/lite/micro/kernels/mirror_pad.cpp @@ -208,7 +208,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_MIRROR_PAD() { +TfLiteRegistration_V1 Register_MIRROR_PAD() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/mul.h b/src/tensorflow/lite/micro/kernels/mul.h index d0148643..d64248db 100644 --- a/src/tensorflow/lite/micro/kernels/mul.h +++ b/src/tensorflow/lite/micro/kernels/mul.h @@ -61,13 +61,13 @@ void EvalMulFloatReference(TfLiteContext* context, TfLiteNode* node, TfLiteEvalTensor* output); // Generic must define registration function. -TfLiteRegistration Register_MUL(); +TfLiteRegistration_V1 Register_MUL(); #if defined(ARDUINO) -TfLiteRegistration Register_MUL_INT8(); +TfLiteRegistration_V1 Register_MUL_INT8(); #else // Fallback registration -inline TfLiteRegistration Register_MUL_INT8() { return Register_MUL(); } +inline TfLiteRegistration_V1 Register_MUL_INT8() { return Register_MUL(); } #endif } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/neg.cpp b/src/tensorflow/lite/micro/kernels/neg.cpp index db26f6c6..cde9979f 100644 --- a/src/tensorflow/lite/micro/kernels/neg.cpp +++ b/src/tensorflow/lite/micro/kernels/neg.cpp @@ -50,7 +50,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_NEG() { +TfLiteRegistration_V1 Register_NEG() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/pack.cpp b/src/tensorflow/lite/micro/kernels/pack.cpp index 5a4eb4f5..4c2a9724 100644 --- a/src/tensorflow/lite/micro/kernels/pack.cpp +++ b/src/tensorflow/lite/micro/kernels/pack.cpp @@ -105,7 +105,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_PACK() { +TfLiteRegistration_V1 Register_PACK() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/pad.cpp b/src/tensorflow/lite/micro/kernels/pad.cpp index 579df1a6..f169e45e 100644 --- a/src/tensorflow/lite/micro/kernels/pad.cpp +++ b/src/tensorflow/lite/micro/kernels/pad.cpp @@ -18,7 +18,6 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/internal/portable_tensor.h" #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" @@ -218,12 +217,12 @@ TfLiteStatus PadPrepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteRegistration Register_PAD() { +TfLiteRegistration_V1 Register_PAD() { return tflite::micro::RegisterOp(Init, PadPrepare, Eval); } // Also register Pad as PadV2. 
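As the comment above notes, PADV2 is an alias: both registrations bind the same Init/PadPrepare/Eval triple, and (assuming the kernel matches upstream pad.cpp) Eval treats PadV2's third constant_values input as optional. Resolving either op therefore costs no extra kernel code:

TfLiteRegistration_V1 pad_reg = tflite::Register_PAD();
TfLiteRegistration_V1 padv2_reg = tflite::Register_PADV2();  // same kernel underneath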
-TfLiteRegistration Register_PADV2() { +TfLiteRegistration_V1 Register_PADV2() { return tflite::micro::RegisterOp(Init, PadPrepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/pooling.h b/src/tensorflow/lite/micro/kernels/pooling.h index fd0d2c93..e7e0b4dc 100644 --- a/src/tensorflow/lite/micro/kernels/pooling.h +++ b/src/tensorflow/lite/micro/kernels/pooling.h @@ -113,27 +113,27 @@ void MaxPoolingEvalQuantized(TfLiteContext* context, TfLiteNode* node, } #if defined(ARDUINO) || defined(XTENSA) -TfLiteRegistration Register_AVERAGE_POOL_2D_INT8(); +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT8(); -TfLiteRegistration Register_MAX_POOL_2D_INT8(); +TfLiteRegistration_V1 Register_MAX_POOL_2D_INT8(); -TfLiteRegistration Register_AVERAGE_POOL_2D_INT16(); +TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT16(); -TfLiteRegistration Register_MAX_POOL_2D_INT16(); +TfLiteRegistration_V1 Register_MAX_POOL_2D_INT16(); #else -inline TfLiteRegistration Register_AVERAGE_POOL_2D_INT8() { +inline TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT8() { return tflite::Register_AVERAGE_POOL_2D(); } -inline TfLiteRegistration Register_MAX_POOL_2D_INT8() { +inline TfLiteRegistration_V1 Register_MAX_POOL_2D_INT8() { return tflite::Register_MAX_POOL_2D(); } -inline TfLiteRegistration Register_AVERAGE_POOL_2D_INT16() { +inline TfLiteRegistration_V1 Register_AVERAGE_POOL_2D_INT16() { return tflite::Register_AVERAGE_POOL_2D(); } -inline TfLiteRegistration Register_MAX_POOL_2D_INT16() { +inline TfLiteRegistration_V1 Register_MAX_POOL_2D_INT16() { return tflite::Register_MAX_POOL_2D(); } #endif diff --git a/src/tensorflow/lite/micro/kernels/prelu.cpp b/src/tensorflow/lite/micro/kernels/prelu.cpp index f4294723..62e8eb9d 100644 --- a/src/tensorflow/lite/micro/kernels/prelu.cpp +++ b/src/tensorflow/lite/micro/kernels/prelu.cpp @@ -68,7 +68,7 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { } } -TfLiteRegistration Register_PRELU() { +TfLiteRegistration_V1 Register_PRELU() { return tflite::micro::RegisterOp(PreluInit, PreluPrepare, PreluEval); } diff --git a/src/tensorflow/lite/micro/kernels/quantize.cpp b/src/tensorflow/lite/micro/kernels/quantize.cpp index b5eb9c3c..0e3336d9 100644 --- a/src/tensorflow/lite/micro/kernels/quantize.cpp +++ b/src/tensorflow/lite/micro/kernels/quantize.cpp @@ -33,7 +33,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } // namespace -TfLiteRegistration Register_QUANTIZE() { +TfLiteRegistration_V1 Register_QUANTIZE() { return tflite::micro::RegisterOp(Init, PrepareQuantizeReference, EvalQuantizeReference); } diff --git a/src/tensorflow/lite/micro/kernels/read_variable.cpp b/src/tensorflow/lite/micro/kernels/read_variable.cpp index 600a1bdd..d173bc5f 100644 --- a/src/tensorflow/lite/micro/kernels/read_variable.cpp +++ b/src/tensorflow/lite/micro/kernels/read_variable.cpp @@ -80,7 +80,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
-TfLiteRegistration Register_READ_VARIABLE() { +TfLiteRegistration_V1 Register_READ_VARIABLE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/reduce.cpp b/src/tensorflow/lite/micro/kernels/reduce.cpp index b4734f93..810d96f6 100644 --- a/src/tensorflow/lite/micro/kernels/reduce.cpp +++ b/src/tensorflow/lite/micro/kernels/reduce.cpp @@ -57,15 +57,15 @@ TfLiteStatus EvalSum(TfLiteContext* context, TfLiteNode* node) { static_cast(node->user_data)); } -TfLiteRegistration Register_MEAN() { +TfLiteRegistration_V1 Register_MEAN() { return tflite::micro::RegisterOp(InitReduce, PrepareMeanOrSum, EvalMean); } -TfLiteRegistration Register_REDUCE_MAX() { +TfLiteRegistration_V1 Register_REDUCE_MAX() { return tflite::micro::RegisterOp(InitReduce, PrepareMax, EvalMax); } -TfLiteRegistration Register_SUM() { +TfLiteRegistration_V1 Register_SUM() { return tflite::micro::RegisterOp(InitReduce, PrepareMeanOrSum, EvalSum); } diff --git a/src/tensorflow/lite/micro/kernels/reduce.h b/src/tensorflow/lite/micro/kernels/reduce.h index 5956974e..3b70665d 100644 --- a/src/tensorflow/lite/micro/kernels/reduce.h +++ b/src/tensorflow/lite/micro/kernels/reduce.h @@ -56,9 +56,9 @@ TfLiteStatus EvalSumHelper(TfLiteContext* context, TfLiteNode* node, void ReduceResolveAxis(const int* axis_data, int axis_count, MeanParams* op_params); -TfLiteRegistration Register_MEAN(); -TfLiteRegistration Register_REDUCE_MAX(); -TfLiteRegistration Register_SUM(); +TfLiteRegistration_V1 Register_MEAN(); +TfLiteRegistration_V1 Register_REDUCE_MAX(); +TfLiteRegistration_V1 Register_SUM(); } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/reduce_common.cpp b/src/tensorflow/lite/micro/kernels/reduce_common.cpp index b2fceeb8..0dab49c2 100644 --- a/src/tensorflow/lite/micro/kernels/reduce_common.cpp +++ b/src/tensorflow/lite/micro/kernels/reduce_common.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
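The reduce_common.cpp hunk below is a simplification, not just a rename: EvalIntegerMean used to take a requantization-free fast path through a templated Mean<> when input and output shared zero point and scale, and fall back to QuantizedMeanOrSum<> otherwise. After the sync it always calls QuantizedMeanOrSum<>, which handles the matching-parameters case as well. The removed branch, with the template arguments the extraction dropped restored, looked like this:

if (op_data->input_zp == op_data->output_zp &&
    op_data->input_scale == op_data->output_scale) {
  Mean<integer_type, int32_t>(context, node, op_data, temp_index,
                              resolved_axis, temp_sum);  // removed fast path
} else {
  QuantizedMeanOrSum<integer_type, int32_t>(
      context, node, temp_index, resolved_axis, temp_sum, op_data,
      /*compute_sum=*/false);  // now the unconditional path
}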
@@ -160,26 +160,6 @@ TfLiteStatus QuantizedMeanOrSum(TfLiteContext* context, TfLiteNode* node, return kTfLiteOk; } -template <typename T, typename U> -TfLiteStatus Mean(TfLiteContext* context, TfLiteNode* node, - OpDataReduce* op_data, int* temp_index, int* resolved_axis, - U* temp_sum) { - const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); - const TfLiteEvalTensor* axis = tflite::micro::GetEvalInput(context, node, 1); - TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); - TfLiteReducerParams* params = - static_cast<TfLiteReducerParams*>(node->builtin_data); - - reference_ops::Mean( - tflite::micro::GetTensorData<T>(input), &input->dims->data[0], - input->dims->size, tflite::micro::GetTensorData<T>(output), - &output->dims->data[0], output->dims->size, - tflite::micro::GetTensorData<int>(axis), op_data->num_axis, - params->keep_dims, temp_index, resolved_axis, temp_sum); - - return kTfLiteOk; -} - template <typename integer_type> TfLiteStatus EvalIntegerMean(TfLiteContext* context, TfLiteNode* node, int num_axis, OpDataReduce* op_data, @@ -187,14 +167,9 @@ TfLiteStatus EvalIntegerMean(TfLiteContext* context, TfLiteNode* node, int32_t* temp_sum = static_cast<int32_t*>( context->GetScratchBuffer(context, op_data->temp_buffer_idx)); - if (op_data->input_zp == op_data->output_zp && - op_data->input_scale == op_data->output_scale) { - Mean<integer_type, int32_t>(context, node, op_data, temp_index, - resolved_axis, temp_sum); - } else { - QuantizedMeanOrSum<integer_type, int32_t>(context, node, temp_index, resolved_axis, - temp_sum, op_data, /*compute_sum=*/false); - } + QuantizedMeanOrSum<integer_type, int32_t>(context, node, temp_index, resolved_axis, + temp_sum, op_data, /*compute_sum=*/false); + return kTfLiteOk; } diff --git a/src/tensorflow/lite/micro/kernels/reshape.cpp b/src/tensorflow/lite/micro/kernels/reshape.cpp index 0c6806d1..7c8549a3 100644 --- a/src/tensorflow/lite/micro/kernels/reshape.cpp +++ b/src/tensorflow/lite/micro/kernels/reshape.cpp @@ -114,7 +114,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace reshape -TfLiteRegistration Register_RESHAPE() { +TfLiteRegistration_V1 Register_RESHAPE() { return tflite::micro::RegisterOp(nullptr, reshape::Prepare, reshape::Eval); } diff --git a/src/tensorflow/lite/micro/kernels/resize_bilinear.cpp b/src/tensorflow/lite/micro/kernels/resize_bilinear.cpp index 56432e1b..48f3b9d6 100644 --- a/src/tensorflow/lite/micro/kernels/resize_bilinear.cpp +++ b/src/tensorflow/lite/micro/kernels/resize_bilinear.cpp @@ -109,7 +109,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_RESIZE_BILINEAR() { +TfLiteRegistration_V1 Register_RESIZE_BILINEAR() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cpp b/src/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cpp index 4ed09d00..c6c8f6ff 100644 --- a/src/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cpp +++ b/src/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cpp @@ -116,7 +116,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_RESIZE_NEAREST_NEIGHBOR() { +TfLiteRegistration_V1 Register_RESIZE_NEAREST_NEIGHBOR() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/round.cpp b/src/tensorflow/lite/micro/kernels/round.cpp index 0bda8783..8db5fa2e 100644 --- a/src/tensorflow/lite/micro/kernels/round.cpp +++ b/src/tensorflow/lite/micro/kernels/round.cpp @@ -67,7 +67,7 @@ TfLiteStatus Eval(TfLiteContext* context,
TfLiteNode* node) { } } // namespace round -TfLiteRegistration Register_ROUND() { +TfLiteRegistration_V1 Register_ROUND() { return tflite::micro::RegisterOp(nullptr, round::Prepare, round::Eval); } diff --git a/src/tensorflow/lite/micro/kernels/select.cpp b/src/tensorflow/lite/micro/kernels/select.cpp index 1b05bd2f..d467c07f 100644 --- a/src/tensorflow/lite/micro/kernels/select.cpp +++ b/src/tensorflow/lite/micro/kernels/select.cpp @@ -189,7 +189,7 @@ TfLiteStatus SelectEval(TfLiteContext* context, TfLiteNode* node) { // // 1. Either the same shape (in which case the select is elementwise), or // 2. Broadcastable shapes between 'condition', 'x' and 'y'. -TfLiteRegistration Register_SELECT_V2() { +TfLiteRegistration_V1 Register_SELECT_V2() { return tflite::micro::RegisterOp(tflite::SelectInit, tflite::SelectPrepare, tflite::SelectEval); } diff --git a/src/tensorflow/lite/micro/kernels/shape.cpp b/src/tensorflow/lite/micro/kernels/shape.cpp index e85bb81f..3ced3209 100644 --- a/src/tensorflow/lite/micro/kernels/shape.cpp +++ b/src/tensorflow/lite/micro/kernels/shape.cpp @@ -60,7 +60,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SHAPE() { +TfLiteRegistration_V1 Register_SHAPE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/slice.cpp b/src/tensorflow/lite/micro/kernels/slice.cpp index cc3cd5b4..90e977a0 100644 --- a/src/tensorflow/lite/micro/kernels/slice.cpp +++ b/src/tensorflow/lite/micro/kernels/slice.cpp @@ -140,6 +140,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; + case kTfLiteBool: + reference_ops::Slice(op_params, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; default: MicroPrintf("Input tensor type %s (%d) not supported.", TfLiteTypeGetName(input->type), input->type); @@ -150,7 +157,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SLICE() { +TfLiteRegistration_V1 Register_SLICE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/softmax.h b/src/tensorflow/lite/micro/kernels/softmax.h index 0b498bd8..c9c18ca2 100644 --- a/src/tensorflow/lite/micro/kernels/softmax.h +++ b/src/tensorflow/lite/micro/kernels/softmax.h @@ -32,34 +32,36 @@ TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context, TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_SOFTMAX(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_SOFTMAX(); #if defined(XTENSA) || defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 input and int16 output. 
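// Aside: the guarded registrations above let applications bind reduced-type
// softmax kernels explicitly, while the inline fallbacks alias the generic
// kernel on other targets. A hedged usage sketch (the resolver object is
// hypothetical; the registration-taking AddSoftmax overload appears further
// down in this same diff):
tflite::MicroMutableOpResolver<1> op_resolver;  // capacity for one builtin
op_resolver.AddSoftmax(tflite::Register_SOFTMAX_INT8_INT16());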
-TfLiteRegistration Register_SOFTMAX_INT8_INT16(); +TfLiteRegistration_V1 Register_SOFTMAX_INT8_INT16(); #else -inline TfLiteRegistration Register_SOFTMAX_INT8_INT16() { +inline TfLiteRegistration_V1 Register_SOFTMAX_INT8_INT16() { return Register_SOFTMAX(); } #endif #if defined(ARDUINO) -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int8 input/output and uses the latency optimized implementations. -TfLiteRegistration Register_SOFTMAX_INT8(); +TfLiteRegistration_V1 Register_SOFTMAX_INT8(); -// Returns a TfLiteRegistration struct for kernel variant that only supports +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports // int16 input/output and uses the latency optimized implementations. -TfLiteRegistration Register_SOFTMAX_INT16(); +TfLiteRegistration_V1 Register_SOFTMAX_INT16(); #else -inline TfLiteRegistration Register_SOFTMAX_INT8() { return Register_SOFTMAX(); } +inline TfLiteRegistration_V1 Register_SOFTMAX_INT8() { + return Register_SOFTMAX(); +} -inline TfLiteRegistration Register_SOFTMAX_INT16() { +inline TfLiteRegistration_V1 Register_SOFTMAX_INT16() { return Register_SOFTMAX(); } #endif diff --git a/src/tensorflow/lite/micro/kernels/space_to_batch_nd.cpp b/src/tensorflow/lite/micro/kernels/space_to_batch_nd.cpp index 11b32c3f..a4dab2af 100644 --- a/src/tensorflow/lite/micro/kernels/space_to_batch_nd.cpp +++ b/src/tensorflow/lite/micro/kernels/space_to_batch_nd.cpp @@ -114,7 +114,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. -TfLiteRegistration Register_SPACE_TO_BATCH_ND() { +TfLiteRegistration_V1 Register_SPACE_TO_BATCH_ND() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/space_to_depth.cpp b/src/tensorflow/lite/micro/kernels/space_to_depth.cpp index 3640e2cd..99837ee0 100644 --- a/src/tensorflow/lite/micro/kernels/space_to_depth.cpp +++ b/src/tensorflow/lite/micro/kernels/space_to_depth.cpp @@ -120,7 +120,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SPACE_TO_DEPTH() { +TfLiteRegistration_V1 Register_SPACE_TO_DEPTH() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/split.cpp b/src/tensorflow/lite/micro/kernels/split.cpp index 226e4bf7..97d9a2d1 100644 --- a/src/tensorflow/lite/micro/kernels/split.cpp +++ b/src/tensorflow/lite/micro/kernels/split.cpp @@ -118,7 +118,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SPLIT() { +TfLiteRegistration_V1 Register_SPLIT() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/split_v.cpp b/src/tensorflow/lite/micro/kernels/split_v.cpp index 1d2fb559..ef5594eb 100644 --- a/src/tensorflow/lite/micro/kernels/split_v.cpp +++ b/src/tensorflow/lite/micro/kernels/split_v.cpp @@ -120,7 +120,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_SPLIT_V() { +TfLiteRegistration_V1 Register_SPLIT_V() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/squared_difference.cpp b/src/tensorflow/lite/micro/kernels/squared_difference.cpp index 8786a871..b0cd389c 100644 --- a/src/tensorflow/lite/micro/kernels/squared_difference.cpp +++ 
b/src/tensorflow/lite/micro/kernels/squared_difference.cpp
@@ -44,6 +44,44 @@ void* SquaredDifferenceInit(TfLiteContext* context, const char* buffer,
   return context->AllocatePersistentBuffer(context, sizeof(OpData));
 }
 
+void PrepareQuantized(
+    const TfLiteQuantizationParams& input1_quantization_params,
+    const TfLiteQuantizationParams& input2_quantization_params,
+    const TfLiteQuantizationParams& output_quantization_params,
+    const int left_shift, const int32_t quantized_activation_min,
+    const int32_t quantized_activation_max, OpData* data) {
+  data->arithmetic_params.input1_offset =
+      -input1_quantization_params.zero_point;
+  data->arithmetic_params.input2_offset =
+      -input2_quantization_params.zero_point;
+  data->arithmetic_params.output_offset = output_quantization_params.zero_point;
+  data->arithmetic_params.left_shift = left_shift;
+  const double twice_max_input_scale =
+      2.0 * static_cast<double>(std::max(input1_quantization_params.scale,
+                                         input2_quantization_params.scale));
+  const double real_input1_multiplier =
+      static_cast<double>(input1_quantization_params.scale) /
+      twice_max_input_scale;
+  double real_input2_multiplier =
+      static_cast<double>(input2_quantization_params.scale) /
+      twice_max_input_scale;
+  const double real_output_multiplier =
+      (twice_max_input_scale * twice_max_input_scale) /
+      static_cast<double>((1 << data->arithmetic_params.left_shift * 2) *
+                          output_quantization_params.scale);
+  QuantizeMultiplierSmallerThanOneExp(
+      real_input1_multiplier, &data->arithmetic_params.input1_multiplier,
+      &data->arithmetic_params.input1_shift);
+  QuantizeMultiplierSmallerThanOneExp(
+      real_input2_multiplier, &data->arithmetic_params.input2_multiplier,
+      &data->arithmetic_params.input2_shift);
+  QuantizeMultiplier(real_output_multiplier,
+                     &data->arithmetic_params.output_multiplier,
+                     &data->arithmetic_params.output_shift);
+  data->arithmetic_params.quantized_activation_min = quantized_activation_min;
+  data->arithmetic_params.quantized_activation_max = quantized_activation_max;
+}
+
 TfLiteStatus SquaredDifferencePrepare(TfLiteContext* context,
                                       TfLiteNode* node) {
   TFLITE_DCHECK(node->user_data != nullptr);
@@ -68,11 +106,10 @@ TfLiteStatus SquaredDifferencePrepare(TfLiteContext* context,
   TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
   output->type = input2->type;
 
-  // Ensure the quantization parameters are equivalent.
+  const TfLiteQuantizationParams& input1_quantization_params = input1->params;
+  const TfLiteQuantizationParams& input2_quantization_params = input2->params;
+  const TfLiteQuantizationParams& output_quantization_params = output->params;
   if (input1->type == kTfLiteInt8) {
-    const auto& input1_quantization_params = input1->params;
-    const auto& input2_quantization_params = input2->params;
-    const auto& output_quantization_params = output->params;
     const int32_t integer_type_min = std::numeric_limits<int8_t>::min();
     const int32_t integer_type_max = std::numeric_limits<int8_t>::max();
     TF_LITE_ENSURE(context,
@@ -87,43 +124,25 @@ TfLiteStatus SquaredDifferencePrepare(TfLiteContext* context,
                    output_quantization_params.zero_point >= integer_type_min);
     TF_LITE_ENSURE(context,
                    output_quantization_params.zero_point <= integer_type_max);
-    data->arithmetic_params.input1_offset =
-        -input1_quantization_params.zero_point;
-    data->arithmetic_params.input2_offset =
-        -input2_quantization_params.zero_point;
-    data->arithmetic_params.output_offset =
-        output_quantization_params.zero_point;
-
-    // shift to make integer for scales.
-    // 7 is selected so that maximum shifted result 255^2 * (1 << (7 * 2 ))
-    // does not overflow signed 32-bit integer
-    data->arithmetic_params.left_shift = 7;
-    const double twice_max_input_scale =
-        2.0 * static_cast<double>(std::max(input1_quantization_params.scale,
-                                           input2_quantization_params.scale));
-    const double real_input1_multiplier =
-        static_cast<double>(input1_quantization_params.scale) /
-        twice_max_input_scale;
-    double real_input2_multiplier =
-        static_cast<double>(input2_quantization_params.scale) /
-        twice_max_input_scale;
-    const double real_output_multiplier =
-        (twice_max_input_scale * twice_max_input_scale) /
-        static_cast<double>((1 << data->arithmetic_params.left_shift * 2) *
-                            output_quantization_params.scale);
-    QuantizeMultiplierSmallerThanOneExp(
-        real_input1_multiplier, &data->arithmetic_params.input1_multiplier,
-        &data->arithmetic_params.input1_shift);
-    QuantizeMultiplierSmallerThanOneExp(
-        real_input2_multiplier, &data->arithmetic_params.input2_multiplier,
-        &data->arithmetic_params.input2_shift);
-    QuantizeMultiplierSmallerThanOneExp(
-        real_output_multiplier, &data->arithmetic_params.output_multiplier,
-        &data->arithmetic_params.output_shift);
-    data->arithmetic_params.quantized_activation_min =
-        std::numeric_limits<int8_t>::min();
-    data->arithmetic_params.quantized_activation_max =
-        std::numeric_limits<int8_t>::max();
+    // left_shift = 7 is selected so that the maximum shifted result
+    // 255^2 * (1 << (7 * 2)) does not overflow a signed 32-bit integer.
+    PrepareQuantized(input1_quantization_params, input2_quantization_params,
+                     output_quantization_params, /*left_shift=*/7,
+                     /*quantized_activation_min=*/integer_type_min,
+                     /*quantized_activation_max=*/integer_type_max, data);
+  } else if (input1->type == kTfLiteInt16) {
+    const int32_t integer_type_min = std::numeric_limits<int16_t>::min();
+    const int32_t integer_type_max = std::numeric_limits<int16_t>::max();
+    TF_LITE_ENSURE(context, input1_quantization_params.zero_point == 0);
+    TF_LITE_ENSURE(context, input2_quantization_params.zero_point == 0);
+    TF_LITE_ENSURE(context, output_quantization_params.zero_point == 0);
+
+    // left_shift = 0 because the values are already 16-bit, so the maximum
+    // shifted result 32767^2 * (1 << (0 * 2)) still fits in a signed
+    // 32-bit integer.
+    PrepareQuantized(input1_quantization_params, input2_quantization_params,
+                     output_quantization_params, /*left_shift=*/0,
+                     /*quantized_activation_min=*/integer_type_min,
+                     /*quantized_activation_max=*/integer_type_max, data);
   }
 
   data->requires_broadcast = !HaveSameShapes(input1, input2);
@@ -134,8 +153,8 @@ TfLiteStatus SquaredDifferencePrepare(TfLiteContext* context,
   return kTfLiteOk;
 }
 
-inline int8_t SquaredDifference(int8_t x, int8_t y,
-                                const ArithmeticParams& params) {
+template <typename T>
+T SquaredDifference(T x, T y, const ArithmeticParams& params) {
   const int32_t input1_val = params.input1_offset + x;
   const int32_t input2_val = params.input2_offset + y;
   const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
@@ -148,16 +167,16 @@ inline int8_t SquaredDifference(int8_t x, int8_t y,
       shifted_input2_val, params.input2_multiplier, params.input2_shift);
 
   const int32_t raw_diff = scaled_input1_val - scaled_input2_val;
-  // Max of this is 255^2 * (1 << 14), so won't overflow 32 bits.
+  // Max of this is 32767^2 * (1 << 0), so won't overflow 32 bits.
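  // Aside (hedged check of the two left_shift choices above): for int8,
  // |raw_diff| can reach 255 << 7, so squared_raw_diff can reach
  // 255^2 * 2^14 = 65025 * 16384 = 1,065,369,600; for int16, left_shift is 0
  // and squared_raw_diff can reach 32767^2 = 1,073,676,289. Both stay below
  // INT32_MAX = 2,147,483,647, while any nonzero shift on the int16 path
  // would overflow, which is why it uses left_shift = 0.
  static_assert(65025LL * 16384LL < 2147483647LL, "int8 path fits in int32_t");
  static_assert(32767LL * 32767LL < 2147483647LL, "int16 path fits in int32_t");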
   const int32_t squared_raw_diff = raw_diff * raw_diff;
   const int32_t raw_output =
-      MultiplyByQuantizedMultiplierSmallerThanOneExp(
-          squared_raw_diff, params.output_multiplier, params.output_shift) +
+      MultiplyByQuantizedMultiplier(squared_raw_diff, params.output_multiplier,
+                                    params.output_shift) +
       params.output_offset;
   const int32_t clamped_output =
       std::min(params.quantized_activation_max,
                std::max(params.quantized_activation_min, raw_output));
-  return static_cast<int8_t>(clamped_output);
+  return static_cast<T>(clamped_output);
 }
 
 template <typename T>
@@ -180,9 +199,9 @@ void EvalQuantizedSquaredDifference(TfLiteContext* context, TfLiteNode* node,
     const int flat_size = tflite::micro::GetTensorShape(input1).FlatSize();
     reference_integer_ops::ElementWise(
         flat_size, op_data->arithmetic_params,
-        tflite::micro::GetTensorData<int8_t>(input1),
-        tflite::micro::GetTensorData<int8_t>(input2),
-        tflite::micro::GetTensorData<int8_t>(output),
+        tflite::micro::GetTensorData<T>(input1),
+        tflite::micro::GetTensorData<T>(input2),
+        tflite::micro::GetTensorData<T>(output),
         reference_integer_ops::CheckArithmeticParams, SquaredDifference);
   }
 }
@@ -228,9 +247,13 @@ TfLiteStatus SquaredDifferenceEval(TfLiteContext* context, TfLiteNode* node) {
   } else if (output->type == kTfLiteInt8) {
     EvalQuantizedSquaredDifference<int8_t>(context, node, data, input1, input2,
                                            output);
+  } else if (output->type == kTfLiteInt16) {
+    EvalQuantizedSquaredDifference<int16_t>(context, node, data, input1, input2,
+                                            output);
   } else {
     MicroPrintf(
-        "SquaredDifference only supports FLOAT32, INT32 and INT8 now, got %d.",
+        "SquaredDifference only supports FLOAT32, INT32, INT16 and INT8 now, "
+        "got %d.",
         output->type);
     return kTfLiteError;
   }
@@ -239,7 +262,7 @@ TfLiteStatus SquaredDifferenceEval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace
 
-TfLiteRegistration Register_SQUARED_DIFFERENCE() {
+TfLiteRegistration_V1 Register_SQUARED_DIFFERENCE() {
   return tflite::micro::RegisterOp(
       SquaredDifferenceInit, SquaredDifferencePrepare, SquaredDifferenceEval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/squeeze.cpp b/src/tensorflow/lite/micro/kernels/squeeze.cpp
index 01753849..3ebf448d 100644
--- a/src/tensorflow/lite/micro/kernels/squeeze.cpp
+++ b/src/tensorflow/lite/micro/kernels/squeeze.cpp
@@ -111,7 +111,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace
 
-TfLiteRegistration Register_SQUEEZE() {
+TfLiteRegistration_V1 Register_SQUEEZE() {
   return tflite::micro::RegisterOp(nullptr, Prepare, Eval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/strided_slice.cpp b/src/tensorflow/lite/micro/kernels/strided_slice.cpp
index fede9548..e31f32c6 100644
--- a/src/tensorflow/lite/micro/kernels/strided_slice.cpp
+++ b/src/tensorflow/lite/micro/kernels/strided_slice.cpp
@@ -200,7 +200,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace
 
-TfLiteRegistration Register_STRIDED_SLICE() {
+TfLiteRegistration_V1 Register_STRIDED_SLICE() {
   return tflite::micro::RegisterOp(Init, Prepare, Eval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/sub.cpp b/src/tensorflow/lite/micro/kernels/sub.cpp
index a54c488f..38df0bb2 100644
--- a/src/tensorflow/lite/micro/kernels/sub.cpp
+++ b/src/tensorflow/lite/micro/kernels/sub.cpp
@@ -161,7 +161,7 @@ TfLiteStatus SubEval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-TfLiteRegistration Register_SUB() {
+TfLiteRegistration_V1 Register_SUB() {
   return tflite::micro::RegisterOp(SubInit, SubPrepare, SubEval);
 }
 
diff --git a/src/tensorflow/lite/micro/kernels/svdf.h
b/src/tensorflow/lite/micro/kernels/svdf.h index c081cf2f..33390854 100644 --- a/src/tensorflow/lite/micro/kernels/svdf.h +++ b/src/tensorflow/lite/micro/kernels/svdf.h @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -77,13 +77,14 @@ void EvalFloatSvdfReference( TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node); -// This is the most generic TfLiteRegistration. The actual supported types may -// still be target dependent. The only requirement is that every implementation -// (reference or optimized) must define this function. -TfLiteRegistration Register_SVDF(); +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +TfLiteRegistration_V1 Register_SVDF(); -#if defined(HEXAGON) || defined(ARDUINO) -TfLiteRegistration Register_SVDF_INT8(); +#if defined(HEXAGON) || defined(ARDUINO) || defined(XTENSA) + +TfLiteRegistration_V1 Register_SVDF_INT8(); #else // Note that while this block gets used for both reference and optimized kernels @@ -91,7 +92,7 @@ TfLiteRegistration Register_SVDF_INT8(); // define fallback implementation that allow reference kernels to still be used // from applications that call a more specific kernel variant. -inline TfLiteRegistration Register_SVDF_INT8() { return Register_SVDF(); } +inline TfLiteRegistration_V1 Register_SVDF_INT8() { return Register_SVDF(); } #endif } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/svdf_common.cpp b/src/tensorflow/lite/micro/kernels/svdf_common.cpp index fb92b4fd..d7dd963f 100644 --- a/src/tensorflow/lite/micro/kernels/svdf_common.cpp +++ b/src/tensorflow/lite/micro/kernels/svdf_common.cpp @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -303,6 +303,7 @@ void EvalFloatSvdfReference( tflite::micro::GetTensorData(weights_feature); const float* weights_time_ptr = tflite::micro::GetTensorData(weights_time); + // TODO(#1751): account for optional bias tensor const float* bias_ptr = tflite::micro::GetTensorData(bias); const float* input_ptr = tflite::micro::GetTensorData(input); @@ -459,6 +460,7 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { weights_time->params.scale / output->params.scale); // TODO(b/162018098): Use TF_LITE_ENSURE_NEAR when it is ready. 
+ // TODO(#1751): account for optional bias tensor TF_LITE_ENSURE( context, std::abs(static_cast(bias->params.scale) - @@ -507,6 +509,7 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { micro_context->DeallocateTempTfLiteTensor(weights_time); micro_context->DeallocateTempTfLiteTensor(activation_state); micro_context->DeallocateTempTfLiteTensor(output); + // TODO(#1751): account for optional bias tensor micro_context->DeallocateTempTfLiteTensor(bias); return kTfLiteOk; } diff --git a/src/tensorflow/lite/micro/kernels/tanh.cpp b/src/tensorflow/lite/micro/kernels/tanh.cpp index 33ea8d2b..060cb38c 100644 --- a/src/tensorflow/lite/micro/kernels/tanh.cpp +++ b/src/tensorflow/lite/micro/kernels/tanh.cpp @@ -192,7 +192,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_TANH() { +TfLiteRegistration_V1 Register_TANH() { return tflite::micro::RegisterOp(TanhInit, TanhPrepare, TanhEval); } diff --git a/src/tensorflow/lite/micro/kernels/transpose.cpp b/src/tensorflow/lite/micro/kernels/transpose.cpp index daa75f17..00e907e5 100644 --- a/src/tensorflow/lite/micro/kernels/transpose.cpp +++ b/src/tensorflow/lite/micro/kernels/transpose.cpp @@ -116,7 +116,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_TRANSPOSE() { +TfLiteRegistration_V1 Register_TRANSPOSE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/src/tensorflow/lite/micro/kernels/transpose_conv.cpp b/src/tensorflow/lite/micro/kernels/transpose_conv.cpp index 9ea31454..dc0ee171 100644 --- a/src/tensorflow/lite/micro/kernels/transpose_conv.cpp +++ b/src/tensorflow/lite/micro/kernels/transpose_conv.cpp @@ -166,6 +166,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { micro_context->AllocateTempInputTensor(node, kFilterTensor); TF_LITE_ENSURE(context, filter != nullptr); + TF_LITE_ENSURE_MSG( + context, + input->type == filter->type || + (input->type == kTfLiteInt16 && filter->type == kTfLiteInt8), + "Hybrid models are not supported on TFLite Micro."); + // Get height and width of the output. const int width = SizeOfDimension(output, 2); const int height = SizeOfDimension(output, 1); @@ -253,11 +259,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const OpData& data = *(static_cast(node->user_data)); TF_LITE_ENSURE_EQ(context, input->type, output->type); - TF_LITE_ENSURE_MSG( - context, - input->type == filter->type || - (input->type == kTfLiteInt16 && filter->type == kTfLiteInt8), - "Hybrid models are not supported on TFLite Micro."); switch (input->type) { // Already know in/out types are same. case kTfLiteFloat32: { @@ -344,7 +345,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_TRANSPOSE_CONV() { +TfLiteRegistration_V1 Register_TRANSPOSE_CONV() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h b/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h new file mode 100644 index 00000000..a6071663 --- /dev/null +++ b/src/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h @@ -0,0 +1,47 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_H_ + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { + +// This is the most generic TfLiteRegistration_V1. The actual supported types +// may still be target dependent. The only requirement is that every +// implementation (reference or optimized) must define this function. +// TODO(b/230666079): resolve conflict with xtensa implementation +TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); + +#if defined(ARDUINO) +// Returns a TfLiteRegistration_V1 struct for kernel variant that only supports +// int8 activations and int8 weights and uses the latency optimized +// implementations. +TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM_INT8(); + +#else +inline TfLiteRegistration_V1 Register_UNIDIRECTIONAL_SEQUENCE_LSTM_INT8() { + return Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); +} +#endif + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_H_ diff --git a/src/tensorflow/lite/micro/kernels/unpack.cpp b/src/tensorflow/lite/micro/kernels/unpack.cpp index 4ade8f3f..d6fcf62c 100644 --- a/src/tensorflow/lite/micro/kernels/unpack.cpp +++ b/src/tensorflow/lite/micro/kernels/unpack.cpp @@ -101,7 +101,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace -TfLiteRegistration Register_UNPACK() { +TfLiteRegistration_V1 Register_UNPACK() { return tflite::micro::RegisterOp(nullptr, nullptr, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/var_handle.cpp b/src/tensorflow/lite/micro/kernels/var_handle.cpp index cbd2485c..5ddf90f2 100644 --- a/src/tensorflow/lite/micro/kernels/var_handle.cpp +++ b/src/tensorflow/lite/micro/kernels/var_handle.cpp @@ -86,7 +86,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. -TfLiteRegistration Register_VAR_HANDLE() { +TfLiteRegistration_V1 Register_VAR_HANDLE() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/while.cpp b/src/tensorflow/lite/micro/kernels/while.cpp index 811c9eae..65c5ac8a 100644 --- a/src/tensorflow/lite/micro/kernels/while.cpp +++ b/src/tensorflow/lite/micro/kernels/while.cpp @@ -126,7 +126,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
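// Aside: the new unidirectional_sequence_lstm.h header follows the same
// variant pattern as softmax.h and svdf.h above. A hedged sketch of binding
// the int8-specialized LSTM kernel (the resolver object is hypothetical;
// the registration-taking AddUnidirectionalSequenceLSTM overload is added
// later in this diff, and on non-ARDUINO targets the inline fallback makes
// this identical to the generic registration):
tflite::MicroMutableOpResolver<1> op_resolver;
op_resolver.AddUnidirectionalSequenceLSTM(
    tflite::Register_UNIDIRECTIONAL_SEQUENCE_LSTM_INT8());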
-TfLiteRegistration Register_WHILE() { +TfLiteRegistration_V1 Register_WHILE() { return tflite::micro::RegisterOp(Init, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/kernels/zeros_like.cpp b/src/tensorflow/lite/micro/kernels/zeros_like.cpp index bb0c3147..5c702abd 100644 --- a/src/tensorflow/lite/micro/kernels/zeros_like.cpp +++ b/src/tensorflow/lite/micro/kernels/zeros_like.cpp @@ -81,7 +81,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } // namespace -TfLiteRegistration Register_ZEROS_LIKE() { +TfLiteRegistration_V1 Register_ZEROS_LIKE() { return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } diff --git a/src/tensorflow/lite/micro/memory_helpers.cpp b/src/tensorflow/lite/micro/memory_helpers.cpp index dbc5e014..b306811f 100644 --- a/src/tensorflow/lite/micro/memory_helpers.cpp +++ b/src/tensorflow/lite/micro/memory_helpers.cpp @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -104,7 +104,7 @@ TfLiteStatus BytesRequiredForTensor(const tflite::Tensor& flatbuffer_tensor, // If flatbuffer_tensor.shape == nullptr, then flatbuffer_tensor is a scalar // so has 1 element. if (flatbuffer_tensor.shape() != nullptr) { - for (size_t n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) { + for (size_t n = 0; n < flatbuffer_tensor.shape()->size(); ++n) { element_count *= flatbuffer_tensor.shape()->Get(n); } } diff --git a/src/tensorflow/lite/micro/micro_allocation_info.cpp b/src/tensorflow/lite/micro/micro_allocation_info.cpp index 0160cb14..a89a5e6c 100644 --- a/src/tensorflow/lite/micro/micro_allocation_info.cpp +++ b/src/tensorflow/lite/micro/micro_allocation_info.cpp @@ -179,6 +179,7 @@ TfLiteStatus AllocationInfoBuilder::InitializeAllocationInfo( const int32_t* offline_offsets, SubgraphAllocations* allocations) { AllocationInfo* allocation_info = info_.allocation_info; // Initialize allocation info for every tensor in every subgraph. + int offline_index = 0; for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size(); subgraph_idx++) { const SubGraph* subgraph = model_->subgraphs()->Get(subgraph_idx); @@ -203,7 +204,7 @@ TfLiteStatus AllocationInfoBuilder::InitializeAllocationInfo( (!subgraph->tensors()->Get(i)->is_variable()) && (current->bytes != 0); if (offline_offsets) { - current->offline_offset = offline_offsets[i]; + current->offline_offset = offline_offsets[offline_index++]; // Mark offline planned variable tensors so they can get an offline // offset and be handled offline. 
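// Aside: the offline_index change above matters for multi-subgraph models.
// Offline planned offsets form one flat array spanning every subgraph, so
// reusing the per-subgraph tensor index i would re-read subgraph 0's
// entries. A schematic of the intended traversal (num_subgraphs and
// num_tensors_in are hypothetical helpers for illustration only):
int offline_index = 0;  // advances monotonically across all subgraphs
for (size_t s = 0; s < num_subgraphs; ++s) {
  for (size_t i = 0; i < num_tensors_in(s); ++i) {
    info[s][i].offline_offset = offline_offsets[offline_index++];
  }
}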
@@ -211,7 +212,6 @@ TfLiteStatus AllocationInfoBuilder::InitializeAllocationInfo( current->offline_offset != kOnlinePlannedBuffer) { current->needs_allocating = true; } - } else { current->offline_offset = kOnlinePlannedBuffer; } diff --git a/src/tensorflow/lite/micro/micro_allocator.cpp b/src/tensorflow/lite/micro/micro_allocator.cpp index 4585079c..0b199519 100644 --- a/src/tensorflow/lite/micro/micro_allocator.cpp +++ b/src/tensorflow/lite/micro/micro_allocator.cpp @@ -703,6 +703,14 @@ TfLiteTensor* MicroAllocator::AllocateTempTfLiteTensor( return tensor; } +uint8_t* MicroAllocator::AllocateTempBuffer(size_t size, size_t alignment) { + return non_persistent_buffer_allocator_->AllocateTemp(size, alignment); +} + +void MicroAllocator::DeallocateTempBuffer(uint8_t* buffer) { + non_persistent_buffer_allocator_->DeallocateTemp(buffer); +} + TfLiteStatus MicroAllocator::ResetTempAllocations() { return non_persistent_buffer_allocator_->ResetTempAllocations(); } diff --git a/src/tensorflow/lite/micro/micro_allocator.h b/src/tensorflow/lite/micro/micro_allocator.h index c68c7135..05dbf892 100644 --- a/src/tensorflow/lite/micro/micro_allocator.h +++ b/src/tensorflow/lite/micro/micro_allocator.h @@ -68,7 +68,7 @@ struct ScratchBufferRequest { struct NodeAndRegistration { TfLiteNode node; - const TfLiteRegistration* registration; + const TfLiteRegistration_V1* registration; }; // Holds a pointer to a buffer for a scratch buffer requested by a kernel during @@ -195,6 +195,13 @@ class MicroAllocator { virtual void DeallocateTempTfLiteTensor(TfLiteTensor*); + // Returns a pointer to a buffer from the temporary arena memory and is only + // guaranteed until a call is made to ResetTempAllocations(). + virtual uint8_t* AllocateTempBuffer(size_t size, size_t alignment); + + // Signals that the temporary buffer no longer needed. + virtual void DeallocateTempBuffer(uint8_t* buffer); + // Resets all temporary allocations. This method should be called after a // chain of temp allocations (e.g. chain of TfLiteTensor objects via // AllocateTfLiteTensor()). diff --git a/src/tensorflow/lite/micro/micro_context.cpp b/src/tensorflow/lite/micro/micro_context.cpp index bb78fe70..b06252ac 100644 --- a/src/tensorflow/lite/micro/micro_context.cpp +++ b/src/tensorflow/lite/micro/micro_context.cpp @@ -19,26 +19,34 @@ limitations under the License. 
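// Aside: the AllocateTempBuffer/DeallocateTempBuffer pair added above gives
// kernels arena-backed scratch memory whose lifetime is confined to Prepare.
// A hedged sketch of the intended call pattern from a kernel's Prepare
// (buffer size and use are hypothetical; GetMicroContext is assumed to be
// the existing TFLM accessor for the MicroContext behind a TfLiteContext):
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
  uint8_t* scratch =
      micro_context->AllocateTempBuffer(/*size=*/256, alignof(int32_t));
  if (scratch == nullptr) return kTfLiteError;
  // ... one-off Prepare-time work using scratch ...
  micro_context->DeallocateTempBuffer(scratch);  // release before returning
  return kTfLiteOk;
}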
#include #include +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/micro_log.h" namespace tflite { MicroContext::MicroContext(MicroAllocator* allocator, const Model* model, MicroGraph* graph) - : allocator_(*allocator), graph_(*graph), model_(model) {} + : allocator_(*allocator), + graph_(*graph), + model_(model), + state_(InterpreterState::kInit) {} MicroContext::~MicroContext() {} void* MicroContext::AllocatePersistentBuffer(size_t bytes) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare || + state_ == InterpreterState::kInit); return allocator_.AllocatePersistentBuffer(bytes); } TfLiteStatus MicroContext::RequestScratchBufferInArena(size_t bytes, int* buffer_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare); return allocator_.RequestScratchBufferInArena( bytes, graph_.GetCurrentSubgraphIndex(), buffer_idx); } void* MicroContext::GetScratchBuffer(int buffer_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kInvoke); ScratchBufferHandle* handle = scratch_buffer_handles_ + buffer_idx; return handle->data; } @@ -94,6 +102,16 @@ void MicroContext::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) { return allocator_.DeallocateTempTfLiteTensor(tensor); } +uint8_t* MicroContext::AllocateTempBuffer(size_t size, size_t alignment) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare); + return allocator_.AllocateTempBuffer(size, alignment); +} + +void MicroContext::DeallocateTempBuffer(uint8_t* buffer) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare); + allocator_.DeallocateTempBuffer(buffer); +} + TfLiteEvalTensor* MicroContext::GetEvalTensor(int tensor_idx) { return &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()] .tensors[tensor_idx]; @@ -106,6 +124,8 @@ void MicroContext::SetScratchBufferHandles( TfLiteStatus MicroContext::set_external_context( void* external_context_payload) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare || + state_ == InterpreterState::kInvoke); if (external_context_payload == nullptr || external_context_payload_ != nullptr) { MicroPrintf( @@ -126,4 +146,12 @@ void MicroContextReportOpError(struct TfLiteContext* context, va_end(args); } +void MicroContext::SetInterpreterState(MicroContext::InterpreterState state) { + state_ = state; +} + +MicroContext::InterpreterState MicroContext::GetInterpreterState() const { + return state_; +} + } // namespace tflite diff --git a/src/tensorflow/lite/micro/micro_context.h b/src/tensorflow/lite/micro/micro_context.h index e7be6544..63b4b7d5 100644 --- a/src/tensorflow/lite/micro/micro_context.h +++ b/src/tensorflow/lite/micro/micro_context.h @@ -29,6 +29,15 @@ namespace tflite { // micro_context-> class MicroContext { public: + // Enum that allows MicroContext to keep track of the stages different memory + // planning APIs are available to kernels. + enum class InterpreterState { + kInit, + kPrepare, + kMemoryPlanning, + kInvoke, + }; + // Does not take any ownership, and all pointers must refer to valid objects // that outlive the one constructed. explicit MicroContext(MicroAllocator* allocator, const Model* model, @@ -84,10 +93,26 @@ class MicroContext { // Virtual so that it can be faked for kernel tests. virtual void DeallocateTempTfLiteTensor(TfLiteTensor* tensor); + // Returns a pointer to a temporary buffer (from the arena). + // This API is only valid from the kernel's Prepare function and + // the buffer's lifetime is also that of the Prepare function. + // Virtual so that it can be faked for kernel tests. 
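// Aside: a hedged summary of the stage gating introduced above, read
// directly from the TFLITE_DCHECKs in micro_context.cpp (not an official
// table):
//   kInit           -> AllocatePersistentBuffer
//   kPrepare        -> AllocatePersistentBuffer, RequestScratchBufferInArena,
//                      AllocateTempBuffer / DeallocateTempBuffer,
//                      set_external_context
//   kMemoryPlanning -> no kernel-facing allocation calls
//   kInvoke         -> GetScratchBuffer, set_external_context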
+ virtual uint8_t* AllocateTempBuffer(size_t size, size_t alignment); + + // Signals that the temporary buffer is no longer needed. + // Virtual so that it can be faked for kernel tests. + virtual void DeallocateTempBuffer(uint8_t* buffer); + // Returns a TfLiteEvalTensor struct for a given index. // Virtual so that it can be faked for kernel tests. virtual TfLiteEvalTensor* GetEvalTensor(int tensor_idx); + // Sets the State of MemoryPlanning MicroContext + void SetInterpreterState(MicroContext::InterpreterState state); + + // Sets the State of MemoryPlanning MicroContext + MicroContext::InterpreterState GetInterpreterState() const; + // Does not take ownership of the pointer and the pointer must refer to valid // an object that outlive this class instance. // This can only be called once to set one external context. @@ -110,6 +135,7 @@ class MicroContext { MicroAllocator& allocator_; MicroGraph& graph_; const Model* model_; + InterpreterState state_; ScratchBufferHandle* scratch_buffer_handles_ = nullptr; void* external_context_payload_ = nullptr; diff --git a/src/tensorflow/lite/micro/micro_graph.cpp b/src/tensorflow/lite/micro/micro_graph.cpp index 6007e2d3..4d412e73 100644 --- a/src/tensorflow/lite/micro/micro_graph.cpp +++ b/src/tensorflow/lite/micro/micro_graph.cpp @@ -27,7 +27,7 @@ limitations under the License. namespace tflite { namespace { -const char* OpNameFromRegistration(const TfLiteRegistration* registration) { +const char* OpNameFromRegistration(const TfLiteRegistration_V1* registration) { if (registration->builtin_code == BuiltinOperator_CUSTOM) { return registration->custom_name; } else { @@ -62,7 +62,7 @@ TfLiteStatus MicroGraph::InitSubgraphs() { for (size_t i = 0; i < operators_size; ++i) { TfLiteNode* node = &(subgraph_allocations_[subgraph_idx].node_and_registrations[i].node); - const TfLiteRegistration* registration = + const TfLiteRegistration_V1* registration = subgraph_allocations_[subgraph_idx] .node_and_registrations[i] .registration; @@ -96,7 +96,7 @@ TfLiteStatus MicroGraph::PrepareSubgraphs() { for (size_t i = 0; i < operators_size; ++i) { TfLiteNode* node = &(subgraph_allocations_[subgraph_idx].node_and_registrations[i].node); - const TfLiteRegistration* registration = + const TfLiteRegistration_V1* registration = subgraph_allocations_[subgraph_idx] .node_and_registrations[i] .registration; @@ -126,7 +126,7 @@ TfLiteStatus MicroGraph::FreeSubgraphs() { for (size_t i = 0; i < operators_size; ++i) { TfLiteNode* node = &(subgraph_allocations_[subgraph_idx].node_and_registrations[i].node); - const TfLiteRegistration* registration = + const TfLiteRegistration_V1* registration = subgraph_allocations_[subgraph_idx] .node_and_registrations[i] .registration; @@ -155,9 +155,10 @@ TfLiteStatus MicroGraph::InvokeSubgraph(int subgraph_idx) { for (size_t i = 0; i < operators_size; ++i) { TfLiteNode* node = &(subgraph_allocations_[subgraph_idx].node_and_registrations[i].node); - const TfLiteRegistration* registration = subgraph_allocations_[subgraph_idx] - .node_and_registrations[i] - .registration; + const TfLiteRegistration_V1* registration = + subgraph_allocations_[subgraph_idx] + .node_and_registrations[i] + .registration; // This ifdef is needed (even though ScopedMicroProfiler itself is a no-op with // -DTF_LITE_STRIP_ERROR_STRINGS) because the function OpNameFromRegistration is diff --git a/src/tensorflow/lite/micro/micro_graph.h b/src/tensorflow/lite/micro/micro_graph.h index 942082ac..ce93d339 100644 --- a/src/tensorflow/lite/micro/micro_graph.h +++ 
b/src/tensorflow/lite/micro/micro_graph.h @@ -38,20 +38,20 @@ class MicroGraph { MicroResourceVariables* resource_variables); virtual ~MicroGraph(); - // Sets up builtin data and calls TfLiteRegistration->Init for every operator - // in every subgraph in the model. + // Sets up builtin data and calls TfLiteRegistration_V1->Init for every + // operator in every subgraph in the model. virtual TfLiteStatus InitSubgraphs(); - // Calls TfLiteRegistration->Prepare for every operator in every subgraph in - // the model. + // Calls TfLiteRegistration_V1->Prepare for every operator in every subgraph + // in the model. virtual TfLiteStatus PrepareSubgraphs(); - // Calls TfLiteRegistration->Free for every operator in every subgraph in the - // model. + // Calls TfLiteRegistration_V1->Free for every operator in every subgraph in + // the model. virtual TfLiteStatus FreeSubgraphs(); - // Calls TfLiteRegistration->Invoke for every operator in a single subgraph in - // the model. + // Calls TfLiteRegistration_V1->Invoke for every operator in a single subgraph + // in the model. virtual TfLiteStatus InvokeSubgraph(int subgraph_idx); // Zeros out all variable tensors in all subgraphs in the model. diff --git a/src/tensorflow/lite/micro/micro_interpreter.cpp b/src/tensorflow/lite/micro/micro_interpreter.cpp index 91c7481d..75c3f628 100644 --- a/src/tensorflow/lite/micro/micro_interpreter.cpp +++ b/src/tensorflow/lite/micro/micro_interpreter.cpp @@ -24,11 +24,11 @@ limitations under the License. #include "tensorflow/lite/micro/flatbuffer_utils.h" #include "tensorflow/lite/micro/memory_helpers.h" #include "tensorflow/lite/micro/micro_allocator.h" +#include "tensorflow/lite/micro/micro_context.h" #include "tensorflow/lite/micro/micro_log.h" #include "tensorflow/lite/micro/micro_op_resolver.h" #include "tensorflow/lite/micro/micro_profiler_interface.h" #include "tensorflow/lite/micro/tflite_bridge/flatbuffer_conversions_bridge.h" -#include "tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/schema/schema_utils.h" @@ -77,11 +77,17 @@ MicroInterpreter::~MicroInterpreter() { } void MicroInterpreter::Init(MicroProfilerInterface* profiler) { + micro_context_.SetInterpreterState(MicroContext::InterpreterState::kInit); context_.impl_ = static_cast(µ_context_); context_.ReportError = MicroContextReportOpError; context_.GetTensor = MicroContextGetTensor; context_.GetEvalTensor = MicroContextGetEvalTensor; context_.profiler = profiler; + context_.RequestScratchBufferInArena = + MicroContextRequestScratchBufferInArena; + context_.GetExternalContext = MicroContextGetExternalContext; + context_.AllocatePersistentBuffer = MicroContextAllocatePersistentBuffer; + context_.GetScratchBuffer = MicroContextGetScratchBuffer; initialization_status_ = kTfLiteOk; } @@ -192,27 +198,15 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { TF_LITE_ENSURE_STATUS(PrepareNodeAndRegistrationDataFromFlatbuffer()); - // Only allow AllocatePersistentBuffer in Init stage. - context_.AllocatePersistentBuffer = MicroContextAllocatePersistentBuffer; - context_.RequestScratchBufferInArena = nullptr; - context_.GetScratchBuffer = nullptr; - context_.GetExternalContext = nullptr; + micro_context_.SetInterpreterState(MicroContext::InterpreterState::kInit); TF_LITE_ENSURE_STATUS(graph_.InitSubgraphs()); - // Both AllocatePersistentBuffer and RequestScratchBufferInArena is - // available in Prepare stage. 
- context_.RequestScratchBufferInArena = - MicroContextRequestScratchBufferInArena; - // external_context become available in Prepare stage. - context_.GetExternalContext = MicroContextGetExternalContext; + micro_context_.SetInterpreterState(MicroContext::InterpreterState::kPrepare); TF_LITE_ENSURE_STATUS(graph_.PrepareSubgraphs()); - // Prepare is done, we're ready for Invoke. Memory allocation is no longer - // allowed. Kernels can only fetch scratch buffers via GetScratchBuffer. - context_.AllocatePersistentBuffer = nullptr; - context_.RequestScratchBufferInArena = nullptr; - context_.GetScratchBuffer = MicroContextGetScratchBuffer; + micro_context_.SetInterpreterState( + MicroContext::InterpreterState::kMemoryPlanning); TF_LITE_ENSURE_OK(&context_, allocator_.FinishModelAllocation( model_, graph_.GetAllocations(), @@ -267,6 +261,7 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { TF_LITE_ENSURE_STATUS(Reset()); tensors_allocated_ = true; + micro_context_.SetInterpreterState(MicroContext::InterpreterState::kInvoke); return kTfLiteOk; } diff --git a/src/tensorflow/lite/micro/micro_mutable_op_resolver.h b/src/tensorflow/lite/micro/micro_mutable_op_resolver.h index 2898d193..c9a2c8fd 100644 --- a/src/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/src/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -37,7 +37,7 @@ limitations under the License. #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { -TfLiteRegistration* Register_DETECTION_POSTPROCESS(); +TfLiteRegistration_V1* Register_DETECTION_POSTPROCESS(); template class MicroMutableOpResolver : public MicroOpResolver { @@ -46,11 +46,12 @@ class MicroMutableOpResolver : public MicroOpResolver { explicit MicroMutableOpResolver() {} - const TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override { + const TfLiteRegistration_V1* FindOp( + tflite::BuiltinOperator op) const override { if (op == BuiltinOperator_CUSTOM) return nullptr; for (unsigned int i = 0; i < registrations_len_; ++i) { - const TfLiteRegistration& registration = registrations_[i]; + const TfLiteRegistration_V1& registration = registrations_[i]; if (registration.builtin_code == op) { return ®istration; } @@ -58,9 +59,9 @@ class MicroMutableOpResolver : public MicroOpResolver { return nullptr; } - const TfLiteRegistration* FindOp(const char* op) const override { + const TfLiteRegistration_V1* FindOp(const char* op) const override { for (unsigned int i = 0; i < registrations_len_; ++i) { - const TfLiteRegistration& registration = registrations_[i]; + const TfLiteRegistration_V1& registration = registrations_[i]; if ((registration.builtin_code == BuiltinOperator_CUSTOM) && (strcmp(registration.custom_name, op) == 0)) { return ®istration; @@ -84,7 +85,8 @@ class MicroMutableOpResolver : public MicroOpResolver { // function is called again for a previously added Custom Operator, the // MicroOpResolver will be unchanged and this function will return // kTfLiteError. 
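// Aside: with these changes the resolver stores and hands out
// TfLiteRegistration_V1 throughout. A hedged end-to-end sketch of typical
// application wiring (the op selection and the MyPackerOp helper are
// hypothetical; compare the PackerOp test helper further down in this diff):
tflite::MicroMutableOpResolver<3> op_resolver;
op_resolver.AddReshape();
op_resolver.AddSoftmax();  // defaults to the generic Register_SOFTMAX()
op_resolver.AddCustom("my-packer-op", MyPackerOp::GetMutableRegistration());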
- TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration) { + TfLiteStatus AddCustom(const char* name, + TfLiteRegistration_V1* registration) { if (registrations_len_ >= tOpCount) { MicroPrintf( "Couldn't register custom op '%s', resolver size is too" @@ -99,7 +101,8 @@ class MicroMutableOpResolver : public MicroOpResolver { return kTfLiteError; } - TfLiteRegistration* new_registration = ®istrations_[registrations_len_]; + TfLiteRegistration_V1* new_registration = + ®istrations_[registrations_len_]; registrations_len_ += 1; *new_registration = *registration; @@ -112,11 +115,11 @@ class MicroMutableOpResolver : public MicroOpResolver { // MicroMutableOpResolver object. TfLiteStatus AddAbs() { - return AddBuiltin(BuiltinOperator_ABS, tflite::ops::micro::Register_ABS(), - ParseAbs); + return AddBuiltin(BuiltinOperator_ABS, Register_ABS(), ParseAbs); } - TfLiteStatus AddAdd(const TfLiteRegistration& registration = Register_ADD()) { + TfLiteStatus AddAdd( + const TfLiteRegistration_V1& registration = Register_ADD()) { return AddBuiltin(BuiltinOperator_ADD, registration, ParseAdd); } @@ -139,7 +142,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddAveragePool2D( - const TfLiteRegistration& registration = Register_AVERAGE_POOL_2D()) { + const TfLiteRegistration_V1& registration = Register_AVERAGE_POOL_2D()) { return AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, registration, ParsePool); } @@ -181,13 +184,12 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddConv2D( - const TfLiteRegistration& registration = Register_CONV_2D()) { + const TfLiteRegistration_V1& registration = Register_CONV_2D()) { return AddBuiltin(BuiltinOperator_CONV_2D, registration, ParseConv2D); } TfLiteStatus AddCos() { - return AddBuiltin(BuiltinOperator_COS, tflite::ops::micro::Register_COS(), - ParseCos); + return AddBuiltin(BuiltinOperator_COS, tflite::Register_COS(), ParseCos); } TfLiteStatus AddCumSum() { @@ -200,8 +202,8 @@ class MicroMutableOpResolver : public MicroOpResolver { tflite::Register_DEPTH_TO_SPACE(), ParseDepthToSpace); } - TfLiteStatus AddDepthwiseConv2D( - const TfLiteRegistration& registration = Register_DEPTHWISE_CONV_2D()) { + TfLiteStatus AddDepthwiseConv2D(const TfLiteRegistration_V1& registration = + Register_DEPTHWISE_CONV_2D()) { return AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, registration, ParseDepthwiseConv2D); } @@ -229,7 +231,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddEthosU() { - TfLiteRegistration* registration = tflite::Register_ETHOSU(); + TfLiteRegistration_V1* registration = tflite::Register_ETHOSU(); if (registration) { return AddCustom(tflite::GetString_ETHOSU(), registration); } @@ -264,7 +266,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddFullyConnected( - const TfLiteRegistration& registration = Register_FULLY_CONNECTED()) { + const TfLiteRegistration_V1& registration = Register_FULLY_CONNECTED()) { return AddBuiltin(BuiltinOperator_FULLY_CONNECTED, registration, ParseFullyConnected); } @@ -323,8 +325,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddLog() { - return AddBuiltin(BuiltinOperator_LOG, tflite::ops::micro::Register_LOG(), - ParseLog); + return AddBuiltin(BuiltinOperator_LOG, Register_LOG(), ParseLog); } TfLiteStatus AddLogicalAnd() { @@ -333,8 +334,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddLogicalNot() { - return AddBuiltin(BuiltinOperator_LOGICAL_NOT, - 
tflite::ops::micro::Register_LOGICAL_NOT(), + return AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT(), ParseLogicalNot); } @@ -359,7 +359,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddMaxPool2D( - const TfLiteRegistration& registration = Register_MAX_POOL_2D()) { + const TfLiteRegistration_V1& registration = Register_MAX_POOL_2D()) { return AddBuiltin(BuiltinOperator_MAX_POOL_2D, registration, ParsePool); } @@ -377,7 +377,8 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseMinimum); } - TfLiteStatus AddMul(const TfLiteRegistration& registration = Register_MUL()) { + TfLiteStatus AddMul( + const TfLiteRegistration_V1& registration = Register_MUL()) { return AddBuiltin(BuiltinOperator_MUL, registration, ParseMul); } @@ -394,7 +395,8 @@ class MicroMutableOpResolver : public MicroOpResolver { return AddBuiltin(BuiltinOperator_PACK, Register_PACK(), ParsePack); } - TfLiteStatus AddPad(const TfLiteRegistration& registration = Register_PAD()) { + TfLiteStatus AddPad( + const TfLiteRegistration_V1& registration = Register_PAD()) { return AddBuiltin(BuiltinOperator_PAD, registration, ParsePad); } @@ -453,8 +455,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddRsqrt() { - return AddBuiltin(BuiltinOperator_RSQRT, - tflite::ops::micro::Register_RSQRT(), ParseRsqrt); + return AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT(), ParseRsqrt); } TfLiteStatus AddSelectV2() { @@ -467,8 +468,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddSin() { - return AddBuiltin(BuiltinOperator_SIN, tflite::ops::micro::Register_SIN(), - ParseSin); + return AddBuiltin(BuiltinOperator_SIN, Register_SIN(), ParseSin); } TfLiteStatus AddSlice() { @@ -476,7 +476,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddSoftmax( - const TfLiteRegistration& registration = Register_SOFTMAX()) { + const TfLiteRegistration_V1& registration = Register_SOFTMAX()) { return AddBuiltin(BuiltinOperator_SOFTMAX, registration, ParseSoftmax); } @@ -504,13 +504,11 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddSqrt() { - return AddBuiltin(BuiltinOperator_SQRT, tflite::ops::micro::Register_SQRT(), - ParseSqrt); + return AddBuiltin(BuiltinOperator_SQRT, Register_SQRT(), ParseSqrt); } TfLiteStatus AddSquare() { - return AddBuiltin(BuiltinOperator_SQUARE, - tflite::ops::micro::Register_SQUARE(), ParseSquare); + return AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE(), ParseSquare); } TfLiteStatus AddSquaredDifference() { @@ -533,7 +531,7 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddSvdf( - const TfLiteRegistration& registration = Register_SVDF()) { + const TfLiteRegistration_V1& registration = Register_SVDF()) { return AddBuiltin(BuiltinOperator_SVDF, registration, ParseSvdf); } @@ -555,10 +553,11 @@ class MicroMutableOpResolver : public MicroOpResolver { return AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK(), ParseUnpack); } - TfLiteStatus AddUnidirectionalSequenceLSTM() { + TfLiteStatus AddUnidirectionalSequenceLSTM( + const TfLiteRegistration_V1& registration = + Register_UNIDIRECTIONAL_SEQUENCE_LSTM()) { return AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, - Register_UNIDIRECTIONAL_SEQUENCE_LSTM(), - ParseUnidirectionalSequenceLSTM); + registration, ParseUnidirectionalSequenceLSTM); } TfLiteStatus AddVarHandle() { @@ -579,7 +578,7 @@ class MicroMutableOpResolver : public MicroOpResolver { private: TfLiteStatus 
AddBuiltin(tflite::BuiltinOperator op, - const TfLiteRegistration& registration, + const TfLiteRegistration_V1& registration, TfLiteBridgeBuiltinParseFunction parser) { if (op == BuiltinOperator_CUSTOM) { MicroPrintf("Invalid parameter BuiltinOperator_CUSTOM to the "); @@ -612,7 +611,7 @@ class MicroMutableOpResolver : public MicroOpResolver { return kTfLiteOk; } - TfLiteRegistration registrations_[tOpCount]; + TfLiteRegistration_V1 registrations_[tOpCount]; unsigned int registrations_len_ = 0; // Arrays (and counter) to store the builtin codes and their corresponding diff --git a/src/tensorflow/lite/micro/micro_op_resolver.cpp b/src/tensorflow/lite/micro/micro_op_resolver.cpp new file mode 100644 index 00000000..7463e5af --- /dev/null +++ b/src/tensorflow/lite/micro/micro_op_resolver.cpp @@ -0,0 +1,55 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/micro_op_resolver.h" + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/schema/schema_utils.h" + +namespace tflite { + +TfLiteStatus GetRegistrationFromOpCode( + const OperatorCode* opcode, const MicroOpResolver& op_resolver, + const TfLiteRegistration_V1** registration) { + TfLiteStatus status = kTfLiteOk; + *registration = nullptr; + auto builtin_code = GetBuiltinCode(opcode); + + if (builtin_code > BuiltinOperator_MAX) { + MicroPrintf("Op builtin_code out of range: %d.", builtin_code); + status = kTfLiteError; + } else if (builtin_code != BuiltinOperator_CUSTOM) { + *registration = op_resolver.FindOp(builtin_code); + if (*registration == nullptr) { + MicroPrintf("Didn't find op for builtin opcode '%s'", + EnumNameBuiltinOperator(builtin_code)); + status = kTfLiteError; + } + } else if (!opcode->custom_code()) { + MicroPrintf("Operator with CUSTOM builtin_code has no custom_code.\n"); + status = kTfLiteError; + } else { + const char* name = opcode->custom_code()->c_str(); + *registration = op_resolver.FindOp(name); + if (*registration == nullptr) { + // Do not report error for unresolved custom op, we do the final check + // while preparing ops. + status = kTfLiteError; + } + } + return status; +} +} // namespace tflite diff --git a/src/tensorflow/lite/micro/micro_op_resolver.h b/src/tensorflow/lite/micro/micro_op_resolver.h index 02b07313..ed8b10e1 100644 --- a/src/tensorflow/lite/micro/micro_op_resolver.h +++ b/src/tensorflow/lite/micro/micro_op_resolver.h @@ -17,7 +17,6 @@ limitations under the License. 
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/micro/tflite_bridge/flatbuffer_conversions_bridge.h" -#include "tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h" #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { @@ -31,38 +30,32 @@ namespace tflite { // We need an interface class instead of directly using MicroMutableOpResolver // because MicroMutableOpResolver is a class template with the number of // registered Ops as the template parameter. -class MicroOpResolver : public TfLiteBridgeOpResolver { +class MicroOpResolver { public: // Returns the Op registration struct corresponding to the enum code from the // flatbuffer schema. Returns nullptr if the op is not found or if op == // BuiltinOperator_CUSTOM. - virtual const TfLiteRegistration* FindOp(BuiltinOperator op) const = 0; + virtual const TfLiteRegistration_V1* FindOp(BuiltinOperator op) const = 0; // Returns the Op registration struct corresponding to the custom operator by // name. - virtual const TfLiteRegistration* FindOp(const char* op) const = 0; - - // This implementation exists for compatibility with the OpResolver base class - // and disregards the version parameter. - const TfLiteRegistration* FindOp(BuiltinOperator op, - int version) const final { - return FindOp(op); - } - - // This implementation exists for compatibility with the OpResolver base class - // and disregards the version parameter. - const TfLiteRegistration* FindOp(const char* op, int version) const final { - return FindOp(op); - } + virtual const TfLiteRegistration_V1* FindOp(const char* op) const = 0; // Returns the operator specific parsing function for the OpData for a // BuiltinOperator (if registered), else nullptr. virtual TfLiteBridgeBuiltinParseFunction GetOpDataParser( BuiltinOperator op) const = 0; - ~MicroOpResolver() override {} + virtual ~MicroOpResolver() {} }; +// Handles the logic for converting between an OperatorCode structure extracted +// from a flatbuffer and information about a registered operator +// implementation. 
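// Aside: a hedged sketch of how a caller consumes the helper declared just
// below (ResolveOp is a hypothetical wrapper; the opcode would come from the
// model's operator_codes() table):
TfLiteStatus ResolveOp(const tflite::OperatorCode* opcode,
                       const tflite::MicroOpResolver& op_resolver) {
  const TfLiteRegistration_V1* registration = nullptr;
  TF_LITE_ENSURE_STATUS(
      tflite::GetRegistrationFromOpCode(opcode, op_resolver, &registration));
  // On success, registration points at the resolver-owned entry for the op.
  return kTfLiteOk;
}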
+TfLiteStatus GetRegistrationFromOpCode( + const OperatorCode* opcode, const MicroOpResolver& op_resolver, + const TfLiteRegistration_V1** registration); + } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_MICRO_OP_RESOLVER_H_ diff --git a/src/tensorflow/lite/micro/micro_profiler.cpp b/src/tensorflow/lite/micro/micro_profiler.cpp index e9eb5e54..c3f0f4f1 100644 --- a/src/tensorflow/lite/micro/micro_profiler.cpp +++ b/src/tensorflow/lite/micro/micro_profiler.cpp @@ -26,7 +26,11 @@ namespace tflite { uint32_t MicroProfiler::BeginEvent(const char* tag) { if (num_events_ == kMaxEvents) { - num_events_ = 0; + MicroPrintf( + "MicroProfiler errored out because total number of events exceeded the " + "maximum of %d.", + kMaxEvents); + TFLITE_ASSERT_FALSE; } tags_[num_events_] = tag; @@ -52,8 +56,7 @@ void MicroProfiler::Log() const { #if !defined(TF_LITE_STRIP_ERROR_STRINGS) for (int i = 0; i < num_events_; ++i) { uint32_t ticks = end_ticks_[i] - start_ticks_[i]; - MicroPrintf("%s took %" PRIu32 " ticks (%d ms).", tags_[i], ticks, - TicksToMs(ticks)); + MicroPrintf("%s took %u ticks (%d ms).", tags_[i], ticks, TicksToMs(ticks)); } #endif } diff --git a/src/tensorflow/lite/micro/micro_profiler.h b/src/tensorflow/lite/micro/micro_profiler.h index d1136474..1c39ea1c 100644 --- a/src/tensorflow/lite/micro/micro_profiler.h +++ b/src/tensorflow/lite/micro/micro_profiler.h @@ -69,7 +69,7 @@ class MicroProfiler : public MicroProfilerInterface { // Maximum number of events that this class can keep track of. If we call // AddEvent more than kMaxEvents number of times, then the oldest event's // profiling information will be overwritten. - static constexpr int kMaxEvents = 1024; + static constexpr int kMaxEvents = 4096; const char* tags_[kMaxEvents]; uint32_t start_ticks_[kMaxEvents]; diff --git a/src/tensorflow/lite/micro/test_helper_custom_ops.cpp b/src/tensorflow/lite/micro/test_helper_custom_ops.cpp index b87cb5ae..15c450a6 100644 --- a/src/tensorflow/lite/micro/test_helper_custom_ops.cpp +++ b/src/tensorflow/lite/micro/test_helper_custom_ops.cpp @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -36,12 +35,12 @@ limitations under the License. namespace tflite { namespace testing { -const TfLiteRegistration* PackerOp::getRegistration() { +const TfLiteRegistration_V1* PackerOp::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* PackerOp::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* PackerOp::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = Prepare; r.invoke = Invoke; diff --git a/src/tensorflow/lite/micro/test_helper_custom_ops.h b/src/tensorflow/lite/micro/test_helper_custom_ops.h index 9c950fc9..1ae95e4a 100644 --- a/src/tensorflow/lite/micro/test_helper_custom_ops.h +++ b/src/tensorflow/lite/micro/test_helper_custom_ops.h @@ -23,7 +23,6 @@ limitations under the License. 
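A usage note on the MicroProfiler change above: BeginEvent() used to wrap num_events_ back to zero and silently overwrite the oldest entries, and the unchanged header comment still describes that old behavior; with this patch, overflowing the event table is a hard error. A minimal sketch, assuming the upstream BeginEvent/EndEvent/Log API; the function and tag names are illustrative:

#include "tensorflow/lite/micro/micro_profiler.h"

void ProfileOneInvoke(tflite::MicroProfiler& profiler) {
  // Exceeding kMaxEvents (now 4096) triggers MicroPrintf + TFLITE_ASSERT_FALSE
  // instead of a silent wrap-around.
  uint32_t handle = profiler.BeginEvent("invoke");
  // ... timed work goes here ...
  profiler.EndEvent(handle);
  profiler.Log();  // one line per event; ticks now printed with a plain %u
}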
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/portable_type_to_tflitetype.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -33,8 +32,8 @@ namespace testing { class PackerOp { public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static void Free(TfLiteContext* context, void* buffer); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); diff --git a/src/tensorflow/lite/micro/test_helpers.cpp b/src/tensorflow/lite/micro/test_helpers.cpp index bf81f140..d69a8e23 100644 --- a/src/tensorflow/lite/micro/test_helpers.cpp +++ b/src/tensorflow/lite/micro/test_helpers.cpp @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/memory_helpers.h" #include "tensorflow/lite/micro/micro_arena_constants.h" @@ -1431,12 +1430,12 @@ const Model* BuildSimpleMockModelWithNullInputsOutputs() { } // namespace -const TfLiteRegistration* SimpleStatefulOp::getRegistration() { +const TfLiteRegistration_V1* SimpleStatefulOp::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* SimpleStatefulOp::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* SimpleStatefulOp::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = Prepare; r.invoke = Invoke; @@ -1445,10 +1444,6 @@ TfLiteRegistration* SimpleStatefulOp::GetMutableRegistration() { void* SimpleStatefulOp::Init(TfLiteContext* context, const char* buffer, size_t length) { - TFLITE_DCHECK(context->AllocateBufferForEval == nullptr); - TFLITE_DCHECK(context->GetScratchBuffer == nullptr); - TFLITE_DCHECK(context->RequestScratchBufferInArena == nullptr); - void* raw = context->AllocatePersistentBuffer(context, sizeof(OpData)); OpData* data = reinterpret_cast(raw); *data = {}; @@ -1521,12 +1516,12 @@ TfLiteStatus SimpleStatefulOp::Invoke(TfLiteContext* context, return kTfLiteOk; } -const TfLiteRegistration* MockCustom::getRegistration() { +const TfLiteRegistration_V1* MockCustom::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* MockCustom::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* MockCustom::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = Prepare; r.invoke = Invoke; @@ -1569,12 +1564,12 @@ TfLiteStatus MockCustom::Invoke(TfLiteContext* context, TfLiteNode* node) { bool MockCustom::freed_ = false; -const TfLiteRegistration* MultipleInputs::getRegistration() { +const TfLiteRegistration_V1* MultipleInputs::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* MultipleInputs::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* MultipleInputs::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = 
Prepare; r.invoke = Invoke; @@ -1624,12 +1619,12 @@ TfLiteStatus MultipleInputs::Invoke(TfLiteContext* context, TfLiteNode* node) { bool MultipleInputs::freed_ = false; -const TfLiteRegistration* NoOp::getRegistration() { +const TfLiteRegistration_V1* NoOp::getRegistration() { return GetMutableRegistration(); } -TfLiteRegistration* NoOp::GetMutableRegistration() { - static TfLiteRegistration r; +TfLiteRegistration_V1* NoOp::GetMutableRegistration() { + static TfLiteRegistration_V1 r; r.init = Init; r.prepare = Prepare; r.invoke = Invoke; @@ -1658,16 +1653,20 @@ TfLiteStatus NoOp::Invoke(TfLiteContext* context, TfLiteNode* node) { bool NoOp::freed_ = false; -AllOpsResolver GetOpResolver() { - AllOpsResolver op_resolver; - op_resolver.AddCustom("mock_custom", MockCustom::GetMutableRegistration()); - op_resolver.AddCustom("simple_stateful_op", - SimpleStatefulOp::GetMutableRegistration()); - op_resolver.AddCustom("multiple_inputs_op", - MultipleInputs::GetMutableRegistration()); - op_resolver.AddCustom("no_op", NoOp::GetMutableRegistration()); - op_resolver.AddCustom("custom_packer_op", PackerOp::GetMutableRegistration()); - return op_resolver; +TfLiteStatus GetTestingOpResolver( + tflite::testing::TestingOpResolver& op_resolver) { + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "mock_custom", MockCustom::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "simple_stateful_op", SimpleStatefulOp::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "multiple_inputs_op", MultipleInputs::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS( + op_resolver.AddCustom("no_op", NoOp::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "custom_packer_op", PackerOp::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddIf()); + return kTfLiteOk; } const Model* GetModelWithUnusedInputs() { diff --git a/src/tensorflow/lite/micro/test_helpers.h b/src/tensorflow/lite/micro/test_helpers.h index 728a0bbf..20dbc090 100644 --- a/src/tensorflow/lite/micro/test_helpers.h +++ b/src/tensorflow/lite/micro/test_helpers.h @@ -23,7 +23,7 @@ limitations under the License. 
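To show the shape of the new test setup (a sketch, not from the patch; InitTestResolver is an illustrative name): instead of receiving an AllOpsResolver by value, tests now populate a caller-owned, fixed-capacity resolver and must check the returned status.

#include "tensorflow/lite/micro/test_helpers.h"

TfLiteStatus InitTestResolver(tflite::testing::TestingOpResolver& resolver) {
  // Registers the five custom test ops plus the builtin IF kernel; a failed
  // AddCustom/AddIf (e.g. resolver capacity exhausted) propagates as a
  // status instead of being silently ignored as in the old GetOpResolver().
  return tflite::testing::GetTestingOpResolver(resolver);
}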
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/micro/all_ops_resolver.h" +#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/portable_type_to_tflitetype.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -32,6 +32,7 @@ namespace tflite { namespace testing { constexpr int kOfflinePlannerHeaderSize = 3; +using TestingOpResolver = tflite::MicroMutableOpResolver<10>; struct NodeConnection_ { std::initializer_list input; @@ -55,8 +56,8 @@ class SimpleStatefulOp { }; public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); static TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node); @@ -64,8 +65,8 @@ class SimpleStatefulOp { class MockCustom { public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static void Free(TfLiteContext* context, void* buffer); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); @@ -78,8 +79,8 @@ class MockCustom { // the sum of the inputs. class MultipleInputs { public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static void Free(TfLiteContext* context, void* buffer); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); @@ -91,8 +92,8 @@ class MultipleInputs { // A simple no-op operator. class NoOp { public: - static const TfLiteRegistration* getRegistration(); - static TfLiteRegistration* GetMutableRegistration(); + static const TfLiteRegistration_V1* getRegistration(); + static TfLiteRegistration_V1* GetMutableRegistration(); static void* Init(TfLiteContext* context, const char* buffer, size_t length); static void Free(TfLiteContext* context, void* buffer); static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); @@ -102,7 +103,7 @@ class NoOp { }; // Returns an Op Resolver that can be used in the testing code. -AllOpsResolver GetOpResolver(); +TfLiteStatus GetTestingOpResolver(TestingOpResolver& op_resolver); // Returns a simple example flatbuffer TensorFlow Lite model. Contains 1 input, // 1 layer of weights, 1 output Tensor, and 1 operator. 
@@ -216,7 +217,6 @@ TfLiteTensor CreateTensor(const T* data, TfLiteIntArray* dims, result.is_variable = is_variable; result.allocation_type = kTfLiteMemNone; result.data.data = const_cast<T*>(data); - result.quantization = {kTfLiteAffineQuantization, nullptr}; result.bytes = ElementCount(*dims) * sizeof(T); result.data.data = const_cast<T*>(data); diff --git a/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.cpp b/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.cpp deleted file mode 100644 index b8f3eb08..00000000 --- a/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h" - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/op_resolver.h" -#include "tensorflow/lite/micro/tflite_bridge/micro_error_reporter.h" -#include "tensorflow/lite/schema/schema_utils.h" - -namespace tflite { - -TfLiteStatus GetRegistrationFromOpCode( - const OperatorCode* opcode, const OpResolver& op_resolver, - const TfLiteRegistration** registration) { - return GetRegistrationFromOpCode( - opcode, op_resolver, tflite::GetMicroErrorReporter(), registration); -} -} // namespace tflite diff --git a/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h b/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h deleted file mode 100644 index 252df6e8..00000000 --- a/src/tensorflow/lite/micro/tflite_bridge/op_resolver_bridge.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_MICRO_TFLITE_BRIDGE_OP_RESOLVER_BRIDGE_H_ -#define TENSORFLOW_LITE_MICRO_TFLITE_BRIDGE_OP_RESOLVER_BRIDGE_H_ - -#include "tensorflow/lite/c/c_api_types.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/op_resolver.h" - -namespace tflite { - -// Forward declaration of the classes and structs used here. -struct OperatorCode; - -using TfLiteBridgeOpResolver = OpResolver; - -// Handles the logic for converting between an OperatorCode structure extracted -// from a flatbuffer and information about a registered operator -// implementation.
-TfLiteStatus GetRegistrationFromOpCode(const OperatorCode* opcode, - const OpResolver& op_resolver, - const TfLiteRegistration** registration); - -} // namespace tflite - -#endif // TENSORFLOW_LITE_MICRO_TFLITE_BRIDGE_OP_RESOLVER_BRIDGE_H_ diff --git a/src/tensorflow/lite/schema/schema_generated.h b/src/tensorflow/lite/schema/schema_generated.h index a475fcd2..856e15dd 100644 --- a/src/tensorflow/lite/schema/schema_generated.h +++ b/src/tensorflow/lite/schema/schema_generated.h @@ -543,6 +543,18 @@ struct SignOptions; struct SignOptionsBuilder; struct SignOptionsT; +struct BitcastOptions; +struct BitcastOptionsBuilder; +struct BitcastOptionsT; + +struct BitwiseXorOptions; +struct BitwiseXorOptionsBuilder; +struct BitwiseXorOptionsT; + +struct RightShiftOptions; +struct RightShiftOptionsBuilder; +struct RightShiftOptionsT; + struct OperatorCode; struct OperatorCodeBuilder; struct OperatorCodeT; @@ -1059,11 +1071,14 @@ enum BuiltinOperator : int32_t { BuiltinOperator_ATAN2 = 156, BuiltinOperator_UNSORTED_SEGMENT_MIN = 157, BuiltinOperator_SIGN = 158, + BuiltinOperator_BITCAST = 159, + BuiltinOperator_BITWISE_XOR = 160, + BuiltinOperator_RIGHT_SHIFT = 161, BuiltinOperator_MIN = BuiltinOperator_ADD, - BuiltinOperator_MAX = BuiltinOperator_SIGN + BuiltinOperator_MAX = BuiltinOperator_RIGHT_SHIFT }; -inline const BuiltinOperator (&EnumValuesBuiltinOperator())[159] { +inline const BuiltinOperator (&EnumValuesBuiltinOperator())[162] { static const BuiltinOperator values[] = { BuiltinOperator_ADD, BuiltinOperator_AVERAGE_POOL_2D, @@ -1223,13 +1238,16 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[159] { BuiltinOperator_UNSORTED_SEGMENT_SUM, BuiltinOperator_ATAN2, BuiltinOperator_UNSORTED_SEGMENT_MIN, - BuiltinOperator_SIGN + BuiltinOperator_SIGN, + BuiltinOperator_BITCAST, + BuiltinOperator_BITWISE_XOR, + BuiltinOperator_RIGHT_SHIFT }; return values; } inline const char * const *EnumNamesBuiltinOperator() { - static const char * const names[160] = { + static const char * const names[163] = { "ADD", "AVERAGE_POOL_2D", "CONCATENATION", @@ -1389,13 +1407,16 @@ inline const char * const *EnumNamesBuiltinOperator() { "ATAN2", "UNSORTED_SEGMENT_MIN", "SIGN", + "BITCAST", + "BITWISE_XOR", + "RIGHT_SHIFT", nullptr }; return names; } inline const char *EnumNameBuiltinOperator(BuiltinOperator e) { - if (flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_SIGN)) return ""; + if (flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_RIGHT_SHIFT)) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOperator()[index]; } @@ -1525,11 +1546,14 @@ enum BuiltinOptions : uint8_t { BuiltinOptions_UnsortedSegmentSumOptions = 121, BuiltinOptions_ATan2Options = 122, BuiltinOptions_SignOptions = 123, + BuiltinOptions_BitcastOptions = 124, + BuiltinOptions_BitwiseXorOptions = 125, + BuiltinOptions_RightShiftOptions = 126, BuiltinOptions_MIN = BuiltinOptions_NONE, - BuiltinOptions_MAX = BuiltinOptions_SignOptions + BuiltinOptions_MAX = BuiltinOptions_RightShiftOptions }; -inline const BuiltinOptions (&EnumValuesBuiltinOptions())[124] { +inline const BuiltinOptions (&EnumValuesBuiltinOptions())[127] { static const BuiltinOptions values[] = { BuiltinOptions_NONE, BuiltinOptions_Conv2DOptions, @@ -1654,13 +1678,16 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[124] { BuiltinOptions_UnsortedSegmentMinOptions, BuiltinOptions_UnsortedSegmentSumOptions, BuiltinOptions_ATan2Options, - BuiltinOptions_SignOptions + BuiltinOptions_SignOptions, + 
BuiltinOptions_BitcastOptions, + BuiltinOptions_BitwiseXorOptions, + BuiltinOptions_RightShiftOptions }; return values; } inline const char * const *EnumNamesBuiltinOptions() { - static const char * const names[125] = { + static const char * const names[128] = { "NONE", "Conv2DOptions", "DepthwiseConv2DOptions", @@ -1785,13 +1812,16 @@ inline const char * const *EnumNamesBuiltinOptions() { "UnsortedSegmentSumOptions", "ATan2Options", "SignOptions", + "BitcastOptions", + "BitwiseXorOptions", + "RightShiftOptions", nullptr }; return names; } inline const char *EnumNameBuiltinOptions(BuiltinOptions e) { - if (flatbuffers::IsOutRange(e, BuiltinOptions_NONE, BuiltinOptions_SignOptions)) return ""; + if (flatbuffers::IsOutRange(e, BuiltinOptions_NONE, BuiltinOptions_RightShiftOptions)) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOptions()[index]; } @@ -2292,6 +2322,18 @@ template<> struct BuiltinOptionsTraits { static const BuiltinOptions enum_value = BuiltinOptions_SignOptions; }; +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitcastOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitwiseXorOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RightShiftOptions; +}; + template struct BuiltinOptionsUnionTraits { static const BuiltinOptions enum_value = BuiltinOptions_NONE; }; @@ -2788,6 +2830,18 @@ template<> struct BuiltinOptionsUnionTraits { static const BuiltinOptions enum_value = BuiltinOptions_SignOptions; }; +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitcastOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitwiseXorOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RightShiftOptions; +}; + struct BuiltinOptionsUnion { BuiltinOptions type; void *value; @@ -3802,6 +3856,30 @@ struct BuiltinOptionsUnion { return type == BuiltinOptions_SignOptions ? reinterpret_cast(value) : nullptr; } + tflite::BitcastOptionsT *AsBitcastOptions() { + return type == BuiltinOptions_BitcastOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BitcastOptionsT *AsBitcastOptions() const { + return type == BuiltinOptions_BitcastOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BitwiseXorOptionsT *AsBitwiseXorOptions() { + return type == BuiltinOptions_BitwiseXorOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BitwiseXorOptionsT *AsBitwiseXorOptions() const { + return type == BuiltinOptions_BitwiseXorOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::RightShiftOptionsT *AsRightShiftOptions() { + return type == BuiltinOptions_RightShiftOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::RightShiftOptionsT *AsRightShiftOptions() const { + return type == BuiltinOptions_RightShiftOptions ? 
+ reinterpret_cast(value) : nullptr; + } }; bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type); @@ -11556,6 +11634,123 @@ inline flatbuffers::Offset CreateSignOptions( flatbuffers::Offset CreateSignOptions(flatbuffers::FlatBufferBuilder &_fbb, const SignOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct BitcastOptionsT : public flatbuffers::NativeTable { + typedef BitcastOptions TableType; +}; + +struct BitcastOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef BitcastOptionsT NativeTableType; + typedef BitcastOptionsBuilder Builder; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + BitcastOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BitcastOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BitcastOptionsBuilder { + typedef BitcastOptions Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit BitcastOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateBitcastOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + BitcastOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateBitcastOptions(flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BitwiseXorOptionsT : public flatbuffers::NativeTable { + typedef BitwiseXorOptions TableType; +}; + +struct BitwiseXorOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef BitwiseXorOptionsT NativeTableType; + typedef BitwiseXorOptionsBuilder Builder; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + BitwiseXorOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BitwiseXorOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BitwiseXorOptionsBuilder { + typedef BitwiseXorOptions Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit BitwiseXorOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateBitwiseXorOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + BitwiseXorOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateBitwiseXorOptions(flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct RightShiftOptionsT : public flatbuffers::NativeTable { + typedef RightShiftOptions TableType; +}; + +struct RightShiftOptions FLATBUFFERS_FINAL_CLASS : private 
flatbuffers::Table { + typedef RightShiftOptionsT NativeTableType; + typedef RightShiftOptionsBuilder Builder; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + RightShiftOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(RightShiftOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct RightShiftOptionsBuilder { + typedef RightShiftOptions Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit RightShiftOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateRightShiftOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + RightShiftOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateRightShiftOptions(flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct OperatorCodeT : public flatbuffers::NativeTable { typedef OperatorCode TableType; int8_t deprecated_builtin_code = 0; @@ -12068,6 +12263,15 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const tflite::SignOptions *builtin_options_as_SignOptions() const { return builtin_options_type() == tflite::BuiltinOptions_SignOptions ? static_cast(builtin_options()) : nullptr; } + const tflite::BitcastOptions *builtin_options_as_BitcastOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BitcastOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::BitwiseXorOptions *builtin_options_as_BitwiseXorOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BitwiseXorOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::RightShiftOptions *builtin_options_as_RightShiftOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_RightShiftOptions ? 
static_cast(builtin_options()) : nullptr; + } const flatbuffers::Vector *custom_options() const { return GetPointer *>(VT_CUSTOM_OPTIONS); } @@ -12596,6 +12800,18 @@ template<> inline const tflite::SignOptions *Operator::builtin_options_as inline const tflite::BitcastOptions *Operator::builtin_options_as() const { + return builtin_options_as_BitcastOptions(); +} + +template<> inline const tflite::BitwiseXorOptions *Operator::builtin_options_as() const { + return builtin_options_as_BitwiseXorOptions(); +} + +template<> inline const tflite::RightShiftOptions *Operator::builtin_options_as() const { + return builtin_options_as_RightShiftOptions(); +} + struct OperatorBuilder { typedef Operator Table; flatbuffers::FlatBufferBuilder &fbb_; @@ -16931,6 +17147,75 @@ inline flatbuffers::Offset CreateSignOptions(flatbuffers::FlatBuffe _fbb); } +inline BitcastOptionsT *BitcastOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BitcastOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BitcastOptions::UnPackTo(BitcastOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset BitcastOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateBitcastOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateBitcastOptions(flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BitcastOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateBitcastOptions( + _fbb); +} + +inline BitwiseXorOptionsT *BitwiseXorOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BitwiseXorOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BitwiseXorOptions::UnPackTo(BitwiseXorOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset BitwiseXorOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateBitwiseXorOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateBitwiseXorOptions(flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BitwiseXorOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateBitwiseXorOptions( + _fbb); +} + +inline RightShiftOptionsT *RightShiftOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new RightShiftOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void RightShiftOptions::UnPackTo(RightShiftOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset RightShiftOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateRightShiftOptions(_fbb, 
_o, _rehasher); +} + +inline flatbuffers::Offset CreateRightShiftOptions(flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const RightShiftOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateRightShiftOptions( + _fbb); +} + inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = std::unique_ptr(new OperatorCodeT()); UnPackTo(_o.get(), _resolver); @@ -17966,6 +18251,18 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } default: return true; } } @@ -18477,6 +18774,18 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c auto ptr = reinterpret_cast(obj); return ptr->UnPack(resolver); } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } default: return nullptr; } } @@ -18976,6 +19285,18 @@ inline flatbuffers::Offset BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff auto ptr = reinterpret_cast(value); return CreateSignOptions(_fbb, ptr, _rehasher).Union(); } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(value); + return CreateBitcastOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(value); + return CreateBitwiseXorOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(value); + return CreateRightShiftOptions(_fbb, ptr, _rehasher).Union(); + } default: return 0; } } @@ -19474,6 +19795,18 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) : value = new tflite::SignOptionsT(*reinterpret_cast(u.value)); break; } + case BuiltinOptions_BitcastOptions: { + value = new tflite::BitcastOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BitwiseXorOptions: { + value = new tflite::BitwiseXorOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_RightShiftOptions: { + value = new tflite::RightShiftOptionsT(*reinterpret_cast(u.value)); + break; + } default: break; } @@ -20096,6 +20429,21 @@ inline void BuiltinOptionsUnion::Reset() { delete ptr; break; } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } default: break; } value = nullptr; diff --git a/src/third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h b/src/third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h 
new file mode 100644 index 00000000..e4472639 --- /dev/null +++ b/src/third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h @@ -0,0 +1,308 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_compiler.h + * Description: Generic compiler header + * + * $Date: 31 January 2023 + * $Revision: V.1.1.0 + * + * Target : Arm(R) M-Profile Architecture + * -------------------------------------------------------------------- */ + +#ifndef ARM_NN_COMPILER_H +#define ARM_NN_COMPILER_H + +/** + * + * @brief Arm C-Language Extension(ACLE) Includes + * + */ + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + + #ifndef __ASM + #define __ASM __asm + #endif + #ifndef __INLINE + #define __INLINE __inline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static __inline + #endif + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline + #endif + #ifndef __RESTRICT + #define __RESTRICT __restrict + #endif + +#elif defined(__ICCARM__) + + #warning IAR support is not tested + #ifndef __ASM + #define __ASM __asm + #endif + #ifndef __INLINE + #define __INLINE inline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline + #endif + #ifndef __FORCEINLINE + #define __FORCEINLINE _Pragma("inline=forced") + #endif + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __FORCEINLINE __STATIC_INLINE + #endif + +#elif defined(_MSC_VER) + + // Build for non Arm Cortex-M processors is not tested or supported. + // Use this section to stub any macros or intrinsics + #warning Unsupported compiler + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE static __forceinline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static __inline + #endif + #ifndef __ALIGNED + #define __ALIGNED(x) __declspec(align(x)) + #endif + +#elif defined(__GNUC__) + + #ifndef __ASM + #define __ASM __asm + #endif + #ifndef __INLINE + #define __INLINE inline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline + #endif + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline + #endif + #ifndef __RESTRICT + #define __RESTRICT __restrict + #endif + +#else + + #error Unsupported compiler. Add support as needed + +#endif + +/** + * + * @brief Compiler specific diagnostic adjustment / fixes if applicable + * + */ + +// Note: __ARM_ARCH is used with M-profile architecture as the target here. 
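To illustrate what the per-compiler sections above buy you, a small sketch that assumes only the macros this header defines; clamp_q7 is not a CMSIS-NN function.

#include <stdint.h>
#include "Internal/arm_nn_compiler.h"

// The same source expands to a forced-inline static function under AC6, IAR
// and GCC, and to the stubbed __forceinline fallback under MSVC.
__STATIC_FORCEINLINE int8_t clamp_q7(int32_t v)
{
    if (v > 127) { return 127; }
    if (v < -128) { return -128; }
    return static_cast<int8_t>(v);
}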
+#if defined(__GNUC__) + #if (__GNUC__ == 12 && (__GNUC_MINOR__ <= 2)) && defined(__ARM_ARCH) + // Workaround for 'Internal Compiler Error' on Arm GNU Toolchain rel 12.2.x + // https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607963.html + #define ARM_GCC_12_2_ICE + #endif +#endif + +#if ((__ARM_FEATURE_MVE & 3) == 3) || (__ARM_FEATURE_MVE & 1) + #include <arm_mve.h> +#endif + +#if defined(__ARM_ARCH) || defined(__ARM_ACLE) + #include <arm_acle.h> +#endif + +/** + * + * @brief ACLE and Intrinsics + * + */ + +// Note: Keep the __GNUC__ checks, used to detect GCC, at the end, +// as __GNUC__ is defined by non-GCC compilers as well + +/* Common intrinsics for all architectures */ +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__) + #define CLZ __clz +#elif defined(__GNUC__) +/** + \brief Count leading zeros + \details Counts the number of leading zeros of a data value. + \param [in] value Value to count the leading zeros + \return number of leading zeros in value + */ +__STATIC_FORCEINLINE uint8_t CLZ(uint32_t value) +{ + /* Even though __builtin_clz produces a CLZ instruction on ARM, formally + __builtin_clz(0) is undefined behaviour, so handle this case specially. + This guarantees Arm-compatible results if compiling on a non-Arm + target, and ensures the compiler doesn't decide to activate any + optimisations using the logic "value was passed to __builtin_clz, so it + is non-zero". + ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a + single CLZ instruction. + */ + if (value == 0U) + { + return 32U; + } + return __builtin_clz(value); +} +#endif + +// ACLE intrinsics under groups __ARM_FEATURE_QBIT, __ARM_FEATURE_DSP, __ARM_FEATURE_SAT, __ARM_FEATURE_SIMD32 + +// Note: Just __ARM_FEATURE_DSP is checked to collect all intrinsics from the above mentioned groups + +#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) + + // Common intrinsics + #define SMLABB __smlabb + #define SMLATT __smlatt + #define QADD __qadd + #define QSUB8 __qsub8 + #define QSUB16 __qsub16 + #define SADD16 __sadd16 + + // Compiler specific variants of intrinsics.
Create a new section or file for IAR if needed + #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__) + + #define SMULBB __smulbb + #define SMULTT __smultt + #define ROR __ror + #define SXTB16 __sxtb16 + #define SXTAB16 __sxtab16 + #define SXTB16_RORn(ARG1, ARG2) SXTB16(ROR(ARG1, ARG2)) + #define SXTAB16_RORn(ARG1, ARG2, ARG3) SXTAB16(ARG1, ROR(ARG2, ARG3)) + #define SMLAD __smlad + // PKH translates into pkh on AC6 + #define PKHBT(ARG1, ARG2, ARG3) \ + (((((uint32_t)(ARG1))) & 0x0000FFFFUL) | ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)) + #define PKHTB(ARG1, ARG2, ARG3) \ + (((((uint32_t)(ARG1))) & 0xFFFF0000UL) | ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)) + + #elif defined(__GNUC__) + + #define PKHBT(ARG1, ARG2, ARG3) \ + __extension__({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + __ASM("pkhbt %0, %1, %2, lsl %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \ + __RES; \ + }) + #define PKHTB(ARG1, ARG2, ARG3) \ + __extension__({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + if (ARG3 == 0) \ + __ASM("pkhtb %0, %1, %2" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2)); \ + else \ + __ASM("pkhtb %0, %1, %2, asr %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \ + __RES; \ + }) + +__STATIC_FORCEINLINE uint32_t SXTAB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM("sxtab16 %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2)); + return (result); +} + +__STATIC_FORCEINLINE uint32_t SXTB16(uint32_t op1) +{ + uint32_t result; + + __ASM("sxtb16 %0, %1" : "=r"(result) : "r"(op1)); + return (result); +} + +// __smlad is defined by GCC, but results in a performance drop(Tested on Arm GNU Toolchain version 11.x and 12.x) +__STATIC_FORCEINLINE uint32_t SMLAD(uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM volatile("smlad %0, %1, %2, %3" : "=r"(result) : "r"(op1), "r"(op2), "r"(op3)); + return (result); +} + +__STATIC_FORCEINLINE uint32_t ROR(uint32_t op1, uint32_t op2) +{ + op2 %= 32U; + if (op2 == 0U) + { + return op1; + } + return (op1 >> op2) | (op1 << (32U - op2)); +} + +__STATIC_FORCEINLINE uint32_t SXTB16_RORn(uint32_t op1, uint32_t rotate) +{ + uint32_t result; + if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U))) + { + __ASM volatile("sxtb16 %0, %1, ROR %2" : "=r"(result) : "r"(op1), "i"(rotate)); + } + else + { + result = SXTB16(ROR(op1, rotate)); + } + return result; +} + +__STATIC_FORCEINLINE uint32_t SXTAB16_RORn(uint32_t op1, uint32_t op2, uint32_t rotate) +{ + uint32_t result; + if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U))) + { + __ASM volatile("sxtab16 %0, %1, %2, ROR %3" : "=r"(result) : "r"(op1), "r"(op2), "i"(rotate)); + } + else + { + result = SXTAB16(op1, ROR(op2, rotate)); + } + return result; +} + +// Inline assembly routines for ACLE intrinsics that are not defined by GCC toolchain +__STATIC_FORCEINLINE uint32_t SMULBB(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile("smulbb %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2)); + return (result); +} + +__STATIC_FORCEINLINE uint32_t SMULTT(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile("smultt %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2)); + return (result); +} + #endif + +#endif + +#endif /* #ifndef ARM_NN_COMPILER_H */ \ No newline at end of file diff --git a/src/third_party/cmsis_nn/Include/arm_nn_math_types.h b/src/third_party/cmsis_nn/Include/arm_nn_math_types.h index 3c6c437f..a4c95587 
100644 --- a/src/third_party/cmsis_nn/Include/arm_nn_math_types.h +++ b/src/third_party/cmsis_nn/Include/arm_nn_math_types.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -16,157 +16,63 @@ * limitations under the License. */ -/****************************************************************************** - * @file arm_nn_math_types.h - * @brief Compiler include and basic types - * @version V1.2.0 - * @date 20 June 2022 - * Target Processor: Cortex-M ******************************************************************************/ - -/** Copied from CMSIS/DSP/arm_math_types.h and modified -*/ -#ifndef _ARM_NN_MATH_TYPES_H_ +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_math_types.h + * Description: Compiler include and basic types + * + * $Date: 4 January 2023 + * $Revision: V.1.3.2 + * + * Target : Arm(R) M-Profile Architecture + * -------------------------------------------------------------------- */ -#define _ARM_NN_MATH_TYPES_H_ +#ifndef ARM_NN_MATH_TYPES_H -#ifdef __cplusplus -extern "C" { -#endif +#define ARM_NN_MATH_TYPES_H -#include <float.h> #include <limits.h> -#include <math.h> #include <stddef.h> #include <stdint.h> -/* Integer aliases */ -typedef int8_t q7_t; -typedef int16_t q15_t; -typedef int32_t q31_t; -typedef int64_t q63_t; - -/* Compiler specific diagnostic adjustment */ -#if defined(__CC_ARM) - -#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) - -#elif defined(__GNUC__) - -#elif defined(__ICCARM__) - -#elif defined(__TI_ARM__) - -#elif defined(__CSMC__) - -#elif defined(__TASKING__) - -#elif defined(_MSC_VER) - -#else -#error Unknown compiler -#endif - -/* Included for instrinsics definitions */ -#if defined(_MSC_VER) -#ifndef __STATIC_FORCEINLINE -#define __STATIC_FORCEINLINE static __forceinline -#endif -#ifndef __STATIC_INLINE -#define __STATIC_INLINE static __inline -#endif -#ifndef __ALIGNED -#define __ALIGNED(x) __declspec(align(x)) -#endif - -#elif defined(__GNUC_PYTHON__) -#ifndef __ALIGNED -#define __ALIGNED(x) __attribute__((aligned(x))) -#endif -#ifndef __STATIC_FORCEINLINE -#define __STATIC_FORCEINLINE static inline __attribute__((always_inline)) -#endif -#ifndef __STATIC_INLINE -#define __STATIC_INLINE static inline -#endif - -#else -#include "third_party/cmsis/CMSIS/Core/Include/cmsis_compiler.h" -#endif - -/* evaluate ARM DSP feature */ -#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) -#ifndef ARM_MATH_DSP -#define ARM_MATH_DSP 1 -#endif -#endif - -#if __ARM_FEATURE_MVE -#ifndef ARM_MATH_MVEI -#define ARM_MATH_MVEI -#endif -#endif - -/* Compiler specific diagnostic adjustment */ -#if defined(__CC_ARM) - -#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) - -#elif defined(__GNUC__) -// #pragma GCC diagnostic pop - -#elif defined(__ICCARM__) - -#elif defined(__TI_ARM__) - -#elif defined(__CSMC__) - -#elif defined(__TASKING__) - -#elif defined(_MSC_VER) - -#else -#error Unknown compiler -#endif - -#ifdef __cplusplus -} -#endif - -#if __ARM_FEATURE_MVE -#include <arm_mve.h> -#endif - #ifdef __cplusplus extern "C" { #endif /** - * @brief Add necessary typedefs + * + * @brief Translate architecture feature flags to CMSIS-NN defines + * */ -#define NN_Q31_MAX ((q31_t)(0x7FFFFFFFL)) -#define NN_Q15_MAX ((q15_t)(0x7FFF)) -#define NN_Q7_MAX ((q7_t)(0x7F)) -#define NN_Q31_MIN ((q31_t)(0x80000000L)) -#define NN_Q15_MIN
((q15_t)(0x8000)) -#define NN_Q7_MIN ((q7_t)(0x80)) +// CMSIS-NN uses the same macro names as CMSIS-DSP +#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) + #ifndef ARM_MATH_DSP + #define ARM_MATH_DSP 1 + #endif +#endif + +#if defined(__ARM_FEATURE_MVE) + #ifndef ARM_MATH_MVEI + #define ARM_MATH_MVEI 1 + #endif +#endif /** - * @brief Error status returned by some functions in the library. + * + * @brief Limits macros + * */ -typedef enum -{ - ARM_CMSIS_NN_SUCCESS = 0, /**< No error */ - ARM_CMSIS_NN_ARG_ERROR = -1, /**< One or more arguments are incorrect */ - ARM_CMSIS_NN_NO_IMPL_ERROR = -2, /**< No implementation available */ -} arm_cmsis_nn_status; +#define NN_Q31_MAX ((int32_t)(0x7FFFFFFFL)) +#define NN_Q15_MAX ((int16_t)(0x7FFF)) +#define NN_Q7_MAX ((int8_t)(0x7F)) +#define NN_Q31_MIN ((int32_t)(0x80000000L)) +#define NN_Q15_MIN ((int16_t)(0x8000)) +#define NN_Q7_MIN ((int8_t)(0x80)) #ifdef __cplusplus } #endif -#endif /*ifndef _ARM_NN_MATH_TYPES_H_ */ +#endif /*ifndef ARM_NN_MATH_TYPES_H */ diff --git a/src/third_party/cmsis_nn/Include/arm_nn_tables.h b/src/third_party/cmsis_nn/Include/arm_nn_tables.h index c9090e31..c52b7ed8 100644 --- a/src/third_party/cmsis_nn/Include/arm_nn_tables.h +++ b/src/third_party/cmsis_nn/Include/arm_nn_tables.h @@ -21,8 +21,8 @@ * Title: arm_nn_tables.h * Description: Extern declaration for NN tables * - * $Date: 30. September 2022 - * $Revision: V.2.0.0 + * $Date: 28 October 2022 + * $Revision: V.2.1.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ @@ -37,5 +37,6 @@ * */ +extern const uint16_t sigmoid_table_uint16[256]; #endif /* ARM_NN_TABLES_H */ \ No newline at end of file diff --git a/src/third_party/cmsis_nn/Include/arm_nn_types.h b/src/third_party/cmsis_nn/Include/arm_nn_types.h index 64182bec..915fbec9 100644 --- a/src/third_party/cmsis_nn/Include/arm_nn_types.h +++ b/src/third_party/cmsis_nn/Include/arm_nn_types.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: Public header file to contain the CMSIS-NN structs for the * TensorFlowLite micro compliant functions * - * $Date: 22. 
Februari 2022 - * $Revision: V.2.1.0 + * $Date: 8 February 2023 + * $Revision: V.2.4.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #ifndef _ARM_NN_TYPES_H @@ -33,6 +33,21 @@ #include <stdint.h> +/** Enum for specifying activation function types */ +typedef enum +{ + ARM_SIGMOID = 0, /**< Sigmoid activation function */ + ARM_TANH = 1, /**< Tanh activation function */ +} arm_nn_activation_type; + +/** Function return codes */ +typedef enum +{ + ARM_CMSIS_NN_SUCCESS = 0, /**< No error */ + ARM_CMSIS_NN_ARG_ERROR = -1, /**< One or more arguments are incorrect */ + ARM_CMSIS_NN_NO_IMPL_ERROR = -2, /**< No implementation available */ +} arm_cmsis_nn_status; + /** CMSIS-NN object to contain the width and height of a tile */ typedef struct { @@ -57,6 +72,15 @@ typedef struct int32_t c; /**< Input channels */ } cmsis_nn_dims; +/** CMSIS-NN object to contain LSTM specific input parameters related to dimensions */ +typedef struct +{ + int32_t max_time; + int32_t num_inputs; + int32_t num_batches; + int32_t num_outputs; +} cmsis_nn_lstm_dims; + /** CMSIS-NN object for the per-channel quantization parameters */ typedef struct { @@ -134,4 +158,100 @@ typedef struct const int16_t *one_by_one_lut; } cmsis_nn_softmax_lut_s16; +/** LSTM guard parameters */ +typedef struct +{ + int32_t input_variance; + int32_t forget_variance; + int32_t cell_variance; + int32_t output_variance; +} cmsis_nn_lstm_guard_params; + +/** LSTM scratch buffer container */ +typedef struct +{ + int16_t *input_gate; + int16_t *forget_gate; + int16_t *cell_gate; + int16_t *output_gate; +} cmsis_nn_lstm_context; + +/** Quantized clip value for cell and projection of LSTM input. Zero value means no clipping.
*/ +typedef struct +{ + int16_t cell; + int8_t projection; +} cmsis_nn_lstm_clip_params; + +/** CMSIS-NN object for quantization parameters */ +typedef struct +{ + int32_t multiplier; /**< Multiplier value */ + int32_t shift; /**< Shift value */ +} cmsis_nn_scaling; + +/** CMSIS-NN norm layer coefficients */ +typedef struct +{ + int16_t *input_weight; + int16_t *forget_weight; + int16_t *cell_weight; + int16_t *output_weight; +} cmsis_nn_layer_norm; + +/** Parameters for integer LSTM, as defined in TFLM */ +typedef struct +{ + int32_t time_major; /**< Nonzero (true) if first row of data is timestamps for input */ + cmsis_nn_scaling input_to_input_scaling; + cmsis_nn_scaling input_to_forget_scaling; + cmsis_nn_scaling input_to_cell_scaling; + cmsis_nn_scaling input_to_output_scaling; + cmsis_nn_scaling recurrent_to_input_scaling; + cmsis_nn_scaling recurrent_to_forget_scaling; + cmsis_nn_scaling recurrent_to_cell_scaling; + cmsis_nn_scaling recurrent_to_output_scaling; + cmsis_nn_scaling cell_to_input_scaling; + cmsis_nn_scaling cell_to_forget_scaling; + cmsis_nn_scaling cell_to_output_scaling; + cmsis_nn_scaling projection_scaling; + cmsis_nn_scaling hidden_scaling; + cmsis_nn_scaling layer_norm_input_scaling; /**< layer normalization for input layer */ + cmsis_nn_scaling layer_norm_forget_scaling; /**< layer normalization for forget gate */ + cmsis_nn_scaling layer_norm_cell_scaling; /**< layer normalization for cell */ + cmsis_nn_scaling layer_norm_output_scaling; /**< layer normalization for output layer */ + + int32_t cell_state_shift; + int32_t hidden_offset; + int32_t output_state_offset; + + cmsis_nn_lstm_clip_params clip; + cmsis_nn_lstm_guard_params guard; + cmsis_nn_layer_norm layer_norm; + + /* Effective bias is precalculated as bias + zero_point * weight.
+ Only applicable to when input/output are s8 and weights are s16 */ + const int32_t *i2i_effective_bias; /**< input to input effective bias */ + const int32_t *i2f_effective_bias; /**< input to forget gate effective bias */ + const int32_t *i2c_effective_bias; /**< input to cell effective bias */ + const int32_t *i2o_effective_bias; /**< input to output effective bias */ + + const int32_t *r2i_effective_bias; /**< recurrent gate to input effective bias */ + const int32_t *r2f_effective_bias; /**< recurrent gate to forget gate effective bias */ + const int32_t *r2c_effective_bias; /**< recurrent gate to cell effective bias */ + const int32_t *r2o_effective_bias; /**< recurrent gate to output effective bias */ + + const int32_t *projection_effective_bias; + + /* Not precalculated bias */ + const int32_t *input_gate_bias; + const int32_t *forget_gate_bias; + const int32_t *cell_gate_bias; + const int32_t *output_gate_bias; + + /* Activation min and max */ + cmsis_nn_activation activation; + +} cmsis_nn_lstm_params; + #endif // _ARM_NN_TYPES_H diff --git a/src/third_party/cmsis_nn/Include/arm_nnfunctions.h b/src/third_party/cmsis_nn/Include/arm_nnfunctions.h index 1712da41..f338ca60 100644 --- a/src/third_party/cmsis_nn/Include/arm_nnfunctions.h +++ b/src/third_party/cmsis_nn/Include/arm_nnfunctions.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nnfunctions.h * Description: Public header file for CMSIS NN Library * - * $Date: 30 September 2022 - * $Revision: V.11.0.0 + * $Date: 13 January 2023 + * $Revision: V.11.3.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ /** @@ -32,24 +32,24 @@ * * \tableofcontents * \section Introduction - * + * * * This user manual describes the CMSIS NN software library, * a collection of efficient neural network kernels developed to maximize the * performance and minimize the memory footprint of neural networks on Arm Cortex-M processors. * * The library is divided into a number of functions each covering a specific category: - * - \ref NNConv Convolution Functions - * - \ref Acti "Activation Functions" - * - \ref FC Fully-connected Layer Functions - * - \ref SVDF Layer Functions - * - \ref Pooling Functions - * - \ref Softmax Functions - * - \ref groupElementwise Basic math Functions - * + * - \ref NNConv + * - \ref Acti + * - \ref FC + * - \ref SVDF + * - \ref Pooling + * - \ref Softmax + * - \ref groupElementwise + * - \ref LSTM * * \section Processors Supported Processors - * + * * CMSIS-NN targets Cortex-M processors with typically three different implementations for each function. Each * targets a different group of processors. * - Processors without Single Instruction Multiple Data(SIMD) capability (e.g, Cortex-M0) @@ -59,40 +59,45 @@ * * \section Framework Quantization Specification * The library follows the [int8](https://www.tensorflow.org/lite/performance/quantization_spec) and int16 - * quantization specification of TensorFlow Lite for Microcontrollers. + * quantization specification of TensorFlow Lite for Microcontrollers. 
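The effective-bias comment in cmsis_nn_lstm_params above compresses a useful identity: for a row-major s8 weight matrix W and input zero point z, sum_j W[i][j]*(x[j] + z) = sum_j W[i][j]*x[j] + z*sum_j W[i][j], so the z-dependent term can be folded into the bias offline. A worked sketch under that layout assumption (names are illustrative; CMSIS-NN only consumes the precomputed result):

#include <cstdint>

// effective_bias[i] = bias[i] + zero_point * (sum over row i of weights)
void ComputeEffectiveBias(const int32_t* bias, const int8_t* weights,
                          int32_t zero_point, int n_output, int n_input,
                          int32_t* effective_bias) {
  for (int i = 0; i < n_output; ++i) {
    int32_t acc = (bias != nullptr) ? bias[i] : 0;
    for (int j = 0; j < n_input; ++j) {
      acc += zero_point * weights[i * n_input + j];
    }
    effective_bias[i] = acc;
  }
}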
* \section Overview Block Diagram - * + * * \image html CMSIS-NN-OVERVIEW.PNG * * \section Examples - * + * * * An example image recognition application using TensorFlow Flow Lite for Microcontrollers as an inference engine * and CMSIS-NN as the optimized library can be found in the Examples directory. * * \section Macros Pre-processor Macros - * + * * \subsection Feature Feature flag based * The macros below are defined in a build system based on feature flags for a chosen processor or architecture * input to a compiler. - * These tie in to the classification in \ref Macros. - * - * For a CMSIS-NN file compiled as *armclang -mcpu=cortex-m4 --target=arm-arm-none-eabi -I + * These tie in to the classification in \ref Macros. + * + * For a CMSIS-NN file compiled as *armclang -mcpu=cortex-m4 --target=arm-arm-none-eabi -I * -Ofast -O file.c* , ARM_MATH_DSP is enabled as Cortex-M4 has the DSP extension as a feature. - * + * * - `ARM_MATH_DSP` - Selects code for processors with DSP extension. * * - `ARM_MATH_MVEI` - Selects code for processors which supports MVE instructions. * - * \subsection MiscFlags User Set + * \subsection MiscFlags User Set * - `ARM_MATH_AUTOVECTORIZE` - * Applicable when ARM_MATH_MVEI is active to let the compiler auto vectorize functions, if available, that uses inline + * Applicable when ARM_MATH_MVEI is active to let the compiler auto vectorize functions, if available, that use + inline * assembly. This has to be explicitly set at compile time. * + * \section Inclusive Inclusive Language + * This product conforms to Arm’s inclusive language policy and, to the best of our knowledge, + * does not contain any non-inclusive language. If you find something that concerns you, email terms@arm.com. + * * \section Copyright Copyright Notice - * * - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * */ @@ -115,18 +120,6 @@ extern "C" { #endif -/** - * @brief Struct for specifying activation function types - * - */ -typedef enum -{ - ARM_SIGMOID = 0, - /**< Sigmoid activation function */ - ARM_TANH = 1, - /**< Tanh activation function */ -} arm_nn_activation_type; - /** * @defgroup NNConv Convolution Functions * @@ -175,13 +168,13 @@ arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data); + int8_t *output_data); /** * @brief Get the required buffer size for arm_convolve_wrapper_s8 @@ -194,7 +187,7 @@ arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, * filter dimensions * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] * - * @return The function returns required buffer size(bytes) + * @return The function returns required buffer size(bytes) * */ int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, @@ -202,6 +195,32 @@ int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims); +/** + * @brief Get the required buffer size for arm_convolve_wrapper_s8 for Arm(R) Helium Architecture case.
+ * Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_convolve_wrapper_s8_get_buffer_size().
+ *
+ */
+int32_t arm_convolve_wrapper_s8_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
+/**
+ * @brief Get the required buffer size for arm_convolve_wrapper_s8 for processors with DSP extension.
+ * Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_convolve_wrapper_s8_get_buffer_size().
+ *
+ */
+int32_t arm_convolve_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
 /**
 * @brief s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
 * cmsis-nn to perform the convolution.
@@ -233,16 +252,16 @@ arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_conv_params *conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
- * @brief Get the required buffer size for arm_convolve_wrapper_s16
+ * @brief Get the required buffer size for arm_convolve_wrapper_s16.
 *
 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
 * conv_params->input_offset : Not used
@@ -252,7 +271,7 @@ arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
 * filter dimensions
 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
 *
- * @return The function returns required buffer size(bytes)
+ * @return The function returns required buffer size(bytes)
 *
 */
int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
@@ -260,6 +279,32 @@ int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *con
 const cmsis_nn_dims *filter_dims,
 const cmsis_nn_dims *output_dims);

+/**
+ * @brief Get the required buffer size for arm_convolve_wrapper_s16 for processors with DSP extension.
+ * Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_convolve_wrapper_s16_get_buffer_size().
+ *
+ */
+int32_t arm_convolve_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
+/**
+ * @brief Get the required buffer size for arm_convolve_wrapper_s16 for Arm(R) Helium Architecture case.
+ * Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_convolve_wrapper_s16_get_buffer_size().
+ * + */ +int32_t arm_convolve_wrapper_s16_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims); + /** * @brief Basic s8 convolution function * @param[in, out] ctx Function context that contains the additional buffer if required by the function. @@ -284,21 +329,20 @@ int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *con * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. - * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details. + * 2. Additional memory is required for optimization. Refer to argument 'ctx' for details. * */ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data); + int8_t *output_data); /** * @brief Get the required buffer size for s8 convolution function @@ -306,7 +350,7 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK * are the spatial filter dimensions - * @return The function returns required buffer size(bytes) + * @return The function returns required buffer size(bytes) * */ int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); @@ -335,21 +379,20 @@ int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const c * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7/q15 is used as data type eventhough it is s8/s16 data. It is done so to be consistent with existing APIs. - * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details. + * 2. Additional memory is required for optimization. Refer to argument 'ctx' for details. * */ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data); + int16_t *output_data); /** * @brief Optimized s16 convolution function * @param[in, out] ctx Function context that contains the additional buffer if required by the function. @@ -375,9 +418,8 @@ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7/q15 is used as data type eventhough it is s8/s16 data. It is done so to be consistent with existing APIs. - * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details. - * 4. Implementation supports kernel volumes (filter width * filter height * input channels) < 512. + * 2. Additional memory is required for optimization. 
Refer to argument 'ctx' for details.
 + * 3. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.
 *
 */

@@ -385,13 +427,13 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_conv_params *conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
 * @brief Get the required buffer size for s16 convolution function
 *
@@ -399,7 +441,7 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
 * are the spatial filter dimensions
- * @return The function returns required buffer size(bytes)
+ * @return The function returns required buffer size(bytes)
 *
 */
int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
@@ -442,7 +484,6 @@ int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, c
 * @details
 * - Supported framework : TensorFlow Lite Micro
 * - The following constraints on the arguments apply
- * -# input_dims->c is a multiple of 4
 * -# conv_params->padding.w = conv_params->padding.h = 0
 * -# conv_params->stride.w = conv_params->stride.h = 1
 *
@@ -451,13 +492,13 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
 const cmsis_nn_conv_params *conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
@@ -468,6 +509,46 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
 */
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);

+/**
+ * @brief s8 version for 1x1 convolution with support for non-unity stride values
+ *
+ * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
+ * None is required by this function.
+ * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
+ * Range of conv_params->input_offset : [-127, 128]
+ * Range of conv_params->output_offset : [-128, 127]
+ * @param[in] quant_params Per-channel quantization info.
+ * It contains the multiplier and shift values to be applied to each output channel
+ * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in] input_data Input (activation) data pointer. Data type: int8
+ * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
+ * @param[in] filter_data Filter data pointer. Data type: int8
+ * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
+ * @param[in] bias_data Optional bias data pointer. Data type: int32
+ * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[out] output_data Output data pointer.
Data type: int8
+ *
+ * @return The function returns either
+ * ARM_CMSIS_NN_ARG_ERROR if argument constraints fail, or
+ * ARM_CMSIS_NN_SUCCESS on successful completion.
+ * @details
+ * - Supported framework : TensorFlow Lite Micro
+ * - The following constraints on the arguments apply
+ * -# conv_params->padding.w = conv_params->padding.h = 0
+ *
+ */
+arm_cmsis_nn_status arm_convolve_1x1_s8(const cmsis_nn_context *ctx,
+ const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_per_channel_quant_params *quant_params,
+ const cmsis_nn_dims *input_dims,
+ const int8_t *input_data,
+ const cmsis_nn_dims *filter_dims,
+ const int8_t *filter_data,
+ const cmsis_nn_dims *bias_dims,
+ const int32_t *bias_data,
+ const cmsis_nn_dims *output_dims,
+ int8_t *output_data);
+
 /**
 * @brief 1xn convolution
 *
@@ -509,13 +590,13 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_conv_params *conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get the required additional buffer size for 1xn convolution
 *
@@ -523,7 +604,7 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
 * horizontal spatial filter dimension
- * @return The function returns required buffer size(bytes)
+ * @return The function returns required buffer size(bytes)
 *
 */
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
@@ -561,7 +642,6 @@ int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, c
 * -# arm_depthwise_conv_s8()
 * -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
 * -# arm_depthwise_conv_s8_opt()
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 * - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
 * boundary.
 */
@@ -569,13 +649,13 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
@@ -595,6 +675,32 @@ int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_par
 const cmsis_nn_dims *filter_dims,
 const cmsis_nn_dims *output_dims);

+/**
+ * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for processors with DSP extension.
+ * Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_depthwise_conv_wrapper_s8_get_buffer_size().
+ *
+ */
+int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
+/**
+ * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for Arm(R) Helium Architecture case.
+ * Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_depthwise_conv_wrapper_s8_get_buffer_size().
+ *
+ */
+int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
 /**
 * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
 *
@@ -623,19 +729,18 @@ int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_par
 *
 * @details
 * - Supported framework: TensorFlow Lite
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 */
arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
@@ -665,19 +770,18 @@ arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
 *
 * @details
 * - Supported framework: TensorFlow Lite
- * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs.
 */
arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
 * @brief Wrapper function to pick the right optimized s16 depthwise convolution function
@@ -711,19 +815,18 @@ arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
 * - Picks one of the following functions
 * -# arm_depthwise_conv_s16()
 * -# arm_depthwise_conv_fast_s16() - Cortex-M CPUs with DSP extension only
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
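 * - A minimal invocation sketch (dims, params and data buffers are placeholders assumed to be prepared by the
 *   caller):
 * <pre>
 *     cmsis_nn_context ctx;
 *     ctx.size = arm_depthwise_conv_wrapper_s16_get_buffer_size(&dw_conv_params, &input_dims, &filter_dims,
 *                                                               &output_dims);
 *     ctx.buf = malloc(ctx.size);
 *     arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s16(&ctx, &dw_conv_params, &quant_params, &input_dims,
 *                                                                 input_data, &filter_dims, filter_data, &bias_dims,
 *                                                                 bias_data, &output_dims, output_data);
 *     free(ctx.buf);
 * </pre>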
 */
arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16()
@@ -743,6 +846,32 @@ int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_pa
 const cmsis_nn_dims *filter_dims,
 const cmsis_nn_dims *output_dims);

+/**
+ * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for processors with DSP extension.
+ * Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_depthwise_conv_wrapper_s16_get_buffer_size().
+ *
+ */
+int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
+/**
+ * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for Arm(R) Helium Architecture
+ * case. Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_depthwise_conv_wrapper_s16_get_buffer_size().
+ *
+ */
+int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
+ const cmsis_nn_dims *input_dims,
+ const cmsis_nn_dims *filter_dims,
+ const cmsis_nn_dims *output_dims);
+
 /**
 * @brief Optimized s16 depthwise convolution function with constraint that in_channel equals out_channel.
 * Refer to arm_depthwise_conv_s16() for function argument details.
 *
@@ -759,7 +888,6 @@ int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_pa
 * - Supported framework: TensorFlow Lite
 * - The following constraints on the arguments apply
 * -# Number of input channels equals number of output channels or ch_mult equals 1
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 * - Recommended when number of channels is 4 or greater.
 *
 */
arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q15_t *input_data,
+ const int16_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int64_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q15_t *output_data);
+ int16_t *output_data);

/**
 * @brief Get the required buffer size for optimized s16 depthwise convolution
 *
@@ -781,7 +909,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
 * Batch argument N is not used.
 * @param[in] filter_dims Filter tensor dimensions.
Format: [1, H, W, C_OUT]
- * @return The function returns required buffer size in bytes
+ * @return The function returns required buffer size in bytes
 *
 */
int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
@@ -808,13 +936,13 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel.
@@ -835,7 +963,6 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
 * - Supported framework: TensorFlow Lite
 * - The following constraints on the arguments apply
 * -# Number of input channels equals number of output channels or ch_mult equals 1
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 * - Recommended when number of channels is 4 or greater.
 *
 */
arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
 const cmsis_nn_dw_conv_params *dw_conv_params,
 const cmsis_nn_per_channel_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get the required buffer size for optimized s8 depthwise convolution
 *
@@ -857,7 +984,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
 * Batch argument N is not used.
 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
- * @return The function returns required buffer size in bytes
+ * @return The function returns required buffer size in bytes
 *
 */
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
@@ -909,29 +1036,47 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dim
 *
 * @details
 * - Supported framework: TensorFlow Lite
- * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs.
 */
arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_fc_params *fc_params,
 const cmsis_nn_per_tensor_quant_params *quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
- const q7_t *filter_data,
+ const int8_t *filter_data,
 const cmsis_nn_dims *bias_dims,
 const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
- * @brief Get the required buffer size for S8 basic fully-connected and
- * matrix multiplication layer function for TF Lite
+ * @brief Get size of additional buffer required by arm_fully_connected_s8().
* @param[in] filter_dims dimension of filter * @return The function returns required buffer size in bytes * */ int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); +/** + * @brief Get size of additional buffer required by arm_fully_connected_s8() for processors with DSP extension. + * Refer to arm_fully_connected_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_fully_connected_s8_get_buffer_size(). + * + */ +int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); + +/** + * @brief Get size of additional buffer required by arm_fully_connected_s8() for Arm(R) Helium Architecture case. + * Refer to arm_fully_connected_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_fully_connected_s8_get_buffer_size(). + * + */ +int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims); + /** * @brief Basic s16 Fully Connected function. * @@ -966,23 +1111,21 @@ int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) * * @details * - Supported framework: TensorFlow Lite - * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs. */ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, const cmsis_nn_fc_params *fc_params, const cmsis_nn_per_tensor_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data); + int16_t *output_data); /** - * @brief Get the required buffer size for S16 basic fully-connected and - * matrix multiplication layer function for TF Lite + * @brief Get size of additional buffer required by arm_fully_connected_s16(). * @param[in] filter_dims dimension of filter * @return The function returns required buffer size in bytes * @@ -990,20 +1133,24 @@ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims); /** - * @brief Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @param[in,out] vec_buffer pointer to buffer space for input - * @return The function returns ARM_CMSIS_NN_SUCCESS + * @brief Get size of additional buffer required by arm_fully_connected_s16() for processors with DSP extension. + * Refer to arm_fully_connected_s16_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_fully_connected_s16_get_buffer_size(). * */ +int32_t arm_fully_connected_s16_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); +/** + * @brief Get size of additional buffer required by arm_fully_connected_s16() for Arm(R) Helium Architecture case. + * Refer to arm_fully_connected_s16_get_buffer_size() for function argument details. 
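+ *
+ * A host-side sizing sketch (buffer planning for a Helium target from non-target code; filter_dims assumed
+ * populated by the caller):
+ * <pre>
+ *     int32_t buf_size = arm_fully_connected_s16_get_buffer_size_mve(&filter_dims);
+ * </pre>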
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_fully_connected_s16_get_buffer_size().
+ *
+ */
+int32_t arm_fully_connected_s16_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);

/**
 * @defgroup groupElementwise Elementwise Functions
 *
@@ -1157,24 +1304,46 @@ arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
 * @param[in,out] data pointer to input
 * @param[in] size number of elements
 */
-
-void arm_relu_q7(q7_t *data, uint16_t size);
+void arm_relu_q7(int8_t *data, uint16_t size);

/**
 * @brief s8 ReLU6 function
 * @param[in,out] data pointer to input
 * @param[in] size number of elements
 */
-
-void arm_relu6_s8(q7_t *data, uint16_t size);
+void arm_relu6_s8(int8_t *data, uint16_t size);

/**
 * @brief Q15 RELU function
 * @param[in,out] data pointer to input
 * @param[in] size number of elements
 */
+void arm_relu_q15(int16_t *data, uint16_t size);

-void arm_relu_q15(q15_t *data, uint16_t size);
+/**
+ * @brief s16 neural network activation function using direct table look-up
+ * @param[in] input pointer to input data
+ * @param[out] output pointer to output
+ * @param[in] size number of elements
+ * @param[in] left_shift bit-width of the integer part, assumed to be smaller than 3
+ * @param[in] type type of activation functions
+ *
+ * @details Supported framework: TensorFlow Lite for Microcontrollers.
+ * This activation function must be bit-precise congruent with the corresponding TFLM tanh and sigmoid activation
+ * functions
+ */
+void arm_nn_activation_s16(const int16_t *input,
+ int16_t *output,
+ const uint16_t size,
+ const uint16_t left_shift,
+ const arm_nn_activation_type type);
+
+/**
+ * @defgroup Pooling Pooling Functions
+ *
+ * Perform max and average pooling operations
+ *
+ */

/**
 * @brief s8 average pooling function.
@@ -1204,20 +1373,40 @@ void arm_relu_q15(q15_t *data, uint16_t size);
arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx,
 const cmsis_nn_pool_params *pool_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *filter_dims,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);

/**
 * @brief Get the required buffer size for S8 average pooling function
 * @param[in] dim_dst_width output tensor dimension
 * @param[in] ch_src number of input tensor channels
- * @return The function returns required buffer size in bytes
+ * @return The function returns required buffer size in bytes
 *
 */
int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);

+/**
+ * @brief Get the required buffer size for S8 average pooling function for processors with DSP extension.
+ * Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_avgpool_s8_get_buffer_size().
+ *
+ */
+int32_t arm_avgpool_s8_get_buffer_size_dsp(const int dim_dst_width, const int ch_src);
+
+/**
+ * @brief Get the required buffer size for S8 average pooling function for Arm(R) Helium Architecture case.
+ * Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_avgpool_s8_get_buffer_size().
+ *
+ */
+int32_t arm_avgpool_s8_get_buffer_size_mve(const int dim_dst_width, const int ch_src);
+
 /**
 * @brief s16 average pooling function.
* @@ -1256,11 +1445,31 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, * @brief Get the required buffer size for S16 average pooling function * @param[in] dim_dst_width output tensor dimension * @param[in] ch_src number of input tensor channels - * @return The function returns required buffer size in bytes + * @return The function returns required buffer size in bytes * */ int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src); +/** + * @brief Get the required buffer size for S16 average pooling function for processors with DSP extension. + * Refer to arm_avgpool_s16_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_avgpool_s16_get_buffer_size(). + * + */ +int32_t arm_avgpool_s16_get_buffer_size_dsp(const int dim_dst_width, const int ch_src); + +/** + * @brief Get the required buffer size for S16 average pooling function for Arm(R) Helium Architecture case. + * Refer to arm_avgpool_s16_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_avgpool_s16_get_buffer_size(). + * + */ +int32_t arm_avgpool_s16_get_buffer_size_mve(const int dim_dst_width, const int ch_src); + /** * @brief s8 max pooling function. * @@ -1290,10 +1499,10 @@ int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_sr arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q7_t *output_data); + int8_t *output_data); /** * @brief s16 max pooling function. @@ -1431,7 +1640,6 @@ void arm_softmax_u8(const uint8_t *input, const int32_t diff_min, uint8_t *output); - /** * @defgroup Reshape Reshape Functions * @@ -1669,8 +1877,6 @@ void arm_concatenation_s8_w(const int8_t *input, * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. - * */ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, const cmsis_nn_context *output_ctx, @@ -1678,17 +1884,17 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, const cmsis_nn_per_tensor_quant_params *input_quant_params, const cmsis_nn_per_tensor_quant_params *output_quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *state_dims, - q7_t *state_data, + int8_t *state_data, const cmsis_nn_dims *weights_feature_dims, - const q7_t *weights_feature_data, + const int8_t *weights_feature_data, const cmsis_nn_dims *weights_time_dims, - const q7_t *weights_time_data, + const int8_t *weights_time_data, const cmsis_nn_dims *bias_dims, - const q31_t *bias_data, + const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data); + int8_t *output_data); /** * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights @@ -1719,8 +1925,6 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, * * @details * 1. Supported framework: TensorFlow Lite micro - * 2. q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. 
- *
 */
arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
 const cmsis_nn_context *output_ctx,
@@ -1728,17 +1932,85 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
 const cmsis_nn_per_tensor_quant_params *input_quant_params,
 const cmsis_nn_per_tensor_quant_params *output_quant_params,
 const cmsis_nn_dims *input_dims,
- const q7_t *input_data,
+ const int8_t *input_data,
 const cmsis_nn_dims *state_dims,
- q15_t *state_data,
+ int16_t *state_data,
 const cmsis_nn_dims *weights_feature_dims,
- const q7_t *weights_feature_data,
+ const int8_t *weights_feature_data,
 const cmsis_nn_dims *weights_time_dims,
- const q15_t *weights_time_data,
+ const int16_t *weights_time_data,
 const cmsis_nn_dims *bias_dims,
- const q31_t *bias_data,
+ const int32_t *bias_data,
 const cmsis_nn_dims *output_dims,
- q7_t *output_data);
+ int8_t *output_data);
+
+/**
+ * @defgroup LSTM LSTM Layer Functions
+ *
+ */
+
+/**
+ * @brief LSTM unidirectional function with 8 bit input and output and 16 bit gate output
+ * Peephole connections, projection, clipping, combined input/forget gate and layer normalization are not supported.
+ *
+ * @param[in] scratch_buffers Struct containing scratch buffers
+ * Expected size for each scratch buffer is
+ * lstm_dims->num_batches * lstm_dims->num_outputs.
+ * @param[in] input_data Pointer to input data
+ * @param[in] lstm_dims LSTM input parameters related to dimensions
+ * @param[in] input_to_input_weights Input to input weights
+ * @param[in] input_to_forget_weights Input to forget weights
+ * @param[in] input_to_cell_weights Input to cell weights
+ * @param[in] input_to_output_weights Input to output weights
+ * @param[in] recurrent_to_input_weights Recurrent to input weights
+ * @param[in] recurrent_to_forget_weights Recurrent to forget weights
+ * @param[in] recurrent_to_cell_weights Recurrent to cell weights
+ * @param[in] recurrent_to_output_weights Recurrent to output weights
+ * @param[in] cell_to_input_weights Cell to input weights. Not used.
+ * @param[in] cell_to_forget_weights Cell to forget weights. Not used.
+ * @param[in] cell_to_output_weights Cell to output weights. Not used.
+ * @param[in] projection_weights Projection weights. Not used.
+ * @param[in] lstm LSTM parameters. See struct declaration
+ * @param[in] output_state Pointer to (recurrent) output state
+ * @param[in] cell_state Pointer to cell state
+ * @param[in] output_data Pointer to output data
+ *
+ * @note The following assumptions are made based on LSTM functionality as supported by
+ * Keras version 2.9.0 at the time of development. As stated here,
+ * https://github.com/tensorflow/community/blob/master/rfcs/20180920-unify-rnn-interface.md
+ * Keras's LSTMCell is equivalent to TensorFlow's BasicLSTMCell,
+ * which does not support peephole, clipping or projection.
+ * Layer normalization and combined input/forget gate are not supported either.
+ *
+ * 1 Input to input weight cannot be nullptr. Otherwise nullptr for combined input/forget gate.
+ * 2 Cell weights are not used and should be nullptr. Otherwise needed for peephole connections.
+ * 3 Projection weight is not used and should be nullptr. Otherwise needed for projection.
+ *
+ * @return The function returns ARM_CMSIS_NN_SUCCESS
+ *
+ * @details
+ * 1.
Supported framework: TensorFlow Lite micro
+ *
+ */
+arm_cmsis_nn_status arm_lstm_unidirectional_s16_s8(cmsis_nn_lstm_context *scratch_buffers,
+ const int8_t *input_data,
+ const cmsis_nn_lstm_dims *lstm_dims,
+ const int8_t *input_to_input_weights,
+ const int8_t *input_to_forget_weights,
+ const int8_t *input_to_cell_weights,
+ const int8_t *input_to_output_weights,
+ const int8_t *recurrent_to_input_weights,
+ const int8_t *recurrent_to_forget_weights,
+ const int8_t *recurrent_to_cell_weights,
+ const int8_t *recurrent_to_output_weights,
+ const int16_t *cell_to_input_weights,
+ const int16_t *cell_to_forget_weights,
+ const int16_t *cell_to_output_weights,
+ const int8_t *projection_weights,
+ const cmsis_nn_lstm_params *lstm,
+ int8_t *output_state,
+ int16_t *cell_state,
+ int8_t *output_data);

#ifdef __cplusplus
}
diff --git a/src/third_party/cmsis_nn/Include/arm_nnsupportfunctions.h b/src/third_party/cmsis_nn/Include/arm_nnsupportfunctions.h
index 8860f299..b1deaba7 100644
--- a/src/third_party/cmsis_nn/Include/arm_nnsupportfunctions.h
+++ b/src/third_party/cmsis_nn/Include/arm_nnsupportfunctions.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@@ -21,15 +21,16 @@
 * Title: arm_nnsupportfunctions.h
 * Description: Public header file of support functions for CMSIS NN Library
 *
- * $Date: 30 September 2022
- * $Revision: V.11.0.0
+ * $Date: 13 February 2023
+ * $Revision: V.15.0.0
 *
- * Target Processor: Cortex-M CPUs
+ * Target : Arm(R) M-Profile Architecture
 * -------------------------------------------------------------------- */

#ifndef _ARM_NNSUPPORTFUNCTIONS_H_
#define _ARM_NNSUPPORTFUNCTIONS_H_

+#include "third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h"
#include "third_party/cmsis_nn/Include/arm_nn_math_types.h"
#include "third_party/cmsis_nn/Include/arm_nn_types.h"

@@ -39,6 +40,10 @@
extern "C" {
#endif

+#define USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims) \
+ (dw_conv_params->ch_mult == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1 && \
+ filter_dims->w * filter_dims->h < 512)
+
 #define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0)
 #define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift)
 #define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0
@@ -61,7 +66,7 @@ extern "C" {

/**
 * @brief definition to pack four 8 bit values.
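 * For example, PACK_S8x4_32x1(1, 2, 3, 4) evaluates to 0x04030201.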
*/ -#define PACK_Q7x4_32x1(v0, v1, v2, v3) \ +#define PACK_S8x4_32x1(v0, v1, v2, v3) \ ((((int32_t)(v0) << 0) & (int32_t)0x000000FF) | (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \ (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | (((int32_t)(v3) << 24) & (int32_t)0xFF000000)) @@ -71,16 +76,16 @@ extern "C" { #define PACK_Q15x2_32x1(v0, v1) (((int32_t)v0 & (int32_t)0xFFFF) | ((int32_t)v1 << 16)) /** - * @brief Union for SIMD access of q31/q15/q7 types + * @brief Union for SIMD access of q31/s16/s8 types */ union arm_nnword { - q31_t word; + int32_t word; /**< q31 type */ - q15_t half_words[2]; - /**< q15 type */ - q7_t bytes[4]; - /**< q7 type */ + int16_t half_words[2]; + /**< s16 type */ + int8_t bytes[4]; + /**< s8 type */ }; /** @@ -105,7 +110,6 @@ union arm_nn_long_long * */ - /** * @defgroup supportConversion Data Conversion * @@ -114,22 +118,22 @@ union arm_nn_long_long */ /** - * @brief Converts the elements from a q7 vector to a q15 vector with an added offset - * @param[in] src pointer to the q7 input vector - * @param[out] dst pointer to the q15 output vector + * @brief Converts the elements from a s8 vector to a s16 vector with an added offset + * @param[in] src pointer to the s8 input vector + * @param[out] dst pointer to the s16 output vector * @param[in] block_size length of the input vector - * @param[in] offset q7 offset to be added to each input vector element. + * @param[in] offset s8 offset to be added to each input vector element. * * \par Description: * * The equation used for the conversion process is: * *
- *  dst[n] = (q15_t) src[n] + offset;   0 <= n < block_size.
+ *  dst[n] = (int16_t) src[n] + offset;   0 <= n < block_size.
  * 
* */ -void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset); +void arm_q7_to_q15_with_offset(const int8_t *src, int16_t *dst, uint32_t block_size, int16_t offset); /** * @brief Depthwise conv on an im2col buffer where the input channel equals output channel. @@ -150,17 +154,17 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, * * @details Supported framework: TensorFlow Lite micro. */ -q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, - const q15_t *col, - const uint16_t num_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int32_t out_offset, - const int32_t activation_min, - const int32_t activation_max, - const uint16_t kernel_size, - const int32_t *const output_bias, - q7_t *out); +int8_t *arm_nn_depthwise_conv_s8_core(const int8_t *row, + const int16_t *col, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t kernel_size, + const int32_t *const output_bias, + int8_t *out); /** * @brief General Matrix-multiplication function with per-channel requantization. @@ -184,20 +188,20 @@ q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, * * @details Supported framework: TensorFlow Lite */ -q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, - const q7_t *input_col, - const uint16_t output_ch, - const uint16_t col_batches, - const int32_t *output_shift, - const int32_t *output_mult, - const int32_t out_offset, - const int32_t col_offset, - const int32_t row_offset, - const int16_t out_activation_min, - const int16_t out_activation_max, - const uint16_t row_len, - const int32_t *const bias, - q7_t *out); +int8_t *arm_nn_mat_mult_s8(const int8_t *input_row, + const int8_t *input_col, + const uint16_t output_ch, + const uint16_t col_batches, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t out_offset, + const int32_t col_offset, + const int32_t row_offset, + const int16_t out_activation_min, + const int16_t out_activation_max, + const uint16_t row_len, + const int32_t *const bias, + int8_t *out); /** * @brief Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution. * @param[in] input_a pointer to operand A @@ -219,16 +223,16 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, * clamped in the range provided by activation min and max. * Supported framework: TensorFlow Lite micro. */ -q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, - const q15_t *input_b, - const int32_t output_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int16_t activation_min, - const int16_t activation_max, - const int32_t num_col_a, - const int64_t *const output_bias, - q15_t *out_0); +int16_t *arm_nn_mat_mult_kernel_s16(const int8_t *input_a, + const int16_t *input_b, + const int32_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int16_t activation_min, + const int16_t activation_max, + const int32_t num_col_a, + const int64_t *const output_bias, + int16_t *out_0); /** * @brief General Vector by Matrix multiplication with requantization and storage of result. @@ -319,14 +323,16 @@ int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements, * @param[in] dst_offset Offset to be applied the output result * @param[in] activation_min Minimum value to clamp down the output. Range : int8 * @param[in] activation_max Maximum value to clamp up the output. 
Range : int8 + * @param[in] rhs_cols_offset Offset between input columns. Used to handle non-unity strides + * Expected value : x * rhs_cols, where x >= 1 * * @return The function returns ARM_CMSIS_NN_SUCCESS * */ -arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, - const q7_t *rhs, - const q31_t *bias, - q7_t *dst, +arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const int8_t *lhs, + const int8_t *rhs, + const int32_t *bias, + int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, @@ -335,7 +341,8 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, - const int32_t activation_max); + const int32_t activation_max, + const int32_t rhs_cols_offset); /** * @brief s8 Vector by Matrix (transposed) multiplication @@ -346,7 +353,6 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, * @param[out] dst Output vector * @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector. * Range: -127 to 128 - * @param[in] rhs_offset Not used * @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128 * @param[in] dst_multiplier Output multiplier * @param[in] dst_shift Output shift @@ -360,12 +366,11 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, * @return The function returns ARM_CMSIS_NN_SUCCESS * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, - const q7_t *rhs, - const q31_t *bias, - q7_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, + const int8_t *rhs, + const int32_t *bias, + int8_t *dst, const int32_t lhs_offset, - const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, @@ -392,10 +397,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, * @return The function returns ARM_CMSIS_NN_SUCCESS * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, - const q7_t *rhs, - const q63_t *bias, - q15_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const int16_t *lhs, + const int8_t *rhs, + const int64_t *bias, + int16_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, @@ -411,7 +416,6 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, * @param[out] dst Output vector * @param[in] lhs_offset Offset to be added to the input values of the left-hand side * vector. Range: -127 to 128 - * @param[in] rhs_offset Not used * @param[in] scatter_offset Address offset for dst. First output is stored at 'dst', the * second at 'dst + scatter_offset' and so on. 
* @param[in] dst_multiplier Output multiplier @@ -424,11 +428,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, * @return The function returns ARM_CMSIS_NN_SUCCESS * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, - const q7_t *rhs, - q15_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const int8_t *lhs, + const int8_t *rhs, + int16_t *dst, const int32_t lhs_offset, - const int32_t rhs_offset, const int32_t scatter_offset, const int32_t dst_multiplier, const int32_t dst_shift, @@ -466,8 +469,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, * - Output bias * - rhs */ -arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, - const q7_t *rhs, +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const int8_t *lhs, + const int8_t *rhs, const int32_t lhs_offset, const int32_t active_ch, const int32_t total_ch, @@ -478,7 +481,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, - q7_t *out); + int8_t *out); /** * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. @@ -509,8 +512,8 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, * - Output bias * - rhs */ -arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, - const q7_t *rhs, +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const int8_t *lhs, + const int8_t *rhs, const int32_t lhs_offset, const int32_t active_ch, const int32_t total_ch, @@ -521,7 +524,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, - q7_t *out); + int8_t *out); /** * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. @@ -550,7 +553,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, * - rhs */ int16_t *arm_nn_depthwise_conv_nt_t_s16(const int16_t *lhs, - const q7_t *rhs, + const int8_t *rhs, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, @@ -561,36 +564,13 @@ int16_t *arm_nn_depthwise_conv_nt_t_s16(const int16_t *lhs, int16_t *out); /** - *@brief Matrix-multiplication function for convolution with reordered columns - *@param[in] pA pointer to operand A - *@param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - *@param[in] ch_im_out numRow of A - *@param[in] numCol_A numCol of A - *@param[in] bias_shift amount of left-shift for bias - *@param[in] out_shift amount of right-shift for output - *@param[in] bias the bias - *@param[in,out] pOut pointer to output - *@return The function returns the incremented output pointer - * - *@details This function assumes that data in pInBuffer are reordered - */ -q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, - const q15_t *pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t *bias, - q7_t *pOut); - -/** - @brief Read 2 q15 elements and post increment pointer. + @brief Read 2 s16 elements and post increment pointer. @param[in] in_q15 Pointer to pointer that holds address of input. 
@return q31 value */ -__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15) +__STATIC_FORCEINLINE int32_t arm_nn_read_q15x2_ia(const int16_t **in_q15) { - q31_t val; + int32_t val; memcpy(&val, *in_q15, 4); *in_q15 += 2; @@ -599,51 +579,51 @@ __STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15) } /** - @brief Read 4 q7 from q7 pointer and post increment pointer. - @param[in] in_q7 Pointer to pointer that holds address of input. + @brief Read 4 s8 from s8 pointer and post increment pointer. + @param[in] in_s8 Pointer to pointer that holds address of input. @return q31 value */ -__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7) +__STATIC_FORCEINLINE int32_t arm_nn_read_s8x4_ia(const int8_t **in_s8) { - q31_t val; - memcpy(&val, *in_q7, 4); - *in_q7 += 4; + int32_t val; + memcpy(&val, *in_s8, 4); + *in_s8 += 4; return (val); } /** - @brief Read 2 q15 from q15 pointer. - @param[in] in_q15 pointer to address of input. - @return q31 value + @brief Read 2 int16 values from int16 pointer. + @param[in] in pointer to address of input. + @return s32 value */ -__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15) +__STATIC_FORCEINLINE int32_t arm_nn_read_s16x2(const int16_t *in) { - q31_t val; - memcpy(&val, in_q15, 4); + int32_t val; + memcpy(&val, in, 4); return (val); } /** - @brief Read 4 q7 values. - @param[in] in_q7 pointer to address of input. - @return q31 value + @brief Read 4 s8 values. + @param[in] in_s8 pointer to address of input. + @return s32 value */ -__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7) +__STATIC_FORCEINLINE int32_t arm_nn_read_s8x4(const int8_t *in_s8) { - q31_t val; - memcpy(&val, in_q7, 4); + int32_t val; + memcpy(&val, in_s8, 4); return (val); } /** - @brief Write four q7 to q7 pointer and increment pointer afterwards. + @brief Write four s8 to s8 pointer and increment pointer afterwards. @param[in] in Double pointer to input value @param[in] value Four bytes to copy */ -__STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value) +__STATIC_FORCEINLINE void arm_nn_write_s8x4_ia(int8_t **in, int32_t value) { memcpy(*in, &value, 4); *in += 4; @@ -656,7 +636,7 @@ __STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value) * @param[in] block_size Number of bytes to copy. 
* */ -__STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t block_size) +__STATIC_FORCEINLINE void arm_memset_s8(int8_t *dst, const int8_t val, uint32_t block_size) { #if defined(ARM_MATH_MVEI) __asm volatile(" vdup.8 q0, %[set_val] \n" @@ -676,61 +656,40 @@ __STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t bloc #if defined(ARM_MATH_DSP) /** - * @brief read and expand one q7 word into two q15 words + * @brief read and expand one s8 word into two s16 words */ -__STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t *out1, q31_t *out2) +__STATIC_FORCEINLINE const int8_t *read_and_pad(const int8_t *source, int32_t *out1, int32_t *out2) { - q31_t inA = arm_nn_read_q7x4_ia(&source); - q31_t inAbuf1 = __SXTB16_RORn((uint32_t)inA, 8); - q31_t inAbuf2 = __SXTB16(inA); - -#ifndef ARM_MATH_BIG_ENDIAN - *out2 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16)); - *out1 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16)); -#else - *out1 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16)); - *out2 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16)); -#endif + int32_t inA = arm_nn_read_s8x4_ia(&source); + int32_t inAbuf1 = SXTB16_RORn((uint32_t)inA, 8); + int32_t inAbuf2 = SXTB16(inA); + + #ifndef ARM_MATH_BIG_ENDIAN + *out2 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16)); + *out1 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16)); + #else + *out1 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16)); + *out2 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16)); + #endif return source; } /** - * @brief read and expand one q7 word into two q15 words with reordering + * @brief read and expand one s8 word into two s16 words with reordering */ -__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t *out1, q31_t *out2) +__STATIC_FORCEINLINE const int8_t *read_and_pad_reordered(const int8_t *source, int32_t *out1, int32_t *out2) { - q31_t inA = arm_nn_read_q7x4_ia(&source); -#ifndef ARM_MATH_BIG_ENDIAN - *out2 = __SXTB16(__ROR((uint32_t)inA, 8)); - *out1 = __SXTB16(inA); -#else - *out1 = __SXTB16(__ROR((uint32_t)inA, 8)); - *out2 = __SXTB16(inA); -#endif - - return source; -} - -/** - * @brief read and expand one q7 word into two q15 words with reordering and add an offset - */ -__STATIC_FORCEINLINE const q7_t * -read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2, q31_t offset) -{ - q31_t inA = arm_nn_read_q7x4_ia(&source); - -#ifndef ARM_MATH_BIG_ENDIAN - *out2 = __SXTB16(__ROR((uint32_t)inA, 8)); - *out1 = __SXTB16(inA); -#else - *out1 = __SXTB16(__ROR((uint32_t)inA, 8)); - *out2 = __SXTB16(inA); -#endif - *out1 = __QADD16(*out1, offset); - *out2 = __QADD16(*out2, offset); + int32_t inA = arm_nn_read_s8x4_ia(&source); + #ifndef ARM_MATH_BIG_ENDIAN + *out2 = SXTB16(ROR((uint32_t)inA, 8)); + *out1 = SXTB16(inA); + #else + *out1 = SXTB16(ROR((uint32_t)inA, 8)); + *out2 = SXTB16(inA); + #endif return source; } @@ -759,17 +718,17 @@ read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2, * clamped in the range provided by activation min and max. * Supported framework: TensorFlow Lite micro. 
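 *
 * A sketch of the per-channel requantization each output accumulator passes through
 * (clamping shown with the MAX/MIN helper macros; acc and ch are illustrative names):
 * <pre>
 *     acc += output_bias[ch];
 *     acc = arm_nn_requantize(acc, out_mult[ch], out_shift[ch]) + out_offset;
 *     acc = MAX(activation_min, MIN(acc, activation_max));
 * </pre>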
*/ -q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, - const q15_t *input_b, - const uint16_t output_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int32_t out_offset, - const int16_t activation_min, - const int16_t activation_max, - const uint16_t num_col_a, - const int32_t *const output_bias, - q7_t *out_0); +int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t *input_a, + const int16_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + int8_t *out_0); /** * @brief Common softmax function for s8 input and s8 or s16 output @@ -799,9 +758,9 @@ void arm_nn_softmax_common_s8(const int8_t *input, * @brief macro for adding rounding offset */ #ifndef ARM_NN_TRUNCATE -#define NN_ROUND(out_shift) ((0x1 << out_shift) >> 1) + #define NN_ROUND(out_shift) ((0x1 << out_shift) >> 1) #else -#define NN_ROUND(out_shift) 0 + #define NN_ROUND(out_shift) 0 #endif // Macros for shortening quantization functions' names and avoid long lines @@ -823,18 +782,18 @@ void arm_nn_softmax_common_s8(const int8_t *input, * @return Result of multiplication. * */ -__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t m2) +__STATIC_FORCEINLINE int32_t arm_nn_doubling_high_mult(const int32_t m1, const int32_t m2) { - q31_t result = 0; + int32_t result = 0; // Rounding offset to add for a right shift of 31 - q63_t mult = 1 << 30; + int64_t mult = 1 << 30; if ((m1 < 0) ^ (m2 < 0)) { mult = 1 - mult; } // Gets resolved as a SMLAL instruction - mult = mult + (q63_t)m1 * m2; + mult = mult + (int64_t)m1 * m2; // Utilize all of the upper 32 bits. This is the doubling step // as well. @@ -861,9 +820,9 @@ __STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t * this function. * */ -__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, const q31_t m2) +__STATIC_FORCEINLINE int32_t arm_nn_doubling_high_mult_no_sat(const int32_t m1, const int32_t m2) { - q31_t result = 0; + int32_t result = 0; union arm_nn_long_long mult; // Rounding offset to add for a right shift of 31 @@ -871,7 +830,7 @@ __STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, cons mult.word.high = 0; // Gets resolved as a SMLAL instruction - mult.long_long = mult.long_long + (q63_t)m1 * m2; + mult.long_long = mult.long_long + (int64_t)m1 * m2; // Utilize all of the upper 32 bits. This is the doubling step // as well. @@ -888,17 +847,17 @@ __STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, cons * @return Rounded result of division. Midpoint is rounded away from zero. 
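 *
 * Worked examples of the rounding behaviour (derived from the implementation below):
 * <pre>
 *     arm_nn_divide_by_power_of_two(5, 1)  ==  3   //  2.5 rounds away from zero
 *     arm_nn_divide_by_power_of_two(-5, 1) == -3   // -2.5 rounds away from zero
 * </pre>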
* */ -__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent) +__STATIC_FORCEINLINE int32_t arm_nn_divide_by_power_of_two(const int32_t dividend, const int32_t exponent) { - q31_t result = 0; - const q31_t remainder_mask = (1 << exponent) - 1; + int32_t result = 0; + const int32_t remainder_mask = (1 << exponent) - 1; int32_t remainder = remainder_mask & dividend; // Basic division result = dividend >> exponent; // Adjust 'result' for rounding (mid point away from zero) - q31_t threshold = remainder_mask >> 1; + int32_t threshold = remainder_mask >> 1; if (result < 0) { threshold++; @@ -920,7 +879,7 @@ __STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, c * @return Returns (val * multiplier)/(2 ^ shift) * */ -__STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift) +__STATIC_FORCEINLINE int32_t arm_nn_requantize(const int32_t val, const int32_t multiplier, const int32_t shift) { #ifdef CMSIS_NN_USE_SINGLE_ROUNDING const int64_t total_shift = 31 - shift; @@ -946,12 +905,14 @@ __STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multip * @return Returns (val * multiplier)/(2 ^ shift) * */ -__STATIC_FORCEINLINE q31_t arm_nn_requantize_s64(const q63_t val, const q31_t reduced_multiplier, const q31_t shift) +__STATIC_FORCEINLINE int32_t arm_nn_requantize_s64(const int64_t val, + const int32_t reduced_multiplier, + const int32_t shift) { - const q63_t new_val = val * reduced_multiplier; + const int64_t new_val = val * reduced_multiplier; - q31_t result = new_val >> (14 - shift); // 64->32 bit reduction - result = (result + 1) >> 1; // Last shift position and insert round + int32_t result = new_val >> (14 - shift); // 64->32 bit reduction + result = (result + 1) >> 1; // Last shift position and insert round return result; } @@ -963,7 +924,7 @@ __STATIC_FORCEINLINE q31_t arm_nn_requantize_s64(const q63_t val, const q31_t re * @param[in] block_size Number of bytes to copy. * */ -__STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__RESTRICT src, uint32_t block_size) +__STATIC_FORCEINLINE void arm_memcpy_s8(int8_t *__RESTRICT dst, const int8_t *__RESTRICT src, uint32_t block_size) { #if defined(ARM_MATH_MVEI) __asm volatile(" wlstp.8 lr, %[cnt], 1f \n" @@ -987,7 +948,7 @@ __STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__REST * @param[in] block_size Number of bytes to copy. * */ -__STATIC_FORCEINLINE void arm_memcpy_q15(q15_t *__RESTRICT dst, const q15_t *__RESTRICT src, uint32_t block_size) +__STATIC_FORCEINLINE void arm_memcpy_q15(int16_t *__RESTRICT dst, const int16_t *__RESTRICT src, uint32_t block_size) { memcpy(dst, src, block_size); } @@ -1000,7 +961,7 @@ __STATIC_FORCEINLINE void arm_memcpy_q15(q15_t *__RESTRICT dst, const q15_t *__R * @return Result of multiplication. * */ -__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const q31_t m2) +__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const int32_t m2) { return vqrdmulhq_n_s32(m1, m2); } @@ -1013,7 +974,7 @@ __STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, co * @return Rounded result of division. Midpoint is rounded away from zero. 
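/*
 * Editor's note (not part of the patch): a minimal sketch of how
 * arm_nn_requantize() is typically used to fold a MAC accumulator back into
 * the s8 output scale. The helper name and the offset/clamp parameters are
 * illustrative assumptions; in practice they come from the per-channel
 * quantization data. Following the TFLM convention, the result approximates
 * acc * (multiplier / 2^31) * 2^shift.
 */
static int8_t requantize_to_s8(int32_t acc, int32_t multiplier, int32_t shift,
                               int32_t out_offset, int32_t act_min, int32_t act_max)
{
    int32_t out = arm_nn_requantize(acc, multiplier, shift);
    out += out_offset;       /* add the output zero point       */
    out = MAX(out, act_min); /* clamp to the activation range   */
    out = MIN(out, act_max);
    return (int8_t)out;
}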
* */ -__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const q31_t exponent) +__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const int32_t exponent) { const int32x4_t shift = vdupq_n_s32(-exponent); const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31); @@ -1030,9 +991,9 @@ __STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t di * @return Returns (val * multiplier)/(2 ^ shift) * */ -__STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31_t multiplier, const q31_t shift) +__STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const int32_t multiplier, const int32_t shift) { -#ifdef CMSIS_NN_USE_SINGLE_ROUNDING + #ifdef CMSIS_NN_USE_SINGLE_ROUNDING const int right_shift = MIN(-1, shift); const int left_shift = shift - right_shift; @@ -1043,10 +1004,10 @@ __STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31 result = vrshlq_s32(result, right_shift_dup); return result; -#else + #else return arm_divide_by_power_of_two_mve( arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier), RIGHT_SHIFT(shift)); -#endif + #endif } __STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve_32x4(const int32x4_t m1, const int32x4_t m2) @@ -1066,7 +1027,7 @@ __STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, const int32x4_t multiplier, const int32x4_t shift) { -#ifdef CMSIS_NN_USE_SINGLE_ROUNDING + #ifdef CMSIS_NN_USE_SINGLE_ROUNDING const int32x4_t right_shift = vminq_s32(vdupq_n_s32(-1), shift); const int32x4_t left_shift = vqsubq_s32(shift, right_shift); @@ -1074,7 +1035,7 @@ __STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, result = vrshlq_s32(result, right_shift); return result; -#else + #else const int32x4_t zz = vdupq_n_s32(0); const mve_pred16_t p = vcmpgtq_n_s32(shift, 0); @@ -1083,7 +1044,7 @@ __STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier), right_shift); -#endif + #endif } #endif @@ -1122,7 +1083,7 @@ __STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values(int32_t val) return SELECT_USING_MASK(mask, NN_Q31_MAX, result); } -__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp) +__STATIC_FORCEINLINE int32_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp) { const int32_t thresh = ((1 << (31 - exp)) - 1); int32_t result = val << exp; @@ -1146,18 +1107,191 @@ __STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val } /** - @brief Write 2 q15 elements and post increment pointer. + @brief Write 2 s16 elements and post increment pointer. @param[in] dest_q15 Pointer to pointer that holds address of destination. @param[in] src_q31 Input value to be written. */ -__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31) +__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(int16_t **dest_q15, int32_t src_q31) { - q31_t val = src_q31; + int32_t val = src_q31; memcpy(*dest_q15, &val, 4); *dest_q15 += 2; } +/** + @brief Write 2 s8 elements and post increment pointer. + @param[in] dst Pointer to pointer that holds address of destination. + @param[in] src Input value to be written. 
+ */
+__STATIC_FORCEINLINE void arm_nn_write_s8x2_ia(int8_t **dst, int16_t src)
+{
+    memcpy(*dst, &src, 2);
+    *dst += 2;
+}
+
+// Support functions for LSTM
+/**
+ * @brief Updates the LSTM state for one iteration step
+ *
+ * param[in]    input                      Input data
+ * param[in]    input_to_input_weight      Input to input gate weights
+ * param[in]    input_to_forget_weight     Input to forget gate weights
+ * param[in]    input_to_cell_weight       Input to cell gate weights
+ * param[in]    input_to_output_weight     Input to output weights
+ * param[in]    recurrent_to_input_weight  Recurrent signal to input weights
+ * param[in]    recurrent_to_forget_weight Recurrent signal to forget gate weights
+ * param[in]    recurrent_to_cell_weight   Recurrent signal to cell gate weights
+ * param[in]    recurrent_to_output_weight Recurrent signal to output weights
+ * param[in]    lstm                       LSTM parameters
+ * param[in]    n_batch                    Batch size
+ * param[in]    n_cell                     Cell size
+ * param[in]    n_input                    Input size
+ * param[in]    n_output                   Output size
+ * param[out]   output_state               Output state
+ * param[out]   cell_state                 Internal state
+ * param[out]   output                     Output signal
+ * param[in]    scratch_buffers            Struct containing scratch buffers
+ */
+arm_cmsis_nn_status arm_nn_lstm_step_s8_s16(const int8_t *input,
+                                            const int8_t *input_to_input_weight,
+                                            const int8_t *input_to_forget_weight,
+                                            const int8_t *input_to_cell_weight,
+                                            const int8_t *input_to_output_weight,
+                                            const int8_t *recurrent_to_input_weight,
+                                            const int8_t *recurrent_to_forget_weight,
+                                            const int8_t *recurrent_to_cell_weight,
+                                            const int8_t *recurrent_to_output_weight,
+                                            const cmsis_nn_lstm_params *lstm,
+                                            const int n_batch,
+                                            const int n_cell,
+                                            const int n_input,
+                                            const int n_output,
+                                            int8_t *output_state,
+                                            int16_t *cell_state,
+                                            int8_t *output,
+                                            cmsis_nn_lstm_context *scratch_buffers);
+
+/**
+ * @brief Updates an LSTM gate for an iteration step of the LSTM function, int8x8_16 version.
+ *
+ * param[in]    input                       Input data
+ * param[in]    input_to_gate_weights       Input to gate weights
+ * param[in]    input_to_gate_bias          Input to gate bias
+ * param[in]    input_to_gate_scaling       Input to gate scaling
+ * param[in]    output_state                Output state
+ * param[in]    recurrent_to_gate_weights   Recurrent to gate weights
+ * param[in]    recurrent_to_gate_bias      Recurrent to gate bias
+ * param[in]    recurrent_to_gate_scaling   Recurrent to gate scaling
+ * param[in]    n_batch                     Batch size
+ * param[in]    n_input                     Input size
+ * param[in]    n_output                    Output size
+ * param[in]    n_cell                      Cell size
+ * param[in]    activation_type             Activation type (sigmoid or tanh)
+ * param[out]   gate                        Gate output, size n_batch * n_cell
+ */
+void arm_nn_lstm_calculate_gate_s8_s16(const int8_t *input,
+                                       const int8_t *input_to_gate_weights,
+                                       const int32_t *input_to_gate_bias,
+                                       const cmsis_nn_scaling input_to_gate_scaling,
+                                       const int8_t *output_state,
+                                       const int8_t *recurrent_to_gate_weights,
+                                       const int32_t *recurrent_to_gate_bias,
+                                       const cmsis_nn_scaling recurrent_to_gate_scaling,
+                                       const int32_t n_batch,
+                                       const int32_t n_input,
+                                       const int32_t n_output,
+                                       const int32_t n_cell,
+                                       const arm_nn_activation_type activation_type,
+                                       int16_t *gate);
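/*
 * Editor's note (not part of the patch): a float reference sketch of the
 * per-gate math that arm_nn_lstm_calculate_gate_s8_s16() implements in
 * fixed point: gate = act(W_input * x + bias + W_recurrent * h), where act
 * is sigmoid or tanh depending on activation_type. All names below are
 * illustrative assumptions, not CMSIS-NN API.
 */
#include <math.h>

static void lstm_gate_ref(const float *input, const float *w_in, const float *bias,
                          const float *output_state, const float *w_rec,
                          int n_input, int n_output, int n_cell,
                          int use_sigmoid, float *gate)
{
    for (int c = 0; c < n_cell; c++)
    {
        float acc = bias ? bias[c] : 0.0f;
        for (int i = 0; i < n_input; i++) /* input contribution */
        {
            acc += w_in[c * n_input + i] * input[i];
        }
        for (int o = 0; o < n_output; o++) /* recurrent contribution */
        {
            acc += w_rec[c * n_output + o] * output_state[o];
        }
        gate[c] = use_sigmoid ? 1.0f / (1.0f + expf(-acc)) : tanhf(acc);
    }
}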
+
+/**
+ * @brief Update cell state for a single LSTM iteration step, int8x8_16 version.
+ * @param[in]     n_block             Total number of cells for all batches
+ * @param[in]     cell_state_scale    Scaling factor of cell state
+ * @param[in,out] cell_state          Input/output vector, size n_batch * n_cell
+ * @param[in]     input_gate          Input vector of size n_block
+ * @param[in]     forget_gate         Input vector of size n_block
+ * @param[in]     cell_gate           Input vector of size n_block
+ */
+void arm_nn_lstm_update_cell_state_s16(const int32_t n_block,
+                                       const int32_t cell_state_scale,
+                                       int16_t *cell_state,
+                                       const int16_t *input_gate,
+                                       const int16_t *forget_gate,
+                                       const int16_t *cell_gate);
+
+/**
+ * @brief Calculate the output state tensor of an LSTM step, s8 input/output and s16 weight version.
+ *
+ * @param[in]       n_batch             The number of distinct vectors in each array
+ * @param[in]       n_cell              Number of cells
+ * @param[in,out]   cell_state          Cell state, size n_batch * n_cell
+ * @param[in]       cell_state_scale    Scaling of cell_state
+ * @param[in]       output_gate         Output gate
+ * @param[in]       hidden_scale        Effective scaling of cell_state .* output_gate
+ * @param[in]       hidden_offset       Zero point for cell_state .* output_gate
+ * @param[out]      output_state        Output state
+ * @param[in]       cell_gate_scratch   Scratch buffer
+ */
+void arm_nn_lstm_update_output_s8_s16(const int n_batch,
+                                      const int n_cell,
+                                      int16_t *cell_state,
+                                      const int32_t cell_state_scale,
+                                      const int16_t *output_gate,
+                                      const cmsis_nn_scaling hidden_scale,
+                                      const int32_t hidden_offset,
+                                      int8_t *output_state,
+                                      int16_t *cell_gate_scratch);
+
+/**
+ * @brief Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch dimension composed of
+ * input vectors independent from each other) and accumulates the result into the passed result buffer.
+ *
+ * @param[in]   lhs_in           Batched vector
+ * @param[in]   rhs_in           Weights - input matrix (H(Rows)xW(Columns))
+ * @param[in]   bias             Bias vector
+ * @param[out]  dst              Output
+ * @param[in]   dst_offset       Output offset
+ * @param[in]   dst_multiplier   Multiplier for quantization
+ * @param[in]   dst_shift        Shift for quantization
+ * @param[in]   rhs_cols         Vector/matrix column length
+ * @param[in]   rhs_rows         Row count of matrix
+ * @param[in]   batch            Batch size
+ */
+void arm_nn_vec_mat_mul_result_acc_s8(const int8_t *lhs_in,
+                                      const int8_t *rhs_in,
+                                      const int32_t *bias,
+                                      int16_t *dst,
+                                      const int32_t dst_offset,
+                                      const int32_t dst_multiplier,
+                                      const int32_t dst_shift,
+                                      const int32_t rhs_cols,
+                                      const int32_t rhs_rows,
+                                      const int32_t batch);
+
+/**
+ * @brief s16 elementwise multiplication with s8 output
+ * @param[in]       input_1_vect    pointer to input vector 1
+ * @param[in]       input_2_vect    pointer to input vector 2
+ * @param[in,out]   output          pointer to output vector
+ * @param[in]       out_offset      output offset
+ * @param[in]       out_mult        output multiplier
+ * @param[in]       out_shift       output shift
+ * @param[in]       block_size      number of samples
+ * @return          The function returns ARM_CMSIS_NN_SUCCESS
+ *
+ * @details   Supported framework: TensorFlow Lite micro
+ */
+arm_cmsis_nn_status arm_elementwise_mul_s16_s8(const int16_t *input_1_vect,
+                                               const int16_t *input_2_vect,
+                                               int8_t *output,
+                                               const int32_t out_offset,
+                                               const int32_t out_mult,
+                                               const int32_t out_shift,
+                                               const int32_t block_size);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_nn_activation_s16.c b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_nn_activation_s16.c
new file mode 100644
index 00000000..51b736f6
--- /dev/null
+++ b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_nn_activation_s16.c
@@ -0,0 +1,119 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2020, 2022 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_activation_s16.c
+ * Description:  s16 neural network activation function using direct table look-up
+ *
+ * $Date:        8 September 2022
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "third_party/cmsis_nn/Include/arm_nn_tables.h"
+#include "third_party/cmsis_nn/Include/arm_nnfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Acti
+ * @{
+ */
+
+/*
+ * @brief Neural network activation function using direct table look-up
+ *
+ * @note Refer header file for details.
+ *
+ */
+
+void arm_nn_activation_s16(const int16_t *input,
+                           int16_t *output,
+                           const uint16_t size,
+                           const uint16_t left_shift,
+                           const arm_nn_activation_type type)
+{
+    uint32_t abs_input_shift, max_saturation;
+    switch (type)
+    {
+    case ARM_SIGMOID:
+        abs_input_shift = 9;
+        max_saturation = 0x7FFF << 10;
+        break;
+    case ARM_TANH:
+    default:
+        abs_input_shift = 8;
+        max_saturation = 0xFFFF << 8;
+        break;
+    }
+
+    // Use the LUT for sigmoid and take into account that
+    // tanh(x) = 2*sigmoid(2*x) - 1
+    int32_t input_multiplier = ((int32_t)3) << left_shift;
+
+    for (int i = 0; i < size; ++i, input++, output++)
+    {
+        int32_t input_data = ((*input) * input_multiplier);
+
+        uint32_t abs_input_data = input_data > 0 ? input_data : -input_data;
+
+        uint32_t uh = abs_input_data >> abs_input_shift;
+
+        uint32_t result;
+
+        if (uh >= 255)
+        {
+            result = max_saturation;
+        }
+        else
+        {
+            uint32_t ua = sigmoid_table_uint16[uh];
+            uint32_t ub = sigmoid_table_uint16[uh + 1];
+            uint32_t ut;
+            if (type == ARM_SIGMOID)
+            {
+                ut = abs_input_data & 0x1ff;
+            }
+            else
+            {
+                ut = abs_input_data & 0x0ff;
+            }
+            result = (ua << abs_input_shift) + ut * (ub - ua);
+        }
+        if (type == ARM_SIGMOID)
+        {
+            result = (input_data >= 0) ? (result + (1 << 9)) : ((1 << 25) - result + (1 << 9) - 1);
+            result >>= 10;
+        }
+        else
+        {
+            result = (input_data >= 0) ? (result - (1 << 23)) + (1 << 7) : ((-result + (1 << 23)) + (1 << 7) - 1);
+            result >>= 8;
+        }
+        *output = (int16_t)result;
+    }
+}
+
+/**
+ * @} end of Acti group
+ */
diff --git a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu6_s8.c b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu6_s8.c
index 3d66927f..a9ecb127 100644
--- a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu6_s8.c
+++ b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu6_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
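/*
 * Editor's note (not part of the patch): a minimal call sketch for the new
 * LUT-based s16 activation above. The buffer contents and the left_shift
 * value are illustrative assumptions; left_shift rescales the input
 * Q-format before the table lookup, and ARM_SIGMOID/ARM_TANH select the
 * output mapping.
 */
static void tanh_s16_example(void)
{
    int16_t act_in[4] = {-16384, -512, 512, 16384};
    int16_t act_out[4];
    arm_nn_activation_s16(act_in, act_out, 4, 3, ARM_TANH);
}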
+ * SPDX-FileCopyrightText: Copyright 2010-2019, 2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_relu6_s8.c * Description: Basic s8 version of ReLU6 * - * $Date: 09. October 2020 - * $Revision: V.1.0.1 + * $Date: 26 October 2022 + * $Revision: V.1.0.2 * * Target Processor: Cortex-M cores * @@ -47,7 +47,7 @@ * */ -void arm_relu6_s8(q7_t *data, uint16_t size) +void arm_relu6_s8(int8_t *data, uint16_t size) { int32_t i; diff --git a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q15.c b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q15.c index cede0e9c..d079167b 100644 --- a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q15.c +++ b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q15.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_relu_q15.c * Description: Q15 version of ReLU * - * $Date: 4 Aug 2022 - * $Revision: V.1.0.3 + * $Date: 31 January 2023 + * $Revision: V.1.1.1 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -47,28 +47,28 @@ * */ -void arm_relu_q15(q15_t *data, uint16_t size) +void arm_relu_q15(int16_t *data, uint16_t size) { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* Run the following code for M cores with DSP extension */ uint16_t i = size >> 1; - q15_t *input = data; - q15_t *output = data; - q31_t in; - q31_t buf; - q31_t mask; + int16_t *input = data; + int16_t *output = data; + int32_t in; + int32_t buf; + int32_t mask; while (i) { - in = arm_nn_read_q15x2_ia((const q15_t **)&input); + in = arm_nn_read_q15x2_ia((const int16_t **)&input); /* extract the first bit */ - buf = __ROR(in & 0x80008000, 15); + buf = ROR(in & 0x80008000, 15); /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ - mask = __QSUB16(0x00000000, buf); + mask = QSUB16(0x00000000, buf); arm_nn_write_q15x2_ia(&output, in & (~mask)); i--; diff --git a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q7.c b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q7.c index 7c7a187f..58d22848 100644 --- a/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q7.c +++ b/src/third_party/cmsis_nn/Source/ActivationFunctions/arm_relu_q7.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_relu_q7.c * Description: Q7 version of ReLU * - * $Date: 4 Aug 2022 - * $Revision: V.1.1.4 + * $Date: 31 January 2023 + * $Revision: V.1.2.1 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -47,30 +47,30 @@ * */ -void arm_relu_q7(q7_t *data, uint16_t size) +void arm_relu_q7(int8_t *data, uint16_t size) { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* Run the following code for M cores with DSP extension */ uint16_t i = size >> 2; - q7_t *input = data; - q7_t *output = data; - q31_t in; - q31_t buf; - q31_t mask; + int8_t *input = data; + int8_t *output = data; + int32_t in; + int32_t buf; + int32_t mask; while (i) { - in = 
arm_nn_read_q7x4_ia((const q7_t **)&input); + in = arm_nn_read_s8x4_ia((const int8_t **)&input); /* extract the first bit */ - buf = (int32_t)__ROR((uint32_t)in & 0x80808080, 7); + buf = (int32_t)ROR((uint32_t)in & 0x80808080, 7); /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ - mask = __QSUB8(0x00000000, buf); + mask = QSUB8(0x00000000, buf); - arm_nn_write_q7x4_ia(&output, in & (~mask)); + arm_nn_write_s8x4_ia(&output, in & (~mask)); i--; } diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s16.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s16.c index 54c88349..7e3dce05 100644 --- a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s16.c +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_elementwise_add_s16 * Description: Elementwise add * - * $Date: 10 May 2022 - * $Revision: V.2.1.0 + * $Date: 24 Oct 2022 + * $Revision: V.2.2.0 * * Target Processor: Cortex-M CPUs * @@ -69,13 +69,47 @@ arm_cmsis_nn_status arm_elementwise_add_s16(const int16_t *input_1_vect, (void)input_1_offset; (void)input_2_offset; (void)out_offset; + +#if defined(ARM_MATH_MVEI) + + int32_t count = block_size; + + while (count > 0) + { + + mve_pred16_t pred = vctp32q(count); + + int32x4_t vect_1 = vldrhq_z_s32(input_1_vect, pred); + int32x4_t vect_2 = vldrhq_z_s32(input_2_vect, pred); + + vect_1 = vshlq_r_s32(vect_1, left_shift); + vect_2 = vshlq_r_s32(vect_2, left_shift); + + vect_1 = arm_requantize_mve(vect_1, input_1_mult, input_1_shift); + vect_2 = arm_requantize_mve(vect_2, input_2_mult, input_2_shift); + + vect_1 = vaddq_s32(vect_1, vect_2); + vect_1 = arm_requantize_mve(vect_1, out_mult, out_shift); + + vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min)); + vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max)); + + input_1_vect += 4; + input_2_vect += 4; + + vstrhq_p_s32(output, vect_1, pred); + + output += 4; + count -= 4; + } + +#else // #if defined(ARM_MATH_MVEI) int32_t input_1; int32_t input_2; int32_t sum; int32_t two_halfword_1, two_halfword_2; int16_t sum_1, sum_2; int32_t loop_count = block_size / 2; - while (loop_count > 0) { two_halfword_1 = arm_nn_read_q15x2_ia(&input_1_vect); @@ -127,10 +161,10 @@ arm_cmsis_nn_status arm_elementwise_add_s16(const int16_t *input_1_vect, /* Decrement loop counter */ loop_count--; } - +#endif // #if defined(ARM_MATH_MVEI) return (ARM_CMSIS_NN_SUCCESS); } /** * @} end of Doxygen group - */ + */ \ No newline at end of file diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s8.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s8.c index be222c12..e2d895b9 100644 --- a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s8.c +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_add_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. 
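/*
 * Editor's note (not part of the patch): the per-element math that the MVE
 * path added to arm_elementwise_add_s16 above implements, written as plain
 * scalar C. Each operand is left-shifted into extra headroom, rescaled to a
 * common Q-format, summed, requantized to the output scale and clamped.
 * The helper name is an illustrative assumption.
 */
static int16_t elementwise_add_ref(int16_t in1, int16_t in2,
                                   int32_t left_shift,
                                   int32_t mult1, int32_t shift1,
                                   int32_t mult2, int32_t shift2,
                                   int32_t out_mult, int32_t out_shift,
                                   int32_t act_min, int32_t act_max)
{
    int32_t a = arm_nn_requantize((int32_t)in1 << left_shift, mult1, shift1);
    int32_t b = arm_nn_requantize((int32_t)in2 << left_shift, mult2, shift2);
    int32_t sum = arm_nn_requantize(a + b, out_mult, out_shift);
    sum = MAX(sum, act_min);
    sum = MIN(sum, act_max);
    return (int16_t)sum;
}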
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_elementwise_add_s8 * Description: Elementwise add * - * $Date: 19 April 2022 - * $Revision: V.3.0.0 + * $Date: 5 January 2023 + * $Revision: V.3.1.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -109,7 +109,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, int32_t input_2; int32_t sum; -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) int32_t a_1, b_1, a_2, b_2; int32_t offset_1_packed, offset_2_packed; @@ -128,11 +128,11 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1); input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2); - a_1 = __SADD16(a_1, offset_1_packed); - b_1 = __SADD16(b_1, offset_1_packed); + a_1 = SADD16(a_1, offset_1_packed); + b_1 = SADD16(b_1, offset_1_packed); - a_2 = __SADD16(a_2, offset_2_packed); - b_2 = __SADD16(b_2, offset_2_packed); + a_2 = SADD16(a_2, offset_2_packed); + b_2 = SADD16(b_2, offset_2_packed); /* Sum 1 */ input_1 = (b_1 & 0x0FFFF) << left_shift; @@ -147,7 +147,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - r1 = (q7_t)sum; + r1 = (int8_t)sum; /* Sum 3 */ input_1 = ((b_1 >> 16) & 0x0FFFF) << left_shift; @@ -161,7 +161,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - r3 = (q7_t)sum; + r3 = (int8_t)sum; /* Sum 2 */ input_1 = (a_1 & 0x0FFFF) << left_shift; @@ -175,7 +175,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - r2 = (q7_t)sum; + r2 = (int8_t)sum; /* Sum 4 */ input_1 = ((a_1 >> 16) & 0x0FFFF) << left_shift; @@ -189,17 +189,17 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - r4 = (q7_t)sum; + r4 = (int8_t)sum; - arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4)); + arm_nn_write_s8x4_ia(&output, PACK_S8x4_32x1(r1, r2, r3, r4)); loop_count--; } loop_count = block_size & 0x3; -#else + #else loop_count = block_size; -#endif + #endif while (loop_count > 0) { @@ -218,7 +218,7 @@ arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - *output++ = (q7_t)sum; + *output++ = (int8_t)sum; /* Decrement loop counter */ loop_count--; diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16.c index 7a85d8a1..7315b9c3 100644 --- a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16.c +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_elementwise_mul_s16 * Description: Element wise multiplication * - * $Date: 10 May 2022 - * $Revision: V.2.1.0 + * $Date: 20 January 2023 + * $Revision: V.2.4.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -61,29 +61,66 @@ arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect, (void)input_1_offset; (void)input_2_offset; (void)out_offset; + int32_t loop_count; + +#if defined(ARM_MATH_MVEI) + + loop_count = block_size; + + while (loop_count > 0) + { + mve_pred16_t pred = vctp32q(loop_count); + + int32x4_t input_1 = vldrhq_z_s32(input_1_vect, pred); + int32x4_t input_2 = vldrhq_z_s32(input_2_vect, pred); + + int32x4_t res_0 = vmulq_s32(input_1, input_2); + + res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift)); + + res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min)); + res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max)); + + vstrhq_p_s32(output, res_0, pred); + input_1_vect += 4; + input_2_vect += 4; + + output += 4; + loop_count -= 4; + } + +#else int32_t input_1; int32_t input_2; int32_t mul_res; int32_t two_halfword_1, two_halfword_2; int16_t mul_1, mul_2; - int32_t loop_count = block_size / 2; + loop_count = block_size / 2; while (loop_count > 0) { two_halfword_1 = arm_nn_read_q15x2_ia(&input_1_vect); two_halfword_2 = arm_nn_read_q15x2_ia(&input_2_vect); + #if defined(ARM_MATH_DSP) + mul_res = SMULBB(two_halfword_1, two_halfword_2); + #else input_1 = (int16_t)(two_halfword_1 & 0xFFFF); input_2 = (int16_t)(two_halfword_2 & 0xFFFF); mul_res = input_1 * input_2; + #endif mul_res = arm_nn_requantize(mul_res, out_mult, out_shift); mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); mul_1 = (int16_t)mul_res; + #if defined(ARM_MATH_DSP) + mul_res = SMULTT(two_halfword_1, two_halfword_2); + #else input_1 = (int16_t)(two_halfword_1 >> 16); input_2 = (int16_t)(two_halfword_2 >> 16); mul_res = input_1 * input_2; + #endif mul_res = arm_nn_requantize(mul_res, out_mult, out_shift); mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); @@ -113,7 +150,7 @@ arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect, /* Decrement loop counter */ loop_count--; } - +#endif // #if defined(ARM_MATH_MVEI) return ARM_CMSIS_NN_SUCCESS; } diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16_s8.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16_s8.c new file mode 100644 index 00000000..16296849 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s16_s8.c @@ -0,0 +1,122 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_elementwise_mul_s16_s8.c + * Description: Elementwise multiplication of 16 bit input with 8 bit output + * + * $Date: 20 January 2023 + * $Revision: V.1.2.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup BasicMath + * @{ + */ + +/* + * s16 elementwise multiplication with s8 output + * + * Refer header file for details. + * + */ +arm_cmsis_nn_status arm_elementwise_mul_s16_s8(const int16_t *input_1_vect, + const int16_t *input_2_vect, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t block_size) +{ + int32_t loop_count = block_size; + +#if defined(ARM_MATH_MVEI) + + while (loop_count > 0) + { + mve_pred16_t pred = vctp32q(loop_count); + + int32x4_t input_1 = vldrhq_z_s32(input_1_vect, pred); + int32x4_t input_2 = vldrhq_z_s32(input_2_vect, pred); + + int32x4_t res_0 = vmulq_s32(input_1, input_2); + + res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift)); + res_0 = vaddq_n_s32(res_0, out_offset); + + res_0 = vmaxq_s32(res_0, vdupq_n_s32(NN_Q7_MIN)); + res_0 = vminq_s32(res_0, vdupq_n_s32(NN_Q7_MAX)); + + vstrbq_p_s32(output, res_0, pred); + input_1_vect += 4; + input_2_vect += 4; + + output += 4; + loop_count -= 4; + } + +#else + #if defined(ARM_MATH_DSP) + + while (loop_count > 1) + { + int32_t input_1 = arm_nn_read_q15x2_ia(&input_1_vect); + int32_t input_2 = arm_nn_read_q15x2_ia(&input_2_vect); + + int32_t mul_res = SMULBB(input_1, input_2); + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + mul_res = CLAMP(mul_res, NN_Q7_MAX, NN_Q7_MIN); + int32_t mul = (int16_t)(mul_res & 0xFF); + + mul_res = SMULTT(input_1, input_2); + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + mul_res = CLAMP(mul_res, NN_Q7_MAX, NN_Q7_MIN); + mul |= (int16_t)mul_res << 8; + + arm_nn_write_s8x2_ia(&output, mul); + loop_count -= 2; + } + #endif + for (int i = 0; i < loop_count; i++) + { + /* C = A * B */ + int32_t mul_res = input_1_vect[i] * input_2_vect[i]; + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + + mul_res = CLAMP(mul_res, NN_Q7_MAX, NN_Q7_MIN); + + output[i] = (int8_t)mul_res; + } + +#endif + + return ARM_CMSIS_NN_SUCCESS; +} +/** + * @} end of BasicMath group + */ diff --git a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s8.c b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s8.c index 42a53997..484f214e 100644 --- a/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s8.c +++ b/src/third_party/cmsis_nn/Source/BasicMathFunctions/arm_elementwise_mul_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_elementwise_mul_s8 * Description: Element wise multiplication * - * $Date: 4 Aug 2022 - * $Revision: V.2.0.1 + * $Date: 20 January 2023 + * $Revision: V.2.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile 
Architecture * * -------------------------------------------------------------------- */ @@ -97,7 +97,7 @@ arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect, int32_t input_2; int32_t mul_res; -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) int32_t a_1, b_1, a_2, b_2; int32_t offset_1_packed, offset_2_packed; @@ -116,62 +116,50 @@ arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect, input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1); input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2); - a_1 = __SADD16(a_1, offset_1_packed); - b_1 = __SADD16(b_1, offset_1_packed); + a_1 = SADD16(a_1, offset_1_packed); + b_1 = SADD16(b_1, offset_1_packed); - a_2 = __SADD16(a_2, offset_2_packed); - b_2 = __SADD16(b_2, offset_2_packed); + a_2 = SADD16(a_2, offset_2_packed); + b_2 = SADD16(b_2, offset_2_packed); /* Mul 1 */ - input_1 = (int16_t)(b_1 & 0x0FFFFL); - input_2 = (int16_t)(b_2 & 0x0FFFFL); - - mul_res = input_1 * input_2; + mul_res = SMULBB(b_1, b_2); mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - r1 = (q7_t)mul_res; + r1 = (int8_t)mul_res; /* Mul 3 */ - input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL); - input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL); - - mul_res = input_1 * input_2; + mul_res = SMULTT(b_1, b_2); mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - r3 = (q7_t)mul_res; + r3 = (int8_t)mul_res; /* Mul 2 */ - input_1 = (int16_t)(a_1 & 0x0FFFFL); - input_2 = (int16_t)(a_2 & 0x0FFFFL); - - mul_res = input_1 * input_2; + mul_res = SMULBB(a_1, a_2); mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - r2 = (q7_t)mul_res; + r2 = (int8_t)mul_res; /* Mul 4 */ - input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL); - input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL); - - mul_res = input_1 * input_2; + mul_res = SMULTT(a_1, a_2); mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - r4 = (q7_t)mul_res; + r4 = (int8_t)mul_res; - arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4)); + arm_nn_write_s8x4_ia(&output, PACK_S8x4_32x1(r1, r2, r3, r4)); loop_count--; } loop_count = block_size & 0x3; -#else + #else loop_count = block_size; -#endif + #endif while (loop_count > 0) { @@ -186,7 +174,7 @@ arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect, mul_res = MAX(mul_res, out_activation_min); mul_res = MIN(mul_res, out_activation_max); - *output++ = (q7_t)mul_res; + *output++ = (int8_t)mul_res; /* Decrement loop counter */ loop_count--; diff --git a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_w.c b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_w.c index ca1520d7..9ea9e02b 100644 --- a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_w.c +++ b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_w.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. 
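/*
 * Editor's note (not part of the patch): portable models of the SMULBB and
 * SMULTT intrinsics the rewrite above leans on. After read_and_pad_reordered()
 * each 32-bit register holds two s16 lanes; SMULBB multiplies the two bottom
 * lanes and SMULTT the two top lanes, replacing the manual mask-and-shift
 * code that was removed. Helper names are illustrative assumptions.
 */
static int32_t smulbb_ref(int32_t x, int32_t y)
{
    return (int32_t)(int16_t)x * (int16_t)y; /* bottom lane x bottom lane */
}

static int32_t smultt_ref(int32_t x, int32_t y)
{
    return (int32_t)(int16_t)(x >> 16) * (int16_t)(y >> 16); /* top lane x top lane */
}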
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_concatenation_s8_w.c * Description: s8 version of concatenation along the W axis * - * $Date: October 2019 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.1 * * Target Processor: Cortex-M cores * @@ -58,7 +58,7 @@ void arm_concatenation_s8_w(const int8_t *input, output += offset_w * (input_x * input_y * input_z); - arm_memcpy_q7(output, input, input_copy_size); + arm_memcpy_s8(output, input, input_copy_size); } /** diff --git a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_x.c b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_x.c index b316a73d..d02be297 100644 --- a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_x.c +++ b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_x.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_concatenation_s8_x.c * Description: s8 version of concatenation along the X axis * - * $Date: October 2019 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.2 * * Target Processor: Cortex-M cores * @@ -64,7 +64,7 @@ void arm_concatenation_s8_x(const int8_t *input, // Copy per row for (i = 0; i < num_iterations; ++i) { - arm_memcpy_q7(output, input, input_x); + arm_memcpy_s8(output, input, input_x); input += input_x; output += output_x; } diff --git a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_y.c b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_y.c index fa953003..78131fd1 100644 --- a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_y.c +++ b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_y.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_concatenation_s8_y.c * Description: s8 version of concatenation along the Y axis * - * $Date: October 2019 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.1 * * Target Processor: Cortex-M cores * @@ -65,7 +65,7 @@ void arm_concatenation_s8_y(const int8_t *input, // Copy per tile for (i = 0; i < num_iterations; ++i) { - arm_memcpy_q7(output, input, input_copy_size); + arm_memcpy_s8(output, input, input_copy_size); input += input_copy_size; output += output_stride; } diff --git a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_z.c b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_z.c index a13d5fbd..b742c3dd 100644 --- a/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_z.c +++ b/src/third_party/cmsis_nn/Source/ConcatenationFunctions/arm_concatenation_s8_z.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_concatenation_s8_z.c * Description: s8 version of concatenation along the Z axis * - * $Date: October 2019 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.1 * * Target Processor: Cortex-M cores * @@ -64,7 +64,7 @@ void arm_concatenation_s8_z(const int8_t *input, for (i = 0; i < input_w; ++i) { - arm_memcpy_q7(output, input, input_copy_size); + arm_memcpy_s8(output, input, input_copy_size); input += input_copy_size; output += output_stride; } diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c index d17513a7..2e030ce5 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_convolve_1_x_n_s8.c * Description: s8 version of 1xN convolution using symmetric quantization. * - * $Date: 20 June 2022 - * $Revision: V.3.1.0 + * $Date: 30 January 2023 + * $Revision: V.3.3.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -51,13 +51,13 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { (void)bias_dims; arm_cmsis_nn_status status = ARM_CMSIS_NN_SUCCESS; @@ -101,15 +101,15 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, for (int i = 0; i < 4; i++) { const int32_t actual_kernel_len = ker_end_idx[i] - ker_begin_idx[i]; - arm_nn_mat_mul_core_1x_s8(actual_kernel_len * input_ch, - (kernel_x - actual_kernel_len) * input_ch, - input_data + input_begin_idx[i] * input_ch, - filter_data + (ker_begin_idx[i] * input_ch), - output_ch, - conv_params, - quant_params, - bias_data, - output_data); + status = arm_nn_mat_mul_core_1x_s8(actual_kernel_len * input_ch, + (kernel_x - actual_kernel_len) * input_ch, + input_data + input_begin_idx[i] * input_ch, + filter_data + (ker_begin_idx[i] * input_ch), + output_ch, + conv_params, + quant_params, + bias_data, + output_data); output_data += output_ch; } } @@ -125,7 +125,13 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, bias_data, output_data); } + + if (status != ARM_CMSIS_NN_SUCCESS || output_data == NULL) + { + return ARM_CMSIS_NN_NO_IMPL_ERROR; + } } + /* Advance to the next batch */ input_data += (input_x * input_ch); } @@ -149,17 +155,6 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, return status; } -int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if !defined(ARM_MATH_MVEI) - return arm_convolve_s8_get_buffer_size(input_dims, 
filter_dims); -#else - (void)input_dims; - (void)filter_dims; - return 0; -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8.c new file mode 100644 index 00000000..a31e23c3 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8.c @@ -0,0 +1,115 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_1x1_s8.c + * Description: Generic s8 version of 1x1 convolution + * + * $Date: 20 January 2023 + * $Revision: V.1.0.1 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup Public + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * A more generic version of s8 1x1 convolution intended for non-unity strides. This is slower + * than the _fast() version if used for unity stride values. + * + * Refer header file for details. 
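/*
 * Editor's note (not part of the patch): how the new arm_convolve_1x1_s8
 * below maps strided 1x1 convolution onto arm_nn_mat_mult_nt_t_s8. With
 * illustrative dims (16x16x8 input, stride 2, 8x8x4 output), each call
 * handles one output row, and the trailing rhs_cols * stride_w argument
 * tells the matmul how far apart consecutive input pixels are, which is
 * how the stride is realized without an im2col copy.
 */
static void conv_1x1_shapes_example(void)
{
    const cmsis_nn_dims in_d = {.n = 1, .h = 16, .w = 16, .c = 8};
    const cmsis_nn_dims out_d = {.n = 1, .h = 8, .w = 8, .c = 4};
    const int32_t lhs_rows = out_d.w;        /* 8 output pixels per call          */
    const int32_t rhs_cols = in_d.c;         /* 8 input channels per pixel        */
    const int32_t row_offset = rhs_cols * 2; /* stride_w = 2: skip every other pixel */
    (void)lhs_rows;
    (void)row_offset;
}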
+ * + */ +arm_cmsis_nn_status arm_convolve_1x1_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const int8_t *input_data, + const cmsis_nn_dims *filter_dims, + const int8_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + int8_t *output_data) +{ + (void)ctx; + (void)filter_dims; + (void)bias_dims; + if (conv_params->padding.w != 0 || conv_params->padding.h != 0) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + const int32_t lhs_rows = output_dims->w; + const int32_t rhs_rows = output_dims->c; + const int32_t rhs_cols = input_dims->c; + const int32_t stride_w = conv_params->stride.w; + const int32_t input_inc = input_dims->w * conv_params->stride.h * rhs_cols; + const int32_t output_inc = output_dims->w * rhs_rows; + const int32_t output_h = output_dims->h; + const int32_t batch = input_dims->n; + const int8_t *input_data_ref = input_data; + + for (int i_batch = 0; i_batch < batch; i_batch++) + { + input_data = input_data_ref + (i_batch * rhs_cols * input_dims->w * input_dims->h); + for (int i_output_h = 0; i_output_h < output_h; i_output_h++) + { + // Process one input row + arm_cmsis_nn_status result = arm_nn_mat_mult_nt_t_s8(input_data, + filter_data, + bias_data, + output_data, + quant_params->multiplier, + quant_params->shift, + lhs_rows, + rhs_rows, + rhs_cols, + conv_params->input_offset, + conv_params->output_offset, + conv_params->activation.min, + conv_params->activation.max, + rhs_cols * stride_w); + if (result != ARM_CMSIS_NN_SUCCESS) + { + return result; + } + input_data += input_inc; + output_data += output_inc; + } + } + + /* Return to application */ + return ARM_CMSIS_NN_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c index 741051d7..c6c6cad0 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -19,18 +19,17 @@ /* ---------------------------------------------------------------------- * Project: CMSIS NN Library * Title: arm_convolve_1x1_s8_fast.c - * Description: Fast q7 version of 1x1 convolution (non-square shape) + * Description: Fast s8 version of 1x1 convolution (non-square shape) * - * $Date: 20 june 2022 - * $Revision: V.3.0.1 + * $Date: 30 January 2023 + * $Revision: V.3.1.0 * - * Target Processor: Cortex-M Processors + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" -#include /** * @ingroup Public @@ -52,13 +51,13 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims 
*bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { if (conv_params->padding.w != 0 || conv_params->padding.h != 0 || conv_params->stride.w != 1 || conv_params->stride.h != 1) @@ -70,43 +69,6 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, (void)filter_dims; (void)bias_dims; -#if defined(ARM_MATH_MVEI) - - const int32_t col_len = input_dims->w * input_dims->h * input_dims->n; - const int32_t output_ch = output_dims->c; - const int32_t input_ch = input_dims->c; - - for (int i_items = 0; i_items <= (col_len - 4); i_items += 4) - { - output_data = arm_nn_mat_mul_core_4x_s8(input_ch, - input_ch, - input_data + i_items * input_ch, - filter_data, - output_ch, - conv_params, - quant_params, - bias_data, - output_data); - } - - /* Handle left over elements */ - for (int i_items = (col_len & ~0x3); i_items < col_len; i_items++) - { - arm_nn_mat_mul_core_1x_s8(input_ch, - 0, - input_data + i_items * input_ch, - filter_data, - output_ch, - conv_params, - quant_params, - bias_data, - output_data); - output_data += output_ch; - } - -#else - /* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */ - const int32_t lhs_rows = input_dims->w * input_dims->h * input_dims->n; const int32_t rhs_rows = output_dims->c; const int32_t rhs_cols = input_dims->c; @@ -123,20 +85,13 @@ arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, conv_params->input_offset, conv_params->output_offset, conv_params->activation.min, - conv_params->activation.max); - -#endif + conv_params->activation.max, + rhs_cols); /* Return to application */ return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims) -{ - (void)input_dims; - return 0; -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_fast_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_fast_s16.c index 13703f02..7819f720 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_fast_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_fast_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_convolve_fast_s16.c * Description: Optimized s16 version of convolution. 
* - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 30 January 2023 + * $Revision: V.2.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data) + int16_t *output_data) { (void)bias_dims; if (filter_dims->w * filter_dims->h * input_dims->c >= 512) @@ -70,7 +70,7 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q15_t *buffer_a = (q15_t *)ctx->buf; + int16_t *buffer_a = (int16_t *)ctx->buf; const int32_t input_batches = input_dims->n; const int32_t input_x = input_dims->w; @@ -96,8 +96,8 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* Generate two columns from the input tensor a GEMM computation */ - q15_t *two_column_buf = buffer_a; - q15_t *out = output_data; + int16_t *two_column_buf = buffer_a; + int16_t *out = output_data; /* This part implements the im2col function */ for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++) { @@ -112,13 +112,13 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) { /* Filling 0 for out-of-bound paddings */ - arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch); + arm_memset_s8((int8_t *)two_column_buf, 0, sizeof(int16_t) * input_ch); } else { - arm_memcpy_q7((q7_t *)two_column_buf, - (const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch), - input_ch * sizeof(q15_t)); + arm_memcpy_s8((int8_t *)two_column_buf, + (const int8_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch), + input_ch * sizeof(int16_t)); } two_column_buf += input_ch; } @@ -146,31 +146,31 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, /* Left-over because odd number of output pixels */ if (two_column_buf != buffer_a) { - const q7_t *ker_a = filter_data; + const int8_t *ker_a = filter_data; int i; for (i = 0; i < output_ch; i++) { /* Init the accumulator*/ - q31_t sum = 0; + int32_t sum = 0; /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */ - const q15_t *ip_as_col = buffer_a; + const int16_t *ip_as_col = buffer_a; /* 4 multiply and accumulates are done in one loop. 
*/ uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2; while (col_count) { - q31_t ker_a1, ker_a2; - q31_t ip_b1, ip_b2; + int32_t ker_a1, ker_a2; + int32_t ip_b1, ip_b2; ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2); ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col); - sum = __SMLAD(ker_a1, ip_b1, sum); + sum = SMLAD(ker_a1, ip_b1, sum); ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col); - sum = __SMLAD(ker_a2, ip_b2, sum); + sum = SMLAD(ker_a2, ip_b2, sum); col_count--; } @@ -178,15 +178,15 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, col_count = input_ch * kernel_y * kernel_x & 0x3; while (col_count) { - q7_t ker_a1 = *ker_a++; - q15_t ip_b1 = *ip_as_col++; + int8_t ker_a1 = *ker_a++; + int16_t ip_b1 = *ip_as_col++; sum += ker_a1 * ip_b1; col_count--; } if (bias_data) { - q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]); - q63_t acc_64 = sum + bias_data[i]; + int32_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]); + int64_t acc_64 = sum + bias_data[i]; sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]); } else @@ -195,7 +195,7 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, } sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - *out++ = (q15_t)sum; + *out++ = (int16_t)sum; } } #else @@ -225,17 +225,6 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); -#else - (void)input_dims; - (void)filter_dims; - return 0; -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s16.c new file mode 100644 index 00000000..ee286a36 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s16.c @@ -0,0 +1,120 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_get_buffer_sizes_s16.c + * Description: Collection of get buffer size functions for the various s16 convolution layer functions. 
+ * + * $Date: 30 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h" +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup NNConv + */ + +/** + * @addtogroup GetBufferSizeNNConv + * @{ + */ + +__STATIC_INLINE int32_t arm_convolve_fast_s16_get_buffer_size_dsp(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); +} + +int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return arm_convolve_fast_s16_get_buffer_size_dsp(input_dims, filter_dims); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ + (void)input_dims; + (void)filter_dims; + return 0; +} + +/* + * Get the required buffer size for arm_convolve_wrapper_s16. This is the recommended function convolve wrapper s16 + * function. + * + * Refer to header file for details. + * + */ +int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return arm_convolve_wrapper_s16_get_buffer_size_dsp(conv_params, input_dims, filter_dims, output_dims); +#else + (void)conv_params; + (void)output_dims; + + // MVE and scalar implementation have same buffer requirements + return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); +#endif +} + +int32_t arm_convolve_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)output_dims; + + if (filter_dims->w * filter_dims->h * input_dims->c < 512 && + (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_fast_s16_get_buffer_size_dsp(input_dims, filter_dims); + } + else + { + + return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); + } +} + +int32_t arm_convolve_wrapper_s16_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + return arm_convolve_wrapper_s16_get_buffer_size(conv_params, input_dims, filter_dims, output_dims); +} + +/** + * @} end of GetBufferSizeNNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c new file mode 100644 index 00000000..330622ac --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c @@ -0,0 +1,164 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_get_buffer_sizes_s8.c + * Description: Collection of get buffer size functions for the various s8 convolution layer functions. + * + * $Date: 31 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/Internal/arm_nn_compiler.h" +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup NNConv + */ + +/** + * @addtogroup GetBufferSizeNNConv + * @{ + */ + +__STATIC_INLINE int32_t arm_convolve_s8_get_buffer_size_mve(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + int32_t col_length = input_dims->c * filter_dims->w * filter_dims->h; + // Get number of complete int16 lanes(multiple of 8) for given col_length. This is dependent on + // implementation of arm_nn_mat_mult_s8 + col_length = (col_length + 7) / 8; + // 4 -> number of im2col buffers, 8 -> 8 elements per Q register + return 4 * col_length * 8 * (int32_t)sizeof(int8_t); +} + +__STATIC_INLINE int32_t arm_convolve_1_x_n_s8_get_buffer_size_mve(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + (void)input_dims; + (void)filter_dims; + return 0; +} + +int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_MVEI) + return arm_convolve_s8_get_buffer_size_mve(input_dims, filter_dims); +#else + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); +#endif +} + +int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if !defined(ARM_MATH_MVEI) + return arm_convolve_s8_get_buffer_size(input_dims, filter_dims); +#else + return arm_convolve_1_x_n_s8_get_buffer_size_mve(input_dims, filter_dims); +#endif +} + +int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims) +{ + (void)input_dims; + return 0; +} + +/* + * Get the required buffer size for arm_convolve_wrapper_s8. This is the recommended function convolve wrapper s8 + * function. + * + * Refer to header file for details. 
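
The MVE sizing earlier in this file rounds the im2col column length up to whole 8-lane groups before multiplying out the four buffers; a standalone restatement of the same arithmetic (hypothetical helper name, not CMSIS-NN API):

#include <stdint.h>

/* Four im2col buffers, each padded to a multiple of 8 int8 lanes. */
static int32_t mve_s8_scratch_bytes(int32_t ch, int32_t kw, int32_t kh)
{
    int32_t col_length = ch * kw * kh;
    col_length = (col_length + 7) / 8; /* whole 8-lane groups */
    return 4 * col_length * 8 * (int32_t)sizeof(int8_t);
}

The rounding keeps every vector load in arm_nn_mat_mult_s8 inside the allocation even when the true column length is not a multiple of the lane count.
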
+ * + */ +int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ +#if defined(ARM_MATH_MVEI) + return arm_convolve_wrapper_s8_get_buffer_size_mve(conv_params, input_dims, filter_dims, output_dims); +#else + + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + if ((conv_params->stride.w == 1) && (conv_params->stride.h == 1)) + { + return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims); + } + else + { + return 0; + } + } + else if ((input_dims->h == 1) && (output_dims->w % 4 == 0) && (conv_params->dilation.w == 1) && + (filter_dims->h == 1)) + { + return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims); + } + else + { + return arm_convolve_s8_get_buffer_size(input_dims, filter_dims); + } +#endif +} + +int32_t arm_convolve_wrapper_s8_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + if ((conv_params->stride.w == 1) && (conv_params->stride.h == 1)) + { + return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims); + } + else + { + return 0; + } + } + else if ((input_dims->h == 1) && (output_dims->w % 4 == 0) && (conv_params->dilation.w == 1) && + (filter_dims->h == 1)) + { + return arm_convolve_1_x_n_s8_get_buffer_size_mve(input_dims, filter_dims); + } + else + { + return arm_convolve_s8_get_buffer_size_mve(input_dims, filter_dims); + } +} + +int32_t arm_convolve_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + return arm_convolve_wrapper_s8_get_buffer_size(conv_params, input_dims, filter_dims, output_dims); +} + +/** + * @} end of GetBufferSizeNNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s16.c index 2a895657..395a3561 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_convolve_s16.c * Description: s16 version of convolution using symmetric quantization. 
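
The s8 wrapper buffer-size dispatch above mirrors the kernel selection in arm_convolve_wrapper_s8; a condensed restatement of the rules (descriptive names only, not CMSIS-NN API):

#include <stdbool.h>

typedef enum { CONV_1X1_FAST, CONV_1X1, CONV_1_X_N, CONV_GENERIC } conv_path_t;

static conv_path_t pick_s8_path(int pad_w, int pad_h, int stride_w, int stride_h,
                                int dil_w, int dil_h, int filt_w, int filt_h,
                                int in_h, int out_w)
{
    bool pointwise = (pad_w == 0 && pad_h == 0 && filt_w == 1 && filt_h == 1 &&
                      dil_w == 1 && dil_h == 1);
    if (pointwise)
        return (stride_w == 1 && stride_h == 1) ? CONV_1X1_FAST : CONV_1X1;
    if (in_h == 1 && (out_w % 4) == 0 && dil_w == 1 && filt_h == 1)
        return CONV_1_X_N;
    return CONV_GENERIC;
}

Only the generic and 1x1-fast paths need scratch memory; the strided 1x1 and (on MVE) 1-x-n paths return a size of zero.
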
* - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 30 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data) + int16_t *output_data) { (void)bias_dims; (void)ctx; @@ -90,7 +90,7 @@ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ for (int32_t i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) { - const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i_out_ch]); + const int32_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i_out_ch]); for (int32_t base_idx_y = -pad_y, i_out_y = 0; i_out_y < output_y; base_idx_y += stride_y, i_out_y++) { @@ -144,13 +144,6 @@ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ - (void)input_dims; - (void)filter_dims; - return 0; -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s8.c index a35d0784..7c80bc00 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_convolve_s8.c * Description: s8 version of convolution using symmetric quantization. 
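
A rough plain-C model of the s16 requantization pair used in the reference loop above (an assumption based on the upstream helpers; arm_nnsupportfunctions.h holds the authoritative definitions):

#include <stdint.h>

/* Assumed shape of REDUCE_MULTIPLIER: fold the 32-bit fixed-point multiplier
 * down to 16 significant bits so the product with the 64-bit accumulator
 * cannot overflow. */
static int32_t reduce_multiplier_model(int32_t mult)
{
    return (mult < 0x7FFF0000) ? ((mult + (1 << 15)) >> 16) : 0x7FFF;
}

/* Assumed rounding shift-back, mirroring arm_nn_requantize_s64. */
static int32_t requantize_s64_model(int64_t acc, int32_t reduced_mult, int32_t shift)
{
    int32_t result = (int32_t)((acc * reduced_mult) >> (14 - shift));
    return (result + 1) >> 1; /* final shift with round-to-nearest */
}
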
* - * $Date: 19 April 2022 - * $Revision: V.3.0.0 + * $Date: 30 January 2023 + * $Revision: V.3.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { (void)bias_dims; @@ -66,7 +66,7 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q15_t *buffer_a = (q15_t *)ctx->buf; + int16_t *buffer_a = (int16_t *)ctx->buf; const int32_t input_batches = input_dims->n; const uint16_t input_x = input_dims->w; @@ -95,8 +95,8 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, { #if defined(ARM_MATH_MVEI) /* Generate upto four columns from the input tensor a GEMM computation */ - q7_t *im2col_buf = (q7_t *)buffer_a; - q7_t *out = output_data; + int8_t *im2col_buf = (int8_t *)buffer_a; + int8_t *out = output_data; int32_t buffer_fill_cnt = 0; int32_t padded = 0; const int32_t num_elem = kernel_x * kernel_y * input_ch; @@ -120,12 +120,12 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x) { - memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch); + memset(im2col_buf, (int8_t)-input_offset, sizeof(int8_t) * input_ch); padded = 1; } else { - arm_memcpy_q7(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch); + arm_memcpy_s8(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch); } im2col_buf += input_ch; } @@ -139,20 +139,20 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, buffer_fill_cnt = 0; out = arm_nn_mat_mul_core_4x_s8(num_elem, num_elem, - (q7_t *)buffer_a, + (int8_t *)buffer_a, filter_data, output_ch, conv_params, quant_params, bias_data, out); - im2col_buf = (q7_t *)buffer_a; + im2col_buf = (int8_t *)buffer_a; } else if (buffer_fill_cnt == 4 && (padded != 0)) { buffer_fill_cnt = 0; out = arm_nn_mat_mult_s8(filter_data, - (q7_t *)buffer_a, + (int8_t *)buffer_a, output_ch, 4, output_shift, @@ -166,16 +166,20 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, bias_data, out); - im2col_buf = (q7_t *)buffer_a; + im2col_buf = (int8_t *)buffer_a; padded = 0; } } + if (out == NULL) + { + return ARM_CMSIS_NN_NO_IMPL_ERROR; + } } /* Handle left over columns */ if (buffer_fill_cnt != 0) { out = arm_nn_mat_mult_s8(filter_data, - (q7_t *)buffer_a, + (int8_t *)buffer_a, output_ch, buffer_fill_cnt, output_shift, @@ -196,8 +200,8 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, int32_t i_out_y, i_out_x, i_ker_y, i_ker_x; /* Generate two columns from the input tensor a GEMM computation */ - q15_t *two_column_buf = buffer_a; - q7_t *out = output_data; + int16_t *two_column_buf = buffer_a; + int8_t *out = output_data; /* This part implements the im2col function */ for (i_out_y = 0; i_out_y < output_y; i_out_y++) @@ -217,7 +221,7 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x) { /* Filling 0 for out-of-bound 
paddings */ - memset(two_column_buf, 0, sizeof(q15_t) * input_ch); + memset(two_column_buf, 0, sizeof(int16_t) * input_ch); } else { @@ -253,48 +257,48 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, /* left-over because odd number of output pixels */ if (two_column_buf != buffer_a) { - const q7_t *ker_a = filter_data; + const int8_t *ker_a = filter_data; int i; for (i = 0; i < output_ch; i++) { /* Load the accumulator with bias first */ - q31_t sum = 0; + int32_t sum = 0; if (bias_data) { sum = bias_data[i]; } /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */ - const q15_t *ip_as_col = buffer_a; + const int16_t *ip_as_col = buffer_a; /* 4 multiply and accumulates are done in one loop. */ -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2; while (col_count) { - q31_t ker_a1, ker_a2; - q31_t ip_b1, ip_b2; + int32_t ker_a1, ker_a2; + int32_t ip_b1, ip_b2; ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2); ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col); - sum = __SMLAD(ker_a1, ip_b1, sum); + sum = SMLAD(ker_a1, ip_b1, sum); ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col); - sum = __SMLAD(ker_a2, ip_b2, sum); + sum = SMLAD(ker_a2, ip_b2, sum); col_count--; } /* Handle left over mac */ col_count = input_ch * kernel_y * kernel_x & 0x3; -#else + #else uint16_t col_count = input_ch * kernel_y * kernel_x; -#endif + #endif while (col_count) { - q7_t ker_a1 = *ker_a++; - q15_t ip_b1 = *ip_as_col++; + int8_t ker_a1 = *ker_a++; + int16_t ip_b1 = *ip_as_col++; sum += ker_a1 * ip_b1; col_count--; } @@ -303,7 +307,7 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, sum += out_offset; sum = MAX(sum, out_activation_min); sum = MIN(sum, out_activation_max); - *out++ = (q7_t)sum; + *out++ = (int8_t)sum; } } #endif // #if defined(ARM_MATH_MVEI) @@ -316,20 +320,6 @@ arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if defined(ARM_MATH_MVEI) - int32_t col_length = input_dims->c * filter_dims->w * filter_dims->h; - // Get number of complete int16 lanes(multiple of 8) for given col_length. This is dependent on - // implementation of arm_nn_mat_mult_s8 - col_length = (col_length + 7) / 8; - // 4 -> number of im2col buffers, 8 -> 8 elements per Q register - return 4 * col_length * 8 * (int32_t)sizeof(int8_t); -#else - return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c index 357ef593..7f38d554 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in * cmsis-nn to perform the convolution. 
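
Why the two padding fills in arm_convolve_s8.c above differ (a sketch of the reasoning, hypothetical helper name): the MVE path stores raw int8 pixels, so padded positions must hold the zero point -input_offset, while the DSP im2col buffer stores offset-folded int16 values, where that same zero point widens to exactly 0.

#include <stdint.h>

/* Offset-folded widening as used when filling the int16 im2col buffer. */
static int16_t widen_with_offset(int8_t v, int32_t input_offset)
{
    return (int16_t)(v + input_offset); /* padding: v == -input_offset -> 0 */
}
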
* - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 30 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -51,13 +51,13 @@ arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input_data, + const int16_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int64_t *bias_data, const cmsis_nn_dims *output_dims, - q15_t *output_data) + int16_t *output_data) { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) if (filter_dims->w * filter_dims->h * input_dims->c < 512 && @@ -104,27 +104,6 @@ arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx, #endif } -int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params, - const cmsis_nn_dims *input_dims, - const cmsis_nn_dims *filter_dims, - const cmsis_nn_dims *output_dims) -{ - (void)conv_params; - (void)output_dims; - -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - if (filter_dims->w * filter_dims->h * input_dims->c < 512 && - (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) - { - return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims); - } - - return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); -#else - return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c index 235a87e9..2735408c 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in * cmsis-nn to perform the convolution. 
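
The s16 wrapper dispatch above gates the fast kernel on the dot-product length; a condensed restatement (descriptive helper, not CMSIS-NN API). The fast s16 kernel keeps a 32-bit accumulator for int16 x int8 products, which is only safe while the per-output dot product stays short — informally, 512 terms of magnitude 2^15 * 2^7 reach the int32 limit, matching the ARM_CMSIS_NN_ARG_ERROR check in arm_convolve_fast_s16 itself.

#include <stdbool.h>

static bool use_fast_s16(int filt_w, int filt_h, int in_ch, int dil_w, int dil_h)
{
    return (filt_w * filt_h * in_ch) < 512 && dil_w == 1 && dil_h == 1;
}
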
* - * $Date: 4 August 2022 - * $Revision: V.2.1.1 + * $Date: 11 January 2023 + * $Revision: V.2.3.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -51,29 +51,45 @@ arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *filter_dims, - const q7_t *filter_data, + const int8_t *filter_data, const cmsis_nn_dims *bias_dims, const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { - if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (conv_params->stride.w == 1) && - (conv_params->stride.h == 1) && (filter_dims->w == 1) && (filter_dims->h == 1) && - (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) { - return arm_convolve_1x1_s8_fast(ctx, - conv_params, - quant_params, - input_dims, - input_data, - filter_dims, - filter_data, - bias_dims, - bias_data, - output_dims, - output_data); + if ((conv_params->stride.w == 1) && (conv_params->stride.h == 1)) + { + return arm_convolve_1x1_s8_fast(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else + { + return arm_convolve_1x1_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } } else if ((input_dims->h == 1) && (output_dims->w % 4 == 0) && conv_params->dilation.w == 1 && (filter_dims->h == 1)) { @@ -105,28 +121,6 @@ arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, } } -int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, - const cmsis_nn_dims *input_dims, - const cmsis_nn_dims *filter_dims, - const cmsis_nn_dims *output_dims) -{ - if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (conv_params->stride.w == 1) && - (conv_params->stride.h == 1) && (filter_dims->w == 1) && (filter_dims->h == 1) && - (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) - { - return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims); - } - else if ((input_dims->h == 1) && (output_dims->w % 4 == 0) && (conv_params->dilation.w == 1) && - (filter_dims->h == 1)) - { - return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims); - } - else - { - return arm_convolve_s8_get_buffer_size(input_dims, filter_dims); - } -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c index 0893ea54..3d8e6c2a 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * 
Description: Optimized s8 depthwise convolution function for channel * multiplier of 1 and 3x3 kernel size. * - * $Date: 19 July 2022 - * $Revision: V.3.1.0 + * $Date: 5 January 2023 + * $Revision: V.3.2.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -53,13 +53,13 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { (void)ctx; (void)bias_dims; @@ -116,25 +116,82 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch; const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch; +#if defined(ARM_MATH_DSP) + const uint32_t lhs_offset_s16x2 = PKHBT(input_offset, input_offset, 16); for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) { int32_t in_val = 0; int32_t ker_val = 0; + int32_t in_val_1 = 0; + int32_t ker_val_1 = 0; if (ker_w_start == 0) { - in_val = arm_nn_read_q7x4(input_ptr); - ker_val = arm_nn_read_q7x4(kernel_ptr); + in_val = arm_nn_read_s8x4(input_ptr); + ker_val = arm_nn_read_s8x4(kernel_ptr); + in_val_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)in_val, 8); + ker_val_1 = SXTB16_RORn((uint32_t)ker_val, 8); + + out_buff1 = SMLABB(in_val_1, ker_val_1, out_buff1); + in_val = SXTAB16(lhs_offset_s16x2, (uint32_t)in_val); + out_buff3 = SMLATT(in_val_1, ker_val_1, out_buff3); + ker_val = SXTB16((uint32_t)ker_val); + out_buff0 = SMLABB(in_val, ker_val, out_buff0); + out_buff2 = SMLATT(in_val, ker_val, out_buff2); + } + + in_val = arm_nn_read_s8x4(input_ptr + input_ch); + ker_val = arm_nn_read_s8x4(kernel_ptr + input_ch); + in_val_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)in_val, 8); + ker_val_1 = SXTB16_RORn((uint32_t)ker_val, 8); + + out_buff1 = SMLABB(in_val_1, ker_val_1, out_buff1); + in_val = SXTAB16(lhs_offset_s16x2, (uint32_t)in_val); + out_buff3 = SMLATT(in_val_1, ker_val_1, out_buff3); + ker_val = SXTB16((uint32_t)ker_val); + out_buff0 = SMLABB(in_val, ker_val, out_buff0); + out_buff2 = SMLATT(in_val, ker_val, out_buff2); + + if ((input_x - in_w) >= 3) + { + in_val = arm_nn_read_s8x4(input_ptr + (input_ch << 1)); + ker_val = arm_nn_read_s8x4(kernel_ptr + (input_ch << 1)); + in_val_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)in_val, 8); + ker_val_1 = SXTB16_RORn((uint32_t)ker_val, 8); + + out_buff1 = SMLABB(in_val_1, ker_val_1, out_buff1); + in_val = SXTAB16(lhs_offset_s16x2, (uint32_t)in_val); + out_buff3 = SMLATT(in_val_1, ker_val_1, out_buff3); + ker_val = SXTB16((uint32_t)ker_val); + out_buff0 = SMLABB(in_val, ker_val, out_buff0); + out_buff2 = SMLATT(in_val, ker_val, out_buff2); + } + + input_ptr += (input_ch * input_x); + kernel_ptr += (input_ch * 3); + } + +#else + + for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) + { + int32_t in_val = 0; + int32_t ker_val = 0; + + if (ker_w_start == 0) + { + in_val = arm_nn_read_s8x4(input_ptr); + ker_val = arm_nn_read_s8x4(kernel_ptr); out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; out_buff1 += ((int8_t)(in_val >> 8) + 
input_offset) * (int8_t)(ker_val >> 8); out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); } - in_val = arm_nn_read_q7x4(input_ptr + input_ch); - ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch); + in_val = arm_nn_read_s8x4(input_ptr + input_ch); + ker_val = arm_nn_read_s8x4(kernel_ptr + input_ch); out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); @@ -143,8 +200,8 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, if ((input_x - in_w) >= 3) { - in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1)); - ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1)); + in_val = arm_nn_read_s8x4(input_ptr + (input_ch << 1)); + ker_val = arm_nn_read_s8x4(kernel_ptr + (input_ch << 1)); out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); @@ -155,6 +212,7 @@ arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, input_ptr += (input_ch * input_x); kernel_ptr += (input_ch * 3); } +#endif out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]); out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]); diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c index 354ee10d..5afe2c8a 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: Optimized s16 depthwise separable convolution function for * channel multiplier of 1. 
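
Portable models (assumptions, for illustration only) of the DSP ops in the new 3x3 depthwise path above: SXTAB16 sign-extends bytes 0 and 2 of one operand to halfwords and adds them to the packed halfwords of the other (the _RORn variant rotates first, picking bytes 1 and 3), and SMLABB/SMLATT multiply-accumulate the bottom or top halfword pairs.

#include <stdint.h>

static int32_t sxtab16_model(int32_t acc2, uint32_t x)
{
    int32_t lo = (int32_t)(int16_t)(acc2 & 0xFFFF) + (int8_t)(x & 0xFF);
    int32_t hi = (int32_t)(int16_t)(acc2 >> 16) + (int8_t)((x >> 16) & 0xFF);
    return (int32_t)(((uint32_t)lo & 0xFFFFu) | ((uint32_t)hi << 16));
}

static int32_t smlabb_model(int32_t x, int32_t y, int32_t acc)
{
    return acc + (int32_t)(int16_t)x * (int32_t)(int16_t)y;
}

Packing input_offset into both halfwords of lhs_offset_s16x2 up front lets SXTAB16 fold the offset into the widened pixels for free, so the inner loop needs no per-element additions.
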
* - * $Date: 6 July 2022 - * $Revision: V.1.1.0 + * $Date: 30 January 2023 + * $Revision: V.1.3.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input, + const int16_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int64_t *bias, const cmsis_nn_dims *output_dims, - q15_t *output) + int16_t *output) { const int32_t input_ch = input_dims->c; const int32_t output_ch = output_dims->c; @@ -96,9 +96,9 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, const int32_t output_y = output_dims->h; const int32_t output_activation_min = dw_conv_params->activation.min; const int32_t output_activation_max = dw_conv_params->activation.max; - q15_t *buffer_a = (q15_t *)ctx->buf; + int16_t *buffer_a = (int16_t *)ctx->buf; -#if defined(ARM_MATH_MVEI) + #if defined(ARM_MATH_MVEI) int16_t *lhs_buffer = buffer_a; int16_t *out = output; int buffer_count = 0; @@ -214,11 +214,11 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, } } -#else // ARM_MATH_DSP + #else // ARM_MATH_DSP /* Run the following code in cores using DSP extension */ - q15_t *const col_buffer_start = buffer_a; - q15_t *col_buffer = col_buffer_start; + int16_t *const col_buffer_start = buffer_a; + int16_t *col_buffer = col_buffer_start; const int64_t *const bias_start_pos = bias; const int32_t *const out_mult_start_pos = output_mult; const int32_t *const out_shift_start_pos = output_shift; @@ -244,7 +244,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, int32_t index = 0; if (ker_y_start != 0) { - memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t)); + memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(int16_t)); index += (kernel_x * input_ch) * ker_y_start; } @@ -258,13 +258,13 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, if (idx_x < 0 || idx_x >= input_x) { - memset(&col_buffer[index], 0, input_ch * sizeof(q15_t)); + memset(&col_buffer[index], 0, input_ch * sizeof(int16_t)); } else { arm_memcpy_q15(&col_buffer[index], input + (idx_y * input_x + idx_x) * input_ch, - input_ch * sizeof(q15_t)); + input_ch * sizeof(int16_t)); } index += input_ch; } @@ -273,7 +273,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, const int diff = kernel_y - ker_y_end; if (diff != 0) { - memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t)); + memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(int16_t)); } row_count = output_ch / 4; @@ -284,10 +284,10 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, while (row_count) { - q31_t sum_1 = 0; - q31_t sum_2 = 0; - q31_t sum_3 = 0; - q31_t sum_4 = 0; + int32_t sum_1 = 0; + int32_t sum_2 = 0; + int32_t sum_3 = 0; + int32_t sum_4 = 0; int32_t output_mult_1 = REDUCE_MULTIPLIER(output_mult[0]); int32_t output_mult_2 = REDUCE_MULTIPLIER(output_mult[1]); @@ -296,46 +296,46 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, output_mult += 4; uint16_t col_count = (kernel_x * kernel_y) / 2; - q15_t 
*col_pos = col_buffer_start + row_shift; - const q7_t *row_pos = kernel + row_shift; + int16_t *col_pos = col_buffer_start + row_shift; + const int8_t *row_pos = kernel + row_shift; row_shift += 4; while (col_count) { /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */ - q31_t row_a1, row_a2, row_b1, row_b2, col_a, row_c, col_b, col_c; + int32_t row_a1, row_a2, row_b1, row_b2, col_a, row_c, col_b, col_c; /* Read 4 weights */ - row_b1 = arm_nn_read_q7x4(row_pos); - row_a1 = arm_nn_read_q7x4(row_pos + input_ch); - col_a = arm_nn_read_q15x2(col_pos); - col_b = arm_nn_read_q15x2(col_pos + input_ch); + row_b1 = arm_nn_read_s8x4(row_pos); + row_a1 = arm_nn_read_s8x4(row_pos + input_ch); + col_a = arm_nn_read_s16x2(col_pos); + col_b = arm_nn_read_s16x2(col_pos + input_ch); - row_a2 = __SXTB16(row_b1); - row_b1 = __SXTB16(__ROR(row_b1, 8)); + row_a2 = SXTB16(row_b1); + row_b1 = SXTB16(ROR(row_b1, 8)); - row_b2 = __SXTB16(row_a1); - row_a1 = __SXTB16(__ROR(row_a1, 8)); + row_b2 = SXTB16(row_a1); + row_a1 = SXTB16(ROR(row_a1, 8)); - col_c = __PKHBT(col_b, col_a, 16); - col_a = __PKHTB(col_b, col_a, 16); - row_c = __PKHBT(row_b2, row_a2, 16); - sum_1 = __SMLAD(col_c, row_c, sum_1); + col_c = PKHBT(col_b, col_a, 16); + col_a = PKHTB(col_b, col_a, 16); + row_c = PKHBT(row_b2, row_a2, 16); + sum_1 = SMLAD(col_c, row_c, sum_1); - row_c = __PKHBT(row_b1, row_a1, 16); - sum_2 = __SMLAD(col_a, row_c, sum_2); + row_c = PKHBT(row_b1, row_a1, 16); + sum_2 = SMLAD(col_a, row_c, sum_2); - col_a = arm_nn_read_q15x2(col_pos + 2); - col_b = arm_nn_read_q15x2(col_pos + input_ch + 2); + col_a = arm_nn_read_s16x2(col_pos + 2); + col_b = arm_nn_read_s16x2(col_pos + input_ch + 2); - col_c = __PKHBT(col_b, col_a, 16); - col_a = __PKHTB(col_b, col_a, 16); - row_c = __PKHTB(row_a2, row_b2, 16); - sum_3 = __SMLAD(col_c, row_c, sum_3); + col_c = PKHBT(col_b, col_a, 16); + col_a = PKHTB(col_b, col_a, 16); + row_c = PKHTB(row_a2, row_b2, 16); + sum_3 = SMLAD(col_c, row_c, sum_3); - row_c = __PKHTB(row_a1, row_b1, 16); - sum_4 = __SMLAD(col_a, row_c, sum_4); + row_c = PKHTB(row_a1, row_b1, 16); + sum_4 = SMLAD(col_a, row_c, sum_4); row_pos += input_ch << 1; col_pos += input_ch << 1; @@ -372,22 +372,22 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, result = arm_nn_requantize_s64(acc_1, output_mult_1, *output_shift++); result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; result = arm_nn_requantize_s64(acc_2, output_mult_2, *output_shift++); result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; result = arm_nn_requantize_s64(acc_3, output_mult_3, *output_shift++); result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; result = arm_nn_requantize_s64(acc_4, output_mult_4, *output_shift++); result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; row_count--; } @@ -395,9 +395,9 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, row_count = output_ch & 0x3; while (row_count) { - q15_t *col_pos = col_buffer_start + row_shift; - const q7_t *row_pos = kernel + 
row_shift; - q31_t sum = 0; + int16_t *col_pos = col_buffer_start + row_shift; + const int8_t *row_pos = kernel + row_shift; + int32_t sum = 0; const uint16_t col_count = (kernel_x * kernel_y); row_shift += 1; @@ -414,7 +414,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, output_mult++; result = MAX(result, output_activation_min); result = MIN(result, output_activation_max); - *output++ = (q15_t)result; + *output++ = (int16_t)result; row_count--; } @@ -426,7 +426,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, /* Advance to the next batch */ input += (input_x * input_y * input_ch); } -#endif + #endif #else /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ return arm_depthwise_conv_s16(ctx, @@ -446,22 +446,6 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if defined(ARM_MATH_DSP) -#if defined(ARM_MATH_MVEI) - /* The + 8 accounts for a worst case out of bounds read of the lhs buffers in the *_nt_t_* function. */ - return 4 * input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t) + 8; -#else // ARM_MATH_DSP - return input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t); -#endif -#else - (void)input_dims; - (void)filter_dims; - return 0; -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s16.c new file mode 100644 index 00000000..fb0b8e1b --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s16.c @@ -0,0 +1,123 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_get_buffer_sizes_s16.c + * Description: Collection of get buffer size functions for the various s16 convolution layer functions. + * + * $Date: 13 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup NNconv + */ + +/** + * @addtogroup GetBufferSizeNNConv + * @{ + */ +__STATIC_INLINE int32_t arm_depthwise_conv_fast_s16_get_buffer_size_mve(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + /* The + 8 accounts for a worst case out of bounds read of the lhs buffers in the *_nt_t_* function. 
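
A restatement of the sizing below (hypothetical helper name): four lhs buffers of (c * kw * kh) int16 values, plus 8 bytes of slack so the tail over-read described in this comment stays inside the allocation.

#include <stdint.h>

static int32_t dw_fast_s16_mve_scratch(int32_t c, int32_t kw, int32_t kh)
{
    return 4 * c * kw * kh * (int32_t)sizeof(int16_t) + 8;
}
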
*/ + return 4 * input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t) + 8; +} + +__STATIC_INLINE int32_t arm_depthwise_conv_fast_s16_get_buffer_size_dsp(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + return input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t); +} + +int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_MVEI) + return arm_depthwise_conv_fast_s16_get_buffer_size_mve(input_dims, filter_dims); + #else // ARM_MATH_DSP + return arm_depthwise_conv_fast_s16_get_buffer_size_dsp(input_dims, filter_dims); + #endif +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)output_dims; + + int32_t size = 0; + + if (USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)) + { + size = arm_depthwise_conv_fast_s16_get_buffer_size(input_dims, filter_dims); + } + + return size; +} + +int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)output_dims; + + int32_t size = 0; + + if (USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)) + { + size = arm_depthwise_conv_fast_s16_get_buffer_size_mve(input_dims, filter_dims); + } + + return size; +} + +int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)output_dims; + + int32_t size = 0; + + if (USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)) + { + size = arm_depthwise_conv_fast_s16_get_buffer_size_dsp(input_dims, filter_dims); + } + + return size; +} + +/** + * @} end of GetBufferSizeNNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c new file mode 100644 index 00000000..a462c53d --- /dev/null +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c @@ -0,0 +1,131 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_get_buffer_sizes_s8.c + * Description: Collection of get buffer size functions for the various s8 convolution layer functions. 
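
The USE_FAST_DW_CONV_S16_FUNCTION gate used by the three s16 wrapper sizers above now lives in arm_nnsupportfunctions.h. Judging by the local USE_FAST_DW_CONV_FUNCTION macro this patch removes from arm_depthwise_conv_wrapper_s16.c (see further below), its shape is assumed to be:

/* Assumed to match the removed local macro; not copied from the header. */
#define USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims) \
    ((dw_conv_params)->ch_mult == 1 && (dw_conv_params)->dilation.w == 1 &&    \
     (dw_conv_params)->dilation.h == 1 &&                                      \
     (filter_dims)->w * (filter_dims)->h * (input_dims)->c < 512)
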
+ * + * $Date: 20 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup NNConv + */ + +/** + * @addtogroup GetBufferSizeNNConv + * @{ + */ + +__STATIC_INLINE int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + (void)input_dims; + return (4 * CH_IN_BLOCK_MVE * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int8_t); +} + +__STATIC_INLINE int32_t arm_depthwise_conv_s8_opt_get_buffer_size_dsp(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims) +{ + return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t); +} + +int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_MVEI) + return arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims); +#elif defined(ARM_MATH_DSP) + return arm_depthwise_conv_s8_opt_get_buffer_size_dsp(input_dims, filter_dims); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { +#if !defined(ARM_MATH_MVEI) + if (filter_dims->w == 3 && filter_dims->h == 3 && dw_conv_params->padding.h <= 1 && + dw_conv_params->padding.w <= 1) + { + return size; + } +#endif + size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims); + } + + return size; +} + +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { + if (filter_dims->w == 3 && filter_dims->h == 3 && dw_conv_params->padding.h <= 1 && + dw_conv_params->padding.w <= 1) + { + return size; + } + size = arm_depthwise_conv_s8_opt_get_buffer_size_dsp(input_dims, filter_dims); + } + + return size; +} + +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { + size = arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims); + } + + return size; +} + +/** + * @} end of GetBufferSizeNNConv group + */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c index 33161d21..6587b877 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_depthwise_conv_s16.c * Description: s16 version of depthwise convolution. * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 26 October 2022 + * $Revision: V.2.0.1 * * Target Processor: Cortex-M CPUs * @@ -169,7 +169,7 @@ static void depthwise_conv_s16_generic_s16(const int16_t *input, { const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; - const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[idx_out_ch]); + const int32_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[idx_out_ch]); int64_t acc_0 = 0; int ker_y_start; @@ -245,13 +245,13 @@ arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input, + const int16_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int64_t *bias, const cmsis_nn_dims *output_dims, - q15_t *output) + int16_t *output) { const uint16_t dilation_x = dw_conv_params->dilation.w; const uint16_t dilation_y = dw_conv_params->dilation.h; diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c index d019be96..0c67079b 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c @@ -21,8 +21,8 @@ * Title: arm_depthwise_conv_s8.c * Description: s8 version of depthwise convolution. * - * $Date: 29 July 2022 - * $Revision: V.3.0.3 + * $Date: 26 October 2022 + * $Revision: V.3.0.4 * * Target Processor: Cortex-M CPUs * @@ -151,12 +151,12 @@ depthwise_conv_s8_mult_4(const int8_t *input, } } -static void depthwise_conv_s8_generic(const q7_t *input, +static void depthwise_conv_s8_generic(const int8_t *input, const uint16_t input_batches, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, - const q7_t *kernel, + const int8_t *kernel, const uint16_t output_ch, const uint16_t ch_mult, const uint16_t kernel_x, @@ -166,7 +166,7 @@ static void depthwise_conv_s8_generic(const q7_t *input, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, - q7_t *output, + int8_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, @@ -274,13 +274,13 @@ arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { const uint16_t dilation_x = dw_conv_params->dilation.w; const uint16_t dilation_y = dw_conv_params->dilation.h; diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c index c3659c80..572a6b02 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c @@ 
-1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: Optimized s8 depthwise separable convolution function for * channel multiplier of 1. * - * $Date: 27 July 2022 - * $Revision: V.3.1.0 + * $Date: 30 January 2023 + * $Revision: V.3.3.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,13 +52,13 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { const int32_t input_ch = input_dims->c; @@ -92,12 +92,12 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, const int32_t input_offset = dw_conv_params->input_offset; const int32_t output_activation_min = dw_conv_params->activation.min; const int32_t output_activation_max = dw_conv_params->activation.max; - q15_t *buffer_a = (q15_t *)ctx->buf; + int16_t *buffer_a = (int16_t *)ctx->buf; -#ifdef ARM_MATH_MVEI + #ifdef ARM_MATH_MVEI /* Generate two columns from the input tensor */ - q7_t *lhs_buffer = (q7_t *)buffer_a; - q7_t *out = output; + int8_t *lhs_buffer = (int8_t *)buffer_a; + int8_t *out = output; int padded = 0; int buffer_count = 0; const int32_t kernel_size = kernel_x * kernel_y; @@ -122,12 +122,12 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, { if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) { - arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)active_ch); + arm_memset_s8(lhs_buffer, (int8_t)-input_offset, (uint32_t)active_ch); padded = 1; } else { - arm_memcpy_q7(lhs_buffer, + arm_memcpy_s8(lhs_buffer, input_slice + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)active_ch); } @@ -139,7 +139,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, if (buffer_count == 4) { const int32_t block_offset = i_ch * CH_IN_BLOCK_MVE; - lhs_buffer = (q7_t *)buffer_a; + lhs_buffer = (int8_t *)buffer_a; if (padded == 0) { arm_nn_depthwise_conv_nt_t_s8(lhs_buffer, @@ -179,7 +179,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, } } /* Handle left over buffers */ - lhs_buffer = (q7_t *)buffer_a; + lhs_buffer = (int8_t *)buffer_a; int8_t *out_base = out; for (int i_buf = 0; i_buf < buffer_count; i_buf++) @@ -228,13 +228,13 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, remaining_ch -= CH_IN_BLOCK_MVE; } -#else // ARM_MATH_DSP + #else // ARM_MATH_DSP /* Run the following code in cores using DSP extension */ - q15_t *const col_buffer_start = buffer_a; - q15_t *col_buffer = col_buffer_start; + int16_t *const col_buffer_start = buffer_a; + int16_t *col_buffer = col_buffer_start; const int32_t *const bias_start_pos = bias; - const q31_t *const out_mult_start_pos = output_mult; - const q31_t *const out_shift_start_pos = output_shift; + const int32_t *const out_mult_start_pos = output_mult; + const int32_t *const out_shift_start_pos = output_shift; uint16_t 
row_count; uint16_t row_shift; @@ -254,7 +254,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, int32_t index = 0; if (ker_y_start != 0) { - memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t)); + memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(int16_t)); index += (kernel_x * input_ch) * ker_y_start; } @@ -267,11 +267,11 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, const int32_t idx_x = base_idx_x + i_ker_x; if (idx_x < 0 || idx_x >= input_x) { - memset(&col_buffer[index], 0, input_ch * sizeof(q15_t)); + memset(&col_buffer[index], 0, input_ch * sizeof(int16_t)); } else { - arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch, + arm_q7_to_q15_with_offset((int8_t *)input + (idx_y * input_x + idx_x) * input_ch, &col_buffer[index], input_ch, input_offset); @@ -283,7 +283,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, const int diff = kernel_y - ker_y_end; if (diff != 0) { - memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t)); + memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(int16_t)); } row_count = output_ch / 4; @@ -294,10 +294,10 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, while (row_count) { - q31_t sum = 0; - q31_t sum_2 = 0; - q31_t sum_3 = 0; - q31_t sum_4 = 0; + int32_t sum = 0; + int32_t sum_2 = 0; + int32_t sum_3 = 0; + int32_t sum_4 = 0; if (bias) { sum = *bias++; @@ -307,8 +307,8 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, } uint16_t col_count = (kernel_x * kernel_y) / 2; - q15_t *col_pos = col_buffer_start + row_shift; - const q7_t *row_pos = kernel + row_shift; + int16_t *col_pos = col_buffer_start + row_shift; + const int8_t *row_pos = kernel + row_shift; row_shift += 4; while (col_count) @@ -316,37 +316,37 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */ /* Note: variable names can be improved here to align with rows and columns. 
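
Portable models (assumptions, for illustration) of the two packing ops used just below to interleave weight and column halfwords: PKHBT keeps the bottom halfword of x and the top halfword of (y << sh); PKHTB keeps the top halfword of x and the bottom halfword of (y >> sh). The unsigned shifts here agree with the intrinsics for the 16-bit shift amounts this kernel uses.

#include <stdint.h>

static uint32_t pkhbt_model(uint32_t x, uint32_t y, uint32_t sh)
{
    return (x & 0x0000FFFFu) | ((y << sh) & 0xFFFF0000u);
}

static uint32_t pkhtb_model(uint32_t x, uint32_t y, uint32_t sh)
{
    return (x & 0xFFFF0000u) | ((y >> sh) & 0x0000FFFFu);
}
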
*/ - q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c; + int32_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c; /* Read 4 weights */ - ip_b1 = arm_nn_read_q7x4(row_pos); - ip_a1 = arm_nn_read_q7x4(row_pos + input_ch); - op_a = arm_nn_read_q15x2(col_pos); - op_b = arm_nn_read_q15x2(col_pos + input_ch); + ip_b1 = arm_nn_read_s8x4(row_pos); + ip_a1 = arm_nn_read_s8x4(row_pos + input_ch); + op_a = arm_nn_read_s16x2(col_pos); + op_b = arm_nn_read_s16x2(col_pos + input_ch); - ip_a2 = __SXTB16(ip_b1); - ip_b1 = __SXTB16(__ROR(ip_b1, 8)); + ip_a2 = SXTB16(ip_b1); + ip_b1 = SXTB16(ROR(ip_b1, 8)); - ip_b2 = __SXTB16(ip_a1); - ip_a1 = __SXTB16(__ROR(ip_a1, 8)); + ip_b2 = SXTB16(ip_a1); + ip_a1 = SXTB16(ROR(ip_a1, 8)); - op_c = __PKHBT(op_b, op_a, 16); - op_a = __PKHTB(op_b, op_a, 16); - op_b = __PKHBT(ip_b2, ip_a2, 16); - sum = __SMLAD(op_c, op_b, sum); + op_c = PKHBT(op_b, op_a, 16); + op_a = PKHTB(op_b, op_a, 16); + op_b = PKHBT(ip_b2, ip_a2, 16); + sum = SMLAD(op_c, op_b, sum); - op_b = __PKHBT(ip_b1, ip_a1, 16); - sum_2 = __SMLAD(op_a, op_b, sum_2); + op_b = PKHBT(ip_b1, ip_a1, 16); + sum_2 = SMLAD(op_a, op_b, sum_2); - op_a = arm_nn_read_q15x2(col_pos + 2); - op_b = arm_nn_read_q15x2(col_pos + input_ch + 2); + op_a = arm_nn_read_s16x2(col_pos + 2); + op_b = arm_nn_read_s16x2(col_pos + input_ch + 2); - op_c = __PKHBT(op_b, op_a, 16); - op_a = __PKHTB(op_b, op_a, 16); - op_b = __PKHTB(ip_a2, ip_b2, 16); - sum_3 = __SMLAD(op_c, op_b, sum_3); + op_c = PKHBT(op_b, op_a, 16); + op_a = PKHTB(op_b, op_a, 16); + op_b = PKHTB(ip_a2, ip_b2, 16); + sum_3 = SMLAD(op_c, op_b, sum_3); - op_b = __PKHTB(ip_a1, ip_b1, 16); - sum_4 = __SMLAD(op_a, op_b, sum_4); + op_b = PKHTB(ip_a1, ip_b1, 16); + sum_4 = SMLAD(op_a, op_b, sum_4); row_pos += input_ch << 1; col_pos += input_ch << 1; @@ -370,24 +370,24 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, sum += output_offset; sum = MAX(sum, output_activation_min); sum = MIN(sum, output_activation_max); - *output++ = (q7_t)sum; + *output++ = (int8_t)sum; sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++); sum_2 += output_offset; sum_2 = MAX(sum_2, output_activation_min); sum_2 = MIN(sum_2, output_activation_max); - *output++ = (q7_t)sum_2; + *output++ = (int8_t)sum_2; sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++); sum_3 += output_offset; sum_3 = MAX(sum_3, output_activation_min); sum_3 = MIN(sum_3, output_activation_max); - *output++ = (q7_t)sum_3; + *output++ = (int8_t)sum_3; sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++); sum_4 += output_offset; sum_4 = MAX(sum_4, output_activation_min); sum_4 = MIN(sum_4, output_activation_max); - *output++ = (q7_t)sum_4; + *output++ = (int8_t)sum_4; row_count--; } @@ -395,9 +395,9 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, row_count = output_ch & 0x3; while (row_count) { - q15_t *col_pos = col_buffer_start + row_shift; - const q7_t *row_pos = kernel + row_shift; - q31_t sum = 0; + int16_t *col_pos = col_buffer_start + row_shift; + const int8_t *row_pos = kernel + row_shift; + int32_t sum = 0; if (bias) { sum = *bias++; @@ -413,7 +413,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, sum += output_offset; sum = MAX(sum, output_activation_min); sum = MIN(sum, output_activation_max); - *output++ = (q7_t)sum; + *output++ = (int8_t)sum; row_count--; } @@ -422,7 +422,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, col_buffer = col_buffer_start; } } -#endif + 
#endif #else /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ return arm_depthwise_conv_s8(ctx, @@ -442,20 +442,6 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) -{ -#if defined(ARM_MATH_MVEI) - (void)input_dims; - return (4 * CH_IN_BLOCK_MVE * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int8_t); -#elif defined(ARM_MATH_DSP) - return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t); -#else - (void)input_dims; - (void)filter_dims; - return 0; -#endif -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c index bc42de10..8a2ff210 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,20 +22,15 @@ * Description: Wrapper API to select appropriate depthwise conv API based * on dimensions. * - * $Date: 6 July 2022 - * $Revision: V.1.0.1 + * $Date: 20 January 2023 + * $Revision: V.1.1.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" - - -#define USE_FAST_DW_CONV_FUNCTION(dw_conv_params, filter_dims, input_dims) \ - (dw_conv_params->ch_mult == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1 && \ - filter_dims->w * filter_dims->h * input_dims->c < 512) - +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" /** * @ingroup Public @@ -56,17 +51,17 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input, + const int16_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *filter, + const int8_t *filter, const cmsis_nn_dims *bias_dims, const int64_t *bias, const cmsis_nn_dims *output_dims, - q15_t *output) + int16_t *output) { arm_cmsis_nn_status status = ARM_CMSIS_NN_SUCCESS; - if (USE_FAST_DW_CONV_FUNCTION(dw_conv_params, filter_dims, input_dims)) + if (USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)) { status = arm_depthwise_conv_fast_s16(ctx, dw_conv_params, @@ -99,25 +94,6 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx, return status; } -int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, - const cmsis_nn_dims *input_dims, - const cmsis_nn_dims *filter_dims, - const cmsis_nn_dims *output_dims) -{ - (void)dw_conv_params; - (void)input_dims; - (void)filter_dims; - (void)output_dims; - int32_t size = 0; - - if (USE_FAST_DW_CONV_FUNCTION(dw_conv_params, filter_dims, input_dims)) - { - size = arm_depthwise_conv_fast_s16_get_buffer_size(input_dims, filter_dims); - } - - return size; -} - /** * @} end of NNConv group */ diff --git 
a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c index 157aa92f..0107f757 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: Wrapper API to select appropriate depthwise conv API based * on dimensions. * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 13 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -50,21 +50,21 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx, const cmsis_nn_dw_conv_params *dw_conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *filter, + const int8_t *filter, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { arm_cmsis_nn_status status = ARM_CMSIS_NN_SUCCESS; if (1 == dw_conv_params->ch_mult && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1) { #if !defined(ARM_MATH_MVEI) - if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1) && - (dw_conv_params->padding.w <= 1)) + if (filter_dims->w == 3 && filter_dims->h == 3 && dw_conv_params->padding.h <= 1 && + dw_conv_params->padding.w <= 1) { status = arm_depthwise_conv_3x3_s8(ctx, dw_conv_params, @@ -113,23 +113,6 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx, return status; } -int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, - const cmsis_nn_dims *input_dims, - const cmsis_nn_dims *filter_dims, - const cmsis_nn_dims *output_dims) -{ - (void)dw_conv_params; - int32_t size = 0; - - if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && - dw_conv_params->dilation.h == 1) - { - size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims); - } - - return size; -} - /** * @} end of NNConv group */ diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c index f4c9e386..341f1571 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_nn_depthwise_conv_s8_core.c * Description: Depthwise convolution on im2col buffers. * - * $Date: 09. 
October 2020 - * $Revision: V.1.0.4 + * $Date: 26 October 2022 + * $Revision: V.1.0.5 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ @@ -37,17 +37,17 @@ * */ -q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, - const q15_t *col, - const uint16_t num_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int32_t out_offset, - const int32_t activation_min, - const int32_t activation_max, - const uint16_t kernel_size, - const int32_t *const output_bias, - q7_t *out) +int8_t *arm_nn_depthwise_conv_s8_core(const int8_t *row, + const int16_t *col, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t kernel_size, + const int32_t *const output_bias, + int8_t *out) { #if defined(ARM_MATH_MVEI) int32_t ch_per_loop = num_ch / 4; diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c index 5c27bf59..8e1708d2 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_mat_mult_kernel_s8_s16.c * Description: Matrix-multiplication function for convolution * - * $Date: 14. December 2021 - * $Revision: V.1.1.0 + * $Date: 5 January 2023 + * $Revision: V.1.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" @@ -37,39 +37,39 @@ * */ -q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, - const q15_t *input_b, - const uint16_t output_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int32_t out_offset, - const int16_t activation_min, - const int16_t activation_max, - const uint16_t num_col_a, - const int32_t *const output_bias, - q7_t *out_0) +int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t *input_a, + const int16_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + int8_t *out_0) { #if !defined(ARM_MATH_MVEI) /* set up the second output pointers */ - q7_t *out_1 = out_0 + output_ch; + int8_t *out_1 = out_0 + output_ch; const int32_t *bias = output_bias; uint16_t row_count = output_ch / 2; - const q7_t *ip_a0 = input_a; + const int8_t *ip_a0 = input_a; /* this loop over rows in A */ while (row_count) { /* setup pointers for B */ - const q15_t *ip_b0 = input_b; - const q15_t *ip_b1 = ip_b0 + num_col_a; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 + num_col_a; /* align the second pointer for A */ - const q7_t *ip_a1 = ip_a0 + num_col_a; + const int8_t *ip_a1 = ip_a0 + num_col_a; - q31_t ch_0_out_0 = 0; - q31_t ch_0_out_1 = 0; - q31_t ch_1_out_0 = 0; - q31_t ch_1_out_1 = 0; + int32_t ch_0_out_0 = 0; + int32_t ch_0_out_1 = 0; + int32_t ch_1_out_0 = 0; + int32_t ch_1_out_1 = 0; /* Init 
accumulator with bias for channel N and N + 1 */ if (bias) { @@ -79,43 +79,43 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_1_out_1 = *bias++; } -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) uint16_t col_count = num_col_a / 4; /* accumulate over the vector */ while (col_count) { - q31_t a01, a02, a11, a12; - q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); - q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + int32_t a01, a02, a11, a12; + int32_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + int32_t b1 = arm_nn_read_q15x2_ia(&ip_b1); ip_a0 = read_and_pad(ip_a0, &a01, &a02); ip_a1 = read_and_pad(ip_a1, &a11, &a12); - ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); - ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); - ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + ch_0_out_0 = SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = SMLAD(a11, b1, ch_1_out_1); b0 = arm_nn_read_q15x2_ia(&ip_b0); b1 = arm_nn_read_q15x2_ia(&ip_b1); - ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); - ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); - ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + ch_0_out_0 = SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = SMLAD(a12, b1, ch_1_out_1); col_count--; } /* while over col_count */ col_count = num_col_a & 0x3; -#else + #else uint16_t col_count = num_col_a; -#endif + #endif while (col_count) { - q7_t a0 = *ip_a0++; - q15_t b0 = *ip_b0++; - q7_t a1 = *ip_a1++; - q15_t b1 = *ip_b1++; + int8_t a0 = *ip_a0++; + int16_t b0 = *ip_b0++; + int8_t a1 = *ip_a1++; + int16_t b1 = *ip_b1++; ch_0_out_0 += a0 * b0; ch_0_out_1 += a0 * b1; @@ -128,13 +128,13 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_0_out_0 += out_offset; ch_0_out_0 = MAX(ch_0_out_0, activation_min); ch_0_out_0 = MIN(ch_0_out_0, activation_max); - *out_0++ = (q7_t)ch_0_out_0; + *out_0++ = (int8_t)ch_0_out_0; ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); ch_0_out_1 += out_offset; ch_0_out_1 = MAX(ch_0_out_1, activation_min); ch_0_out_1 = MIN(ch_0_out_1, activation_max); - *out_1++ = (q7_t)ch_0_out_1; + *out_1++ = (int8_t)ch_0_out_1; out_mult++; out_shift++; @@ -142,13 +142,13 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_1_out_0 += out_offset; ch_1_out_0 = MAX(ch_1_out_0, activation_min); ch_1_out_0 = MIN(ch_1_out_0, activation_max); - *out_0++ = (q7_t)ch_1_out_0; + *out_0++ = (int8_t)ch_1_out_0; ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift); ch_1_out_1 += out_offset; ch_1_out_1 = MAX(ch_1_out_1, activation_min); ch_1_out_1 = MIN(ch_1_out_1, activation_max); - *out_1++ = (q7_t)ch_1_out_1; + *out_1++ = (int8_t)ch_1_out_1; out_mult++; out_shift++; @@ -161,11 +161,11 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, if (output_ch & 0x1) { /* setup pointers for B */ - const q15_t *ip_b0 = input_b; - const q15_t *ip_b1 = ip_b0 + num_col_a; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 + num_col_a; - q31_t ch_0_out_0 = 0; - q31_t ch_0_out_1 = 0; + int32_t ch_0_out_0 = 0; + int32_t ch_0_out_1 = 0; /* load the bias */ if (bias) @@ -174,35 +174,35 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_0_out_1 = *bias++; } -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) uint16_t col_count = num_col_a >> 2; while (col_count) { - q31_t a01, a02; - q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); - q31_t b1 = 
arm_nn_read_q15x2_ia(&ip_b1); + int32_t a01, a02; + int32_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + int32_t b1 = arm_nn_read_q15x2_ia(&ip_b1); ip_a0 = read_and_pad(ip_a0, &a01, &a02); - ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_0_out_0 = SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a01, b1, ch_0_out_1); b0 = arm_nn_read_q15x2_ia(&ip_b0); b1 = arm_nn_read_q15x2_ia(&ip_b1); - ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_0_out_0 = SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a02, b1, ch_0_out_1); col_count--; } col_count = num_col_a & 0x3; -#else + #else uint16_t col_count = num_col_a; -#endif + #endif while (col_count) { - q7_t a0 = *ip_a0++; - q15_t b0 = *ip_b0++; - q15_t b1 = *ip_b1++; + int8_t a0 = *ip_a0++; + int16_t b0 = *ip_b0++; + int16_t b1 = *ip_b1++; ch_0_out_0 += a0 * b0; ch_0_out_1 += a0 * b1; @@ -212,13 +212,13 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, ch_0_out_0 += out_offset; ch_0_out_0 = MAX(ch_0_out_0, activation_min); ch_0_out_0 = MIN(ch_0_out_0, activation_max); - *out_0++ = (q7_t)ch_0_out_0; + *out_0++ = (int8_t)ch_0_out_0; ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); ch_0_out_1 += out_offset; ch_0_out_1 = MAX(ch_0_out_1, activation_min); ch_0_out_1 = MIN(ch_0_out_1, activation_max); - *out_1++ = (q7_t)ch_0_out_1; + *out_1++ = (int8_t)ch_0_out_1; out_mult++; out_shift++; } diff --git a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c index add72484..06b89a9c 100644 --- a/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c +++ b/src/third_party/cmsis_nn/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c @@ -21,8 +21,8 @@ * Title: arm_nn_mat_mult_s8.c * Description: General Matrix-multiplication function * - * $Date: 16 August 2022 - * $Revision: V.2.0.7 + * $Date: 26 October 2022 + * $Revision: V.2.0.8 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ @@ -36,20 +36,20 @@ * */ -q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, - const q7_t *input_col, - const uint16_t output_ch, - const uint16_t col_batches, - const int32_t *output_shift, - const int32_t *output_mult, - const int32_t out_offset, - const int32_t col_offset, - const int32_t row_offset, - const int16_t activation_min, - const int16_t activation_max, - const uint16_t row_len, - const int32_t *const bias, - q7_t *out) +int8_t *arm_nn_mat_mult_s8(const int8_t *input_row, + const int8_t *input_col, + const uint16_t output_ch, + const uint16_t col_batches, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t out_offset, + const int32_t col_offset, + const int32_t row_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t row_len, + const int32_t *const bias, + int8_t *out) { #if defined(ARM_MATH_MVEI) (void)row_offset; @@ -153,7 +153,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, acc_0 += out_offset; acc_0 = MAX(acc_0, activation_min); acc_0 = MIN(acc_0, activation_max); - out[i_out_ch] = (q7_t)acc_0; + out[i_out_ch] = (int8_t)acc_0; } out += output_ch; } diff --git a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s16.c b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s16.c new file mode 100644 index 00000000..44baf6b6 --- /dev/null +++ 
b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s16.c @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_get_buffer_sizes_s16.c + * Description: Collection of get buffer size functions for fully connected s16 layer function. + * + * $Date: 30 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup FC + */ + +/** + * @addtogroup GetBufferSizeFC + * @{ + */ + +int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims) +{ + (void)filter_dims; + return 0; +} + +int32_t arm_fully_connected_s16_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims) +{ + return arm_fully_connected_s16_get_buffer_size(filter_dims); +} + +int32_t arm_fully_connected_s16_get_buffer_size_mve(const cmsis_nn_dims *filter_dims) +{ + return arm_fully_connected_s16_get_buffer_size(filter_dims); +} + +/** + * @} end of GetBufferSizeFC group + */ diff --git a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c new file mode 100644 index 00000000..15ff9b2e --- /dev/null +++ b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_get_buffer_sizes_s8.c + * Description: Collection of get buffer size functions for fully connected s8 layer function. 
+ * + * $Date: 31 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup FC + */ + +/** + * @addtogroup GetBufferSizeFC + * @{ + */ + +int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) +{ + (void)filter_dims; + return 0; +} + +int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims) +{ + return arm_fully_connected_s8_get_buffer_size(filter_dims); +} + +int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims) +{ + return arm_fully_connected_s8_get_buffer_size(filter_dims); +} + +/** + * @} end of GetBufferSizeFC group + */ diff --git a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s16.c b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s16.c index 8bd428db..f67efc59 100644 --- a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s16.c +++ b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_fully_connected_s16 * Description: Fully connected function compatible with TF Lite. * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 13 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M and Cortex-A cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -50,13 +50,13 @@ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, const cmsis_nn_fc_params *fc_params, const cmsis_nn_per_tensor_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q15_t *input, + const int16_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int64_t *bias, const cmsis_nn_dims *output_dims, - q15_t *output) + int16_t *output) { (void)bias_dims; (void)ctx; @@ -64,7 +64,7 @@ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, int32_t batch_cnt = input_dims->n; - const q31_t reduced_multiplier = REDUCE_MULTIPLIER(quant_params->multiplier); + const int32_t reduced_multiplier = REDUCE_MULTIPLIER(quant_params->multiplier); while (batch_cnt) { @@ -86,12 +86,6 @@ arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, return (ARM_CMSIS_NN_SUCCESS); } -int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims) -{ - (void)filter_dims; - return 0; -} - /** * @} end of FC group */ diff --git a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s8.c index bc05a8cc..84ce9d76 100644 --- a/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s8.c +++ b/src/third_party/cmsis_nn/Source/FullyConnectedFunctions/arm_fully_connected_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_fully_connected_s8 * Description: Fully connected function compatible with TF Lite. * - * $Date: 19 April 2022 - * $Revision: V.4.0.0 + * $Date: 13 January 2023 + * $Revision: V.5.1.0 * - * Target Processor: Cortex-M and Cortex-A cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -51,13 +51,13 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, const cmsis_nn_fc_params *fc_params, const cmsis_nn_per_tensor_quant_params *quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input, + const int8_t *input, const cmsis_nn_dims *filter_dims, - const q7_t *kernel, + const int8_t *kernel, const cmsis_nn_dims *bias_dims, const int32_t *bias, const cmsis_nn_dims *output_dims, - q7_t *output) + int8_t *output) { (void)bias_dims; (void)ctx; @@ -72,7 +72,6 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, bias, output, fc_params->input_offset, - 0, fc_params->output_offset, quant_params->multiplier, quant_params->shift, @@ -88,12 +87,6 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, return (ARM_CMSIS_NN_SUCCESS); } -int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) -{ - (void)filter_dims; - return 0; -} - /** * @} end of FC group */ diff --git a/src/third_party/cmsis_nn/Source/LSTMFunctions/arm_lstm_unidirectional_s8_s16.c b/src/third_party/cmsis_nn/Source/LSTMFunctions/arm_lstm_unidirectional_s8_s16.c new file mode 100644 index 00000000..58a0141d --- /dev/null +++ b/src/third_party/cmsis_nn/Source/LSTMFunctions/arm_lstm_unidirectional_s8_s16.c @@ -0,0 +1,184 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_lstm_unidirectional_s16_s8.c + * Description: S8 LSTM function with S16 gate output + * + * $Date: 4 November 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M processors + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup Public + */ + +/** + * @addtogroup LSTM + * @{ + */ + +/* + * S8 LSTM function for TensorFlow Lite with S16 gate output + * + * Refer to header file for details. + * + */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/* + * LSTM unidirectional function with 8 bit input and output and 16 bit weights + * + * Refer header file for details. 
+ * + */ +arm_cmsis_nn_status arm_lstm_unidirectional_s16_s8(cmsis_nn_lstm_context *scratch_buffers, + const int8_t *input_data, + const cmsis_nn_lstm_dims *lstm_dims, + const int8_t *in_to_in_weights, + const int8_t *in_to_forget_weights, + const int8_t *in_to_cell_weights, + const int8_t *in_to_out_weights, + const int8_t *recurrent_to_in_weights, + const int8_t *recurrent_to_forget_weights, + const int8_t *recurrent_to_cell_weights, + const int8_t *recurrent_to_out_weights, + const int16_t *cell_to_in_weights, + const int16_t *cell_to_forget_weights, + const int16_t *cell_to_out_weights, + const int8_t *projection_weights, + const cmsis_nn_lstm_params *lstm, + int8_t *output_state, + int16_t *cell_state, + int8_t *output_data) +{ + (void)cell_to_in_weights; + (void)cell_to_forget_weights; + (void)cell_to_out_weights; + + const int32_t num_batch = lstm_dims->num_batches; + const int32_t num_input = lstm_dims->num_inputs; + const int32_t max_time = lstm_dims->max_time; + + const int32_t num_output = lstm_dims->num_outputs; + const int32_t out_batch_leading_dim = num_output; + + // num_cell = num_output is considered in the code under the assumption that projection is NULL. + const int32_t num_cell = num_output; + + if (projection_weights != NULL) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + if (lstm->i2f_effective_bias == NULL || lstm->i2c_effective_bias == NULL || lstm->i2o_effective_bias == NULL) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + if (lstm->r2f_effective_bias == NULL || lstm->r2c_effective_bias == NULL || lstm->r2o_effective_bias == NULL) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + if (lstm->i2i_effective_bias == NULL || lstm->r2i_effective_bias == NULL) + { + return ARM_CMSIS_NN_ARG_ERROR; + } + + if (lstm->time_major) + { + const int32_t in_step = num_batch * num_input; + const int32_t out_step = num_batch * out_batch_leading_dim; + for (int i_max_time = 0; i_max_time < max_time; i_max_time++) + { + arm_cmsis_nn_status status = arm_nn_lstm_step_s8_s16(input_data + i_max_time * in_step, + in_to_in_weights, + in_to_forget_weights, + in_to_cell_weights, + in_to_out_weights, + recurrent_to_in_weights, + recurrent_to_forget_weights, + recurrent_to_cell_weights, + recurrent_to_out_weights, + lstm, + num_batch, + num_cell, + num_input, + num_output, + output_state, + cell_state, + output_data + i_max_time * out_step, + scratch_buffers); + if (status != ARM_CMSIS_NN_SUCCESS) + { + return status; + } + } + } + else + { + for (int i_num_batch = 0; i_num_batch < num_batch; i_num_batch++) + { + const int32_t in_step = num_input; + const int32_t out_step = out_batch_leading_dim; + for (int i_max_time = 0; i_max_time < max_time; i_max_time++) + { + const int32_t time_offset = i_num_batch * max_time + i_max_time; + + arm_cmsis_nn_status status = arm_nn_lstm_step_s8_s16(input_data + time_offset * in_step, + in_to_in_weights, + in_to_forget_weights, + in_to_cell_weights, + in_to_out_weights, + recurrent_to_in_weights, + recurrent_to_forget_weights, + recurrent_to_cell_weights, + recurrent_to_out_weights, + lstm, + /*num_batch=*/1, + num_cell, + num_input, + num_output, + output_state + i_num_batch * out_batch_leading_dim, + cell_state + i_num_batch * num_cell, + output_data + time_offset * out_step, + scratch_buffers); + if (status != ARM_CMSIS_NN_SUCCESS) + { + return status; + } + } + } + } + + return ARM_CMSIS_NN_SUCCESS; +} + +/** + * @} end of LSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c 
b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c index 263ff780..c75694b2 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c @@ -21,8 +21,8 @@ * Title: arm_nn_depthwise_conv_nt_t_padded_s8.c * Description: Depthwise convolution with padded matrices. * - * $Date: 27. July 2022 - * $Revision: V.2.0.0 + * $Date: 26 October 2022 + * $Revision: V.2.0.1 * * Target Processor: Cortex-M processors with MVE extension * -------------------------------------------------------------------- */ @@ -53,8 +53,8 @@ * */ -arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, - const q7_t *rhs, +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const int8_t *lhs, + const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, @@ -65,7 +65,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, - q7_t *out) + int8_t *out) { #if defined(ARM_MATH_MVEI) int32_t loop_count = (active_ch + 3) / 4; diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c index 7bcff790..b623b896 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c @@ -21,8 +21,8 @@ * Title: arm_nn_depthwise_conv_nt_t_s16.c * Description: Depthwise convolution on matrices with no padding. * - * $Date: 6 July 2022 - * $Revision: V.1.0.0 + * $Date: 26 October 2022 + * $Revision: V.1.0.1 * * Target Processor: Cortex-M processors with MVE extension * -------------------------------------------------------------------- */ @@ -45,7 +45,7 @@ * */ int16_t *arm_nn_depthwise_conv_nt_t_s16(const int16_t *lhs, - const q7_t *rhs, + const int8_t *rhs, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c index 30700631..6ec09708 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c @@ -21,8 +21,8 @@ * Title: arm_nn_depthwise_conv_nt_t_s8.c * Description: Depthwise convolution on matrices with no padding. * - * $Date: 27. July 2022 - * $Revision: V.2.0.0 + * $Date: 26 October 2022 + * $Revision: V.2.0.1 * * Target Processor: Cortex-M processors with MVE extension. * -------------------------------------------------------------------- */ @@ -44,8 +44,8 @@ * Refer header file for details. 
* */ -arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, - const q7_t *rhs, +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const int8_t *lhs, + const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, @@ -56,7 +56,7 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, - q7_t *out) + int8_t *out) { #if defined(ARM_MATH_MVEI) const int32_t *bias = output_bias; diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s8_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s8_s16.c new file mode 100644 index 00000000..020dd197 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s8_s16.c @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_calculate_gate_s8_s16.c + * Description: Update single gate for an incremental step of LSTM function. + * + * $Date: 8 September 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nn_tables.h" +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @defgroup supportLSTM LSTM + * + * Support functions for LSTM + * + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Calculates a single LSTM gate, int8x8_16 version. 
+ * Refer to header file for details + */ +void arm_nn_lstm_calculate_gate_s8_s16(const int8_t *input, + const int8_t *input_to_gate_weights, + const int32_t *input_to_gate_bias, + const cmsis_nn_scaling input_to_gate_scaling, + const int8_t *output_state, + const int8_t *recurrent_to_gate_weights, + const int32_t *recurrent_to_gate_bias, + const cmsis_nn_scaling recurrent_to_gate, + const int32_t n_batch, + const int32_t n_input, + const int32_t n_output, + const int32_t n_cell, + const arm_nn_activation_type activation_type, + int16_t *gate) +{ + const int32_t n_block = n_batch * n_cell; + + memset(gate, 0, n_block * sizeof(int16_t)); + arm_nn_vec_mat_mul_result_acc_s8(input, + input_to_gate_weights, + input_to_gate_bias, + gate, + 0, + input_to_gate_scaling.multiplier, + input_to_gate_scaling.shift, + n_input, + n_cell, + n_batch); + + arm_nn_vec_mat_mul_result_acc_s8(output_state, + recurrent_to_gate_weights, + recurrent_to_gate_bias, + gate, + 0, + recurrent_to_gate.multiplier, + recurrent_to_gate.shift, + n_output, + n_cell, + n_batch); + + arm_nn_activation_s16(gate, gate, n_block, 0, activation_type); +} +/** + * @} end of supportLSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_step_s8_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_step_s8_s16.c new file mode 100644 index 00000000..e4fd5b8b --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_step_s8_s16.c @@ -0,0 +1,154 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_step_s8_s16.c + * Description: Update LSTM function for a single iteration step. + * + * $Date: 9 February 2023 + * $Revision: V.1.1.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Calculate the output state tensor of an LSTM step, s8 input/output and s16 weight version. + * Refer to header file for details. 
+ */ +arm_cmsis_nn_status arm_nn_lstm_step_s8_s16(const int8_t *input, + const int8_t *input_to_input_weight, + const int8_t *input_to_forget_weight, + const int8_t *input_to_cell_weight, + const int8_t *input_to_output_weight, + const int8_t *recurrent_to_input_weight, + const int8_t *recurrent_to_forget_weight, + const int8_t *recurrent_to_cell_weight, + const int8_t *recurrent_to_output_weight, + const cmsis_nn_lstm_params *lstm, + const int n_batch, + const int n_cell, + const int n_input, + const int n_output, + int8_t *output_state, + int16_t *cell_state, + int8_t *output, + cmsis_nn_lstm_context *scratch_buffers) +{ + const int32_t n_block = n_batch * n_cell; + + // Calculate the input gate + arm_nn_lstm_calculate_gate_s8_s16(input, + input_to_input_weight, + lstm->i2i_effective_bias, + lstm->input_to_input_scaling, + output_state, + recurrent_to_input_weight, + lstm->r2i_effective_bias, + lstm->recurrent_to_input_scaling, + n_batch, + n_input, + n_output, + n_cell, + ARM_SIGMOID, + scratch_buffers->input_gate); + + // Calculate the forget gate + arm_nn_lstm_calculate_gate_s8_s16(input, + input_to_forget_weight, + lstm->i2f_effective_bias, + lstm->input_to_forget_scaling, + output_state, + recurrent_to_forget_weight, + lstm->r2f_effective_bias, + lstm->recurrent_to_forget_scaling, + n_batch, + n_input, + n_output, + n_cell, + ARM_SIGMOID, + scratch_buffers->forget_gate); + + // Calculate the cell update gate + arm_nn_lstm_calculate_gate_s8_s16(input, + input_to_cell_weight, + lstm->i2c_effective_bias, + lstm->input_to_cell_scaling, + output_state, + recurrent_to_cell_weight, + lstm->r2c_effective_bias, + lstm->recurrent_to_cell_scaling, + n_batch, + n_input, + n_output, + n_cell, + ARM_TANH, + scratch_buffers->cell_gate); + + // Update the cell state + arm_nn_lstm_update_cell_state_s16(n_block, + lstm->cell_state_shift, + cell_state, + scratch_buffers->input_gate, + scratch_buffers->forget_gate, + scratch_buffers->cell_gate); + + // Calculate the output gate + arm_nn_lstm_calculate_gate_s8_s16(input, + input_to_output_weight, + lstm->i2o_effective_bias, + lstm->input_to_output_scaling, + output_state, + recurrent_to_output_weight, + lstm->r2o_effective_bias, + lstm->recurrent_to_output_scaling, + n_batch, + n_input, + n_output, + n_cell, + ARM_SIGMOID, + scratch_buffers->output_gate); + + // Update the output state + arm_nn_lstm_update_output_s8_s16(n_batch, + n_cell, + cell_state, + lstm->cell_state_shift, + scratch_buffers->output_gate, + lstm->hidden_scaling, + lstm->hidden_offset, + output_state, + scratch_buffers->input_gate); + + arm_memcpy_s8(output, output_state, n_batch * n_output * sizeof(int8_t)); + + return ARM_CMSIS_NN_SUCCESS; +} +/** + * @} end of supportLSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_cell_state_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_cell_state_s16.c new file mode 100644 index 00000000..4a81c288 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_cell_state_s16.c @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_update_cell_state_s16.c + * Description: Update cell state for an incremental step of LSTM function. + * + * $Date: 20 January 2023 + * $Revision: V.1.2.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Update cell state for a single LSTM iteration step, int8x8_16 version. + * + * Refer to header file for more details + */ +void arm_nn_lstm_update_cell_state_s16(const int32_t n_block, + const int32_t cell_state_scale, + int16_t *cell_state, + const int16_t *input_gate, + const int16_t *forget_gate, + const int16_t *cell_gate) +{ + const int32_t cell_scale = 30 + cell_state_scale; + int32_t loop_count = n_block; + +#if defined(ARM_MATH_MVEI) + + while (loop_count > 0) + { + mve_pred16_t p = vctp32q(loop_count); + loop_count -= 4; + + int32x4_t res_1 = vmulq_s32(vldrhq_z_s32(cell_state, p), vldrhq_z_s32(forget_gate, p)); + forget_gate += 4; + res_1 = arm_divide_by_power_of_two_mve(res_1, 15); + int32x4_t res_2 = vmulq_s32(vldrhq_z_s32(input_gate, p), vldrhq_z_s32(cell_gate, p)); + input_gate += 4; + cell_gate += 4; + + res_2 = arm_divide_by_power_of_two_mve(res_2, cell_scale); + res_1 += res_2; + + res_1 = vmaxq_s32(res_1, vdupq_n_s32(NN_Q15_MIN)); + res_1 = vminq_s32(res_1, vdupq_n_s32(NN_Q15_MAX)); + + vstrhq_p_s32(cell_state, res_1, p); + cell_state += 4; + } +#else + #if defined(ARM_MATH_DSP) + while (loop_count > 1) + { + int32_t cell_state_01 = arm_nn_read_s16x2(cell_state); + int32_t forget_gate_01 = arm_nn_read_q15x2_ia(&forget_gate); + + int32_t value_00 = SMULBB(cell_state_01, forget_gate_01); + int32_t value_01 = SMULTT(cell_state_01, forget_gate_01); + value_00 = arm_nn_divide_by_power_of_two(value_00, 15); + value_01 = arm_nn_divide_by_power_of_two(value_01, 15); + + int32_t input_gate_01 = arm_nn_read_q15x2_ia(&input_gate); + int32_t cell_gate_01 = arm_nn_read_q15x2_ia(&cell_gate); + + int32_t value_10 = SMULBB(input_gate_01, cell_gate_01); + int32_t value_11 = SMULTT(input_gate_01, cell_gate_01); + + value_10 = arm_nn_divide_by_power_of_two(value_10, cell_scale); + value_11 = arm_nn_divide_by_power_of_two(value_11, cell_scale); + + value_00 += value_10; + value_01 += value_11; + + value_00 = CLAMP(value_00, NN_Q15_MAX, NN_Q15_MIN); + value_01 = CLAMP(value_01, NN_Q15_MAX, NN_Q15_MIN); + + arm_nn_write_q15x2_ia(&cell_state, PACK_Q15x2_32x1(value_00, value_01)); + loop_count -= 2; + } + #endif + for (int i = 0; i < loop_count; i++) + { + int32_t value = cell_state[i] * forget_gate[i]; + int32_t value_1 = input_gate[i] * cell_gate[i]; + + value = arm_nn_divide_by_power_of_two(value, 15); + value_1 = arm_nn_divide_by_power_of_two(value_1, cell_scale); + + cell_state[i] = CLAMP(value + value_1, NN_Q15_MAX, NN_Q15_MIN); + } +#endif // #if defined(ARM_MATH_MVEI) +} +/** + * @} end of supportLSTM group + */ 
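[Editor's note] The scalar tail of arm_nn_lstm_update_cell_state_s16 above combines two Q15 products at different scales: cell_state * forget_gate is renormalized by 2^15, input_gate * cell_gate by 2^(30 + cell_state_scale), and the saturated sum is written back as Q15. A minimal plain-C sketch of that arithmetic follows for readers tracing the fixed-point math; the names divide_by_power_of_two_ref and cell_state_update_ref are illustrative stand-ins, not CMSIS-NN API, and the rounding shown approximates arm_nn_divide_by_power_of_two (tie handling for negative values may differ marginally).

#include <stdint.h>

#define NN_Q15_MAX 32767
#define NN_Q15_MIN -32768

/* Rounding arithmetic shift right (assumes exponent >= 1). */
static int32_t divide_by_power_of_two_ref(int32_t value, int32_t exponent)
{
    return (value + (1 << (exponent - 1))) >> exponent;
}

static void cell_state_update_ref(int32_t n_block,
                                  int32_t cell_state_scale,
                                  int16_t *cell_state,
                                  const int16_t *input_gate,
                                  const int16_t *forget_gate,
                                  const int16_t *cell_gate)
{
    /* cell_state_scale is a (typically negative) power-of-two exponent,
       so cell_scale = 30 + cell_state_scale stays >= 1 in practice. */
    const int32_t cell_scale = 30 + cell_state_scale;

    for (int32_t i = 0; i < n_block; i++)
    {
        /* Fraction of the previous state kept by the forget gate. */
        int32_t keep = divide_by_power_of_two_ref(cell_state[i] * forget_gate[i], 15);
        /* New candidate admitted by the input gate, brought to the same scale. */
        int32_t update = divide_by_power_of_two_ref(input_gate[i] * cell_gate[i], cell_scale);
        int32_t result = keep + update;

        result = result > NN_Q15_MAX ? NN_Q15_MAX : result;
        result = result < NN_Q15_MIN ? NN_Q15_MIN : result;
        cell_state[i] = (int16_t)result;
    }
}

The MVE and DSP paths in the file above correspond to this loop unrolled four and two elements at a time, respectively.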
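[Editor's note] Many hunks in this sync rename the __SMLAD/__SXTB16/__PKHBT intrinsic spellings to SMLAD/SXTB16/PKHBT; the underlying instruction is unchanged. SMLAD performs two signed 16 x 16 multiplies on the packed halfwords of its operands and adds both products to the accumulator, which is why the convolution kernels in this sync pack pairs of Q15 values with PKHBT/PKHTB before each MAC. A plain-C behavioural model, illustrative only (assumes 32-bit int and ignores the instruction's Q-flag overflow signalling):

#include <stdint.h>

/* SMLAD(x, y, acc) = acc + lo16(x) * lo16(y) + hi16(x) * hi16(y),
   with all four halfwords treated as signed 16-bit values. */
static int32_t smlad_ref(int32_t x, int32_t y, int32_t acc)
{
    int32_t x_lo = (int16_t)(x & 0xFFFF);
    int32_t x_hi = (int16_t)(((uint32_t)x) >> 16);
    int32_t y_lo = (int16_t)(y & 0xFFFF);
    int32_t y_hi = (int16_t)(((uint32_t)y) >> 16);

    return acc + x_lo * y_lo + x_hi * y_hi;
}

With b0 holding two consecutive int16 inputs and a01 holding two weights sign-extended from int8 by read_and_pad, one SMLAD therefore retires two multiply-accumulates; the four-accumulator loops in these kernels retire eight MACs per iteration, as the in-code comments note.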
diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_output_s8_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_output_s8_s16.c new file mode 100644 index 00000000..3367e3f2 --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_lstm_update_output_s8_s16.c @@ -0,0 +1,81 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_update_output_s8_s16.c + * Description: Update output gate for an incremental step of LSTM function. + * + * $Date: 13 February 2023 + * $Revision: V.2.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Calculate the output state tensor of an LSTM step, s8 input/output and s16 weight version. + * Refer to header files for details + */ +void arm_nn_lstm_update_output_s8_s16(const int n_batch, + const int n_cell, + int16_t *cell_state, + const int32_t cell_state_scale, + const int16_t *output_gate, + const cmsis_nn_scaling hidden_scaling, + const int32_t hidden_offset, + int8_t *output_state, + int16_t *cell_gate_scratch) +{ + const int32_t size = n_batch * n_cell; + + int32_t tanh_input_left_shift = (15 + cell_state_scale) - 3; + if (tanh_input_left_shift < 0) + { + tanh_input_left_shift = -tanh_input_left_shift; + for (int32_t i = 0; i < size; i++) + { + cell_state[i] = cell_state[i] >> tanh_input_left_shift; + } + tanh_input_left_shift = 0; + } + arm_nn_activation_s16(cell_state, cell_gate_scratch, size, tanh_input_left_shift, ARM_TANH); + + arm_elementwise_mul_s16_s8(output_gate, + cell_gate_scratch, + output_state, + hidden_offset, + hidden_scaling.multiplier, + hidden_scaling.shift, + size); +} +/** + * @} end of supportLSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c index 5b227d4c..03dc7ab6 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_mat_mul_core_1x_s8.c * Description: General Matrix-multiplication function * - * $Date: 22 Aug 2022 - * $Revision: V.3.1.0 + * $Date: 20 January 2023 + * $Revision: V.3.1.3 * - * 
Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" @@ -70,27 +70,28 @@ arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, int32_t sum_tmp = 0; -#if defined(ARM_MATH_AUTOVECTORIZE) + #if defined(ARM_MATH_AUTOVECTORIZE) for (int j = 0; j < row_elements; j++) { int32_t col = col_base[j]; sum_tmp += col; acc_n0 += row_base[j] * col; } -#else - __ASM volatile(" vldrb.8 q0, [%[col]], #16 \n" - " wlstp.8 lr, %[cnt], 1f \n" + #else + __ASM volatile(" .p2align 2 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " wlstp.8 lr, %[cnt], 1f \n" "2: \n" - " vaddva.s8 %[sum], q0 \n" - " vldrb.8 q1, [%[row0]], #16 \n" - " vmladava.s8 %[out0], q0, q1 \n" - " vldrb.8 q0, [%[col]], #16 \n" - " letp lr, 2b \n" + " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], #16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " letp lr, 2b \n" "1: \n" : [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(row_base), [out0] "+Te"(acc_n0) : [cnt] "r"(row_elements) : "q0", "q1", "memory", "r14"); -#endif + #endif sum_tmp *= conv_params->input_offset; acc_n0 += sum_tmp; @@ -129,8 +130,9 @@ arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, acc_n0 += conv_params->output_offset; acc_n0 = MAX(acc_n0, conv_params->activation.min); acc_n0 = MIN(acc_n0, conv_params->activation.max); - *output++ = (q7_t)acc_n0; + *output++ = (int8_t)acc_n0; } + return ARM_CMSIS_NN_SUCCESS; #else (void)row_elements; @@ -142,8 +144,8 @@ arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, (void)quant_params; (void)bias; (void)output; + return ARM_CMSIS_NN_NO_IMPL_ERROR; #endif - return ARM_CMSIS_NN_SUCCESS; } /** diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c index 8b69107c..643a9c7f 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c @@ -21,8 +21,8 @@ * Title: arm_nn_mat_mul_core_4x_s8.c * Description: General matrix multiplication function for MVE extension * - * $Date: 22. 
Aug 2022 - * $Revision: V.3.1.0 + * $Date: 13 December 2022 + * $Revision: V.3.1.1 * * Target Processor: Cortex-M processors * -------------------------------------------------------------------- */ @@ -81,19 +81,20 @@ int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements, acc_n3 += ip_row_3[j] * col; } #else - __ASM volatile(" vldrb.8 q0, [%[col]], #16 \n" + __ASM volatile(" .p2align 2 \n" + " vldrb.8 q0, [%[col]], #16 \n" " wlstp.8 lr, %[cnt], 1f \n" "2: \n" " vaddva.s8 %[sum], q0 \n" - " vldrb.8 q1, [%[row0]], #16 \n" + " vldrb.8 q1, [%[row0]], #16 \n" " vmladava.s8 %[out0], q0, q1 \n" - " vldrb.8 q2, [%[row1]], #16 \n" + " vldrb.8 q2, [%[row1]], #16 \n" " vmladava.s8 %[out1], q0, q2 \n" - " vldrb.8 q3, [%[row2]], #16 \n" + " vldrb.8 q3, [%[row2]], #16 \n" " vmladava.s8 %[out2], q0, q3 \n" - " vldrb.8 q4, [%[row3]], #16 \n" + " vldrb.8 q4, [%[row3]], #16 \n" " vmladava.s8 %[out3], q0, q4 \n" - " vldrb.8 q0, [%[col]], #16 \n" + " vldrb.8 q0, [%[col]], #16 \n" " letp lr, 2b \n" "1: \n" : [col] "+r"(col_base), diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c index 2295b0fe..28cd534a 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates + * * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +22,10 @@ * Title: arm_nn_mat_mult_kernel_s16.c * Description: Matrix-multiplication function for convolution * - * $Date: 12 August 2021 - * $Revision: V.1.1.0 + * $Date: 5 January 2023 + * $Revision: V.1.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" @@ -46,74 +47,74 @@ * */ -q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, - const q15_t *input_b, - const int32_t output_ch, - const int32_t *out_shift, - const int32_t *out_mult, - const int16_t activation_min, - const int16_t activation_max, - const int32_t num_col_a, - const int64_t *const output_bias, - q15_t *out_0) +int16_t *arm_nn_mat_mult_kernel_s16(const int8_t *input_a, + const int16_t *input_b, + const int32_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int16_t activation_min, + const int16_t activation_max, + const int32_t num_col_a, + const int64_t *const output_bias, + int16_t *out_0) { #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) /* set up the second output pointers */ - q15_t *out_1 = out_0 + output_ch; + int16_t *out_1 = out_0 + output_ch; const int64_t *bias = output_bias; uint16_t row_count = output_ch / 2; - const q7_t *ip_a0 = input_a; + const int8_t *ip_a0 = input_a; /* this loop over rows in A */ while (row_count) { /* setup pointers for B */ - const q15_t *ip_b0 = input_b; - const q15_t *ip_b1 = ip_b0 + num_col_a; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 + num_col_a; /* align the second pointer for A */ - const q7_t *ip_a1 = ip_a0 + num_col_a; + const int8_t *ip_a1 = ip_a0 + num_col_a; /* Init accumulator for channel N and N + 1 */ - q31_t ch_0_out_0 = 0; - q31_t ch_0_out_1 = 0; - q31_t ch_1_out_0 = 0; - q31_t ch_1_out_1 = 0; + int32_t ch_0_out_0 = 0; + int32_t 
ch_0_out_1 = 0; + int32_t ch_1_out_0 = 0; + int32_t ch_1_out_1 = 0; uint16_t col_count = num_col_a / 4; /* accumulate over the vector */ while (col_count) { - q31_t a01, a02, a11, a12; - q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); - q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + int32_t a01, a02, a11, a12; + int32_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + int32_t b1 = arm_nn_read_q15x2_ia(&ip_b1); ip_a0 = read_and_pad(ip_a0, &a01, &a02); ip_a1 = read_and_pad(ip_a1, &a11, &a12); - ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); - ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); - ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + ch_0_out_0 = SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = SMLAD(a11, b1, ch_1_out_1); b0 = arm_nn_read_q15x2_ia(&ip_b0); b1 = arm_nn_read_q15x2_ia(&ip_b1); - ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); - ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); - ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + ch_0_out_0 = SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = SMLAD(a12, b1, ch_1_out_1); col_count--; } /* while over col_count */ col_count = num_col_a & 0x3; while (col_count) { - q7_t a0 = *ip_a0++; - q15_t b0 = *ip_b0++; - q7_t a1 = *ip_a1++; - q15_t b1 = *ip_b1++; + int8_t a0 = *ip_a0++; + int16_t b0 = *ip_b0++; + int8_t a1 = *ip_a1++; + int16_t b1 = *ip_b1++; ch_0_out_0 += a0 * b0; ch_0_out_1 += a0 * b1; @@ -123,8 +124,8 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } /* while over col_count */ if (bias) { - q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); - q63_t acc_64 = ch_0_out_0 + *bias; + int32_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); + int64_t acc_64 = ch_0_out_0 + *bias; ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); acc_64 = ch_0_out_1 + *bias++; ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); @@ -138,17 +139,17 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } ch_0_out_0 = MAX(ch_0_out_0, activation_min); ch_0_out_0 = MIN(ch_0_out_0, activation_max); - *out_0++ = (q15_t)ch_0_out_0; + *out_0++ = (int16_t)ch_0_out_0; ch_0_out_1 = MAX(ch_0_out_1, activation_min); ch_0_out_1 = MIN(ch_0_out_1, activation_max); - *out_1++ = (q15_t)ch_0_out_1; + *out_1++ = (int16_t)ch_0_out_1; out_shift++; if (bias) { - q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); - q63_t acc_64 = ch_1_out_0 + *bias; + int32_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); + int64_t acc_64 = ch_1_out_0 + *bias; ch_1_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); acc_64 = ch_1_out_1 + *bias++; ch_1_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); @@ -162,11 +163,11 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } ch_1_out_0 = MAX(ch_1_out_0, activation_min); ch_1_out_0 = MIN(ch_1_out_0, activation_max); - *out_0++ = (q15_t)ch_1_out_0; + *out_0++ = (int16_t)ch_1_out_0; ch_1_out_1 = MAX(ch_1_out_1, activation_min); ch_1_out_1 = MIN(ch_1_out_1, activation_max); - *out_1++ = (q15_t)ch_1_out_1; + *out_1++ = (int16_t)ch_1_out_1; out_shift++; /* skip row */ @@ -178,37 +179,37 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, if (output_ch & 0x1) { /* setup pointers for B */ - const q15_t *ip_b0 = input_b; - const q15_t *ip_b1 = ip_b0 + num_col_a; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 
+ num_col_a; - q31_t ch_0_out_0 = 0; - q31_t ch_0_out_1 = 0; + int32_t ch_0_out_0 = 0; + int32_t ch_0_out_1 = 0; uint16_t col_count = num_col_a >> 2; while (col_count) { - q31_t a01, a02; - q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); - q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + int32_t a01, a02; + int32_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + int32_t b1 = arm_nn_read_q15x2_ia(&ip_b1); ip_a0 = read_and_pad(ip_a0, &a01, &a02); - ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_0_out_0 = SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a01, b1, ch_0_out_1); b0 = arm_nn_read_q15x2_ia(&ip_b0); b1 = arm_nn_read_q15x2_ia(&ip_b1); - ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); - ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_0_out_0 = SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = SMLAD(a02, b1, ch_0_out_1); col_count--; } col_count = num_col_a & 0x3; while (col_count) { - q7_t a0 = *ip_a0++; - q15_t b0 = *ip_b0++; - q15_t b1 = *ip_b1++; + int8_t a0 = *ip_a0++; + int16_t b0 = *ip_b0++; + int16_t b1 = *ip_b1++; ch_0_out_0 += a0 * b0; ch_0_out_1 += a0 * b1; @@ -216,8 +217,8 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } if (bias) { - q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); - q63_t acc_64 = ch_0_out_0 + *bias; + int32_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); + int64_t acc_64 = ch_0_out_0 + *bias; ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); acc_64 = ch_0_out_1 + *bias++; ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); @@ -229,11 +230,11 @@ q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, } ch_0_out_0 = MAX(ch_0_out_0, activation_min); ch_0_out_0 = MIN(ch_0_out_0, activation_max); - *out_0++ = (q15_t)ch_0_out_0; + *out_0++ = (int16_t)ch_0_out_0; ch_0_out_1 = MAX(ch_0_out_1, activation_min); ch_0_out_1 = MIN(ch_0_out_1, activation_max); - *out_1++ = (q15_t)ch_0_out_1; + *out_1++ = (int16_t)ch_0_out_1; out_mult++; out_shift++; } diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c index a446db8c..b8f9c14f 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_mat_mult_s8_nt_t_s8 * Description: Matrix multiplication support function with the right-hand-side (rhs) matrix transposed * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 5 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -45,10 +45,10 @@ * Refer header file for details. 
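The revised kernel below gains a rhs_cols_offset argument: the stride, in elements, between consecutive LHS rows, which lets a caller leave padding between rows. The previous behaviour corresponds to rhs_cols_offset == rhs_cols, and the function now rejects rhs_cols_offset < rhs_cols with ARM_CMSIS_NN_ARG_ERROR. A minimal scalar sketch of the indexing this implies (the helper name is ours; bias, per-channel requantization and the int8 clamp of the real kernel are left out):

    #include <stdint.h>

    // RHS is transposed, so out[m][n] is the dot product of LHS row m and RHS
    // row n. LHS rows sit rhs_cols_offset elements apart, of which only the
    // first rhs_cols are read; lhs_offset is the input zero-point correction.
    static void mat_mult_nt_t_s8_ref(const int8_t *lhs, const int8_t *rhs,
                                     int32_t *out, int32_t lhs_rows,
                                     int32_t rhs_rows, int32_t rhs_cols,
                                     int32_t rhs_cols_offset, int32_t lhs_offset)
    {
        for (int32_t m = 0; m < lhs_rows; ++m)
        {
            for (int32_t n = 0; n < rhs_rows; ++n)
            {
                int32_t acc = 0;
                for (int32_t k = 0; k < rhs_cols; ++k)
                {
                    acc += (lhs[m * rhs_cols_offset + k] + lhs_offset) * rhs[n * rhs_cols + k];
                }
                out[m * rhs_rows + n] = acc; // requantized and clamped to int8 in the real kernel
            }
        }
    }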
* */ -arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, - const q7_t *rhs, - const q31_t *bias, - q7_t *dst, +arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const int8_t *lhs, + const int8_t *rhs, + const int32_t *bias, + int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, @@ -57,18 +57,179 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, - const int32_t activation_max) + const int32_t activation_max, + const int32_t rhs_cols_offset) { -#if defined(ARM_MATH_DSP) + if (rhs_cols_offset < rhs_cols) + { + return ARM_CMSIS_NN_ARG_ERROR; + } +#if defined(ARM_MATH_MVEI) + + int8_t *out_ref = dst; + const int8_t *in_ref = lhs; + (void)out_ref; + (void)in_ref; + int32_t offset = rhs_cols_offset; + int i_items = 0; + for (; i_items <= (lhs_rows - 4); i_items += 4) + { + for (int i = 0; i < rhs_rows; i++) + { + int32_t acc_n0 = 0; + int32_t acc_n1 = 0; + int32_t acc_n2 = 0; + int32_t acc_n3 = 0; + + const int8_t *lhs_vec = lhs; + const int8_t *ip_row_1 = lhs + offset; + const int8_t *ip_row_2 = lhs + (2 * offset); + const int8_t *ip_row_3 = lhs + (3 * offset); + const int8_t *col_base = rhs + i * rhs_cols; + int32_t sum_tmp = 0; + + #if defined(ARM_MATH_AUTOVECTORIZE) + for (int j = 0; j < rhs_cols; j++) + { + int32_t col = col_base[j]; + sum_tmp += col; + acc_n0 += lhs_vec[j] * col; + acc_n1 += ip_row_1[j] * col; + acc_n2 += ip_row_2[j] * col; + acc_n3 += ip_row_3[j] * col; + } + #else + __ASM volatile(" .p2align 2 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" + " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], #16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q2, [%[row1]], #16 \n" + " vmladava.s8 %[out1], q0, q2 \n" + " vldrb.8 q3, [%[row2]], #16 \n" + " vmladava.s8 %[out2], q0, q3 \n" + " vldrb.8 q4, [%[row3]], #16 \n" + " vmladava.s8 %[out3], q0, q4 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " letp lr, 2b \n" + "1: \n" + : [col] "+r"(col_base), + [sum] "+Te"(sum_tmp), + [row0] "+r"(lhs_vec), + [row1] "+r"(ip_row_1), + [row2] "+r"(ip_row_2), + [row3] "+r"(ip_row_3), + [out0] "+Te"(acc_n0), + [out1] "+Te"(acc_n1), + [out2] "+Te"(acc_n2), + [out3] "+Te"(acc_n3) + : [cnt] "r"(rhs_cols) + : "q0", "q1", "q2", "q3", "q4", "memory", "r14"); + #endif + int32x4_t res = {acc_n0, acc_n1, acc_n2, acc_n3}; + sum_tmp *= lhs_offset; + if (bias) + { + sum_tmp += bias[i]; + } + res = vaddq_n_s32(res, sum_tmp); + + res = arm_requantize_mve(res, dst_multipliers[i], dst_shifts[i]); + res = vaddq_n_s32(res, dst_offset); + + res = vmaxq_s32(res, vdupq_n_s32(activation_min)); + res = vminq_s32(res, vdupq_n_s32(activation_max)); + + const uint32x4_t scatter_offset = {0, (uint32_t)rhs_rows, (uint32_t)rhs_rows * 2, (uint32_t)rhs_rows * 3}; + vstrbq_scatter_offset_s32(dst, scatter_offset, res); + dst++; + } + lhs += 4 * offset; + dst += (3 * rhs_rows); + } + + for (; i_items < lhs_rows; i_items++) + { + int32_t acc[4]; + const int32_t *multipliers = dst_multipliers; + const int32_t *shifts = dst_shifts; + for (int i = 0; i < rhs_rows; i++) + { + int32_t acc_n0 = 0; + const int8_t *lhs_vec = lhs; + const int8_t *col_base = rhs + i * rhs_cols; + int32_t sum_tmp = 0; + + #if defined(ARM_MATH_AUTOVECTORIZE) + for (int j = 0; j < rhs_cols; j++) + { + int32_t col = col_base[j]; + sum_tmp += col; + acc_n0 += lhs_vec[j] * col; + } + #else + __ASM volatile(" .p2align 2 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" 
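/* Low-overhead tail-predicated loop: wlstp.8 primes lr with the element
 * count (rhs_cols) and letp branches back to label 2, predicating the
 * final partial 16-byte vector automatically. Each pass accumulates the
 * column sum with vaddva.s8 (multiplied by lhs_offset after the loop, so
 * the zero-point correction is applied once per output) and the running
 * dot product with vmladava.s8. */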
+ " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], #16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " letp lr, 2b \n" + "1: \n" + : [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(lhs_vec), [out0] "+Te"(acc_n0) + : [cnt] "r"(rhs_cols) + : "q0", "q1", "memory", "r14"); + #endif + sum_tmp *= lhs_offset; + sum_tmp += acc_n0; + if (bias) + { + sum_tmp += bias[i]; + } + const int32_t index = i & 0x3; + acc[index] = sum_tmp; + + if (index == 3) + { + int32x4_t res = vldrwq_s32(acc); + res = arm_requantize_mve_32x4(res, vldrwq_s32(multipliers), vldrwq_s32(shifts)); + multipliers += 4; + shifts += 4; + res = vaddq_n_s32(res, dst_offset); + res = vmaxq_s32(res, vdupq_n_s32(activation_min)); + res = vminq_s32(res, vdupq_n_s32(activation_max)); + vstrbq_s32(dst, res); + dst += 4; + } + } + lhs += offset; + + for (int i = 0; i < (rhs_rows & 0x3); i++) + { + int32_t acc_n0 = acc[i]; + acc_n0 = arm_nn_requantize(acc_n0, multipliers[i], shifts[i]); + acc_n0 += dst_offset; + acc_n0 = MAX(acc_n0, activation_min); + acc_n0 = MIN(acc_n0, activation_max); + *dst++ = (int8_t)acc_n0; + } + } + +#elif defined(ARM_MATH_DSP) const int32_t off0 = rhs_cols - 4; + const int32_t lhs_off0 = rhs_cols_offset - 4; for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2) { - const q7_t *lhs_ptr = &lhs[0]; - q7_t *dst_ptr = &dst[0]; + const int8_t *lhs_ptr = &lhs[0]; + int8_t *dst_ptr = &dst[0]; - q31_t lhs_offset_contribution0 = 0; - q31_t lhs_offset_contribution1 = 0; + int32_t lhs_offset_contribution0 = 0; + int32_t lhs_offset_contribution1 = 0; for (int32_t x = 0; x < rhs_cols; ++x) { @@ -88,130 +249,158 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, while (lhs_rows_idx) { - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = lhs_offset_contribution0; - q31_t res01 = lhs_offset_contribution1; - q31_t res10 = lhs_offset_contribution0; - q31_t res11 = lhs_offset_contribution1; + int32_t res00 = lhs_offset_contribution0; + int32_t res01 = lhs_offset_contribution1; + int32_t res10 = lhs_offset_contribution0; + int32_t res11 = lhs_offset_contribution1; int32_t rhs_cols_idx = 0; - q31_t val0, val1, val2, val3, val4, val5; + int32_t val0, val1, val2, val3, val4, val5; for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) { - val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val2 = __SXTB16(val1); - val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val1 = __SXTB16_RORn(val1, 8); - val0 = __SXTB16_RORn(val0, 8); + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val2 = SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + res11 = SMLAD(val0, val4, res11); + + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = 
SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val3, val2, res00); - val5 = __SXTB16(val4); - res00 = __SMLAD(val0, val1, res00); - val4 = __SXTB16_RORn(val4, 8); - res01 = __SMLAD(val3, val5, res01); - res01 = __SMLAD(val0, val4, res01); + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); // 4 x MAC res10, res11 - val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); - val3 = __SXTB16(val0); - val0 = __SXTB16_RORn(val0, 8); - res10 = __SMLAD(val3, val2, res10); - res11 = __SMLAD(val3, val5, res11); - res10 = __SMLAD(val0, val1, res10); - val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - res11 = __SMLAD(val0, val4, res11); - - val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = __SXTB16(val1); - val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val1 = __SXTB16_RORn(val1, 8); - val0 = __SXTB16_RORn(val0, 8); + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + res11 = SMLAD(val0, val4, res11); + + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val3, val2, res00); - val5 = __SXTB16(val4); - res00 = __SMLAD(val0, val1, res00); - val4 = __SXTB16_RORn(val4, 8); - res01 = __SMLAD(val3, val5, res01); - res01 = __SMLAD(val0, val4, res01); + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); // 4 x MAC res10, res11 - val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); - val3 = __SXTB16(val0); - val0 = __SXTB16_RORn(val0, 8); - res10 = __SMLAD(val3, val2, res10); - res11 = __SMLAD(val3, val5, res11); - res10 = __SMLAD(val0, val1, res10); - val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - res11 = __SMLAD(val0, val4, res11); - - val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = __SXTB16(val1); - val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val1 = __SXTB16_RORn(val1, 8); - val0 = __SXTB16_RORn(val0, 8); + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + res11 = SMLAD(val0, val4, res11); + + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val3, val2, res00); - val5 = __SXTB16(val4); - res00 = __SMLAD(val0, val1, res00); - val4 = __SXTB16_RORn(val4, 8); - res01 = __SMLAD(val3, val5, res01); - res01 = __SMLAD(val0, val4, res01); + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + 
res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); // 4 x MAC res10, res11 - val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); - val3 = __SXTB16(val0); - val0 = __SXTB16_RORn(val0, 8); - res10 = __SMLAD(val3, val2, res10); - res11 = __SMLAD(val3, val5, res11); - res10 = __SMLAD(val0, val1, res10); - val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - res11 = __SMLAD(val0, val4, res11); - - val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = __SXTB16(val1); - val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val1 = __SXTB16_RORn(val1, 8); - val0 = __SXTB16_RORn(val0, 8); + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + res11 = SMLAD(val0, val4, res11); + } + + for (; rhs_cols_idx <= (rhs_cols - 4); rhs_cols_idx += 4) + { + val1 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val2 = SXTB16(val1); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val4 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val1 = SXTB16_RORn(val1, 8); + val0 = SXTB16_RORn(val0, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val3, val2, res00); - val5 = __SXTB16(val4); - res00 = __SMLAD(val0, val1, res00); - val4 = __SXTB16_RORn(val4, 8); - res01 = __SMLAD(val3, val5, res01); - res01 = __SMLAD(val0, val4, res01); + res00 = SMLAD(val3, val2, res00); + val5 = SXTB16(val4); + res00 = SMLAD(val0, val1, res00); + val4 = SXTB16_RORn(val4, 8); + res01 = SMLAD(val3, val5, res01); + res01 = SMLAD(val0, val4, res01); // 4 x MAC res10, res11 - val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); - val3 = __SXTB16(val0); - val0 = __SXTB16_RORn(val0, 8); - res10 = __SMLAD(val3, val2, res10); - res11 = __SMLAD(val3, val5, res11); - res10 = __SMLAD(val0, val1, res10); - res11 = __SMLAD(val0, val4, res11); + val0 = arm_nn_read_s8x4((const int8_t *)&lhs_ptr[lhs_off0]); + val3 = SXTB16(val0); + val0 = SXTB16_RORn(val0, 8); + res10 = SMLAD(val3, val2, res10); + res11 = SMLAD(val3, val5, res11); + res10 = SMLAD(val0, val1, res10); + res11 = SMLAD(val0, val4, res11); } for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q7_t rhs_value0 = rhs_ptr[0]; - q7_t rhs_value1 = rhs_ptr[rhs_cols]; - q7_t lhs_value = lhs_ptr[0]; + int8_t rhs_value0 = rhs_ptr[0]; + int8_t rhs_value1 = rhs_ptr[rhs_cols]; + int8_t lhs_value = lhs_ptr[0]; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; - lhs_value = lhs_ptr[rhs_cols]; + lhs_value = lhs_ptr[rhs_cols_offset]; res10 += lhs_value * rhs_value0; res11 += lhs_value * rhs_value1; @@ -241,14 +430,15 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res11 = MAX(res11, activation_min); res11 = MIN(res11, activation_max); - dst_ptr[0] = (q7_t)res00; - dst_ptr[1] = (q7_t)res01; + dst_ptr[0] = (int8_t)res00; + dst_ptr[1] = (int8_t)res01; dst_ptr += rhs_rows; - dst_ptr[0] = (q7_t)res10; - dst_ptr[1] = (q7_t)res11; + dst_ptr[0] = (int8_t)res10; + dst_ptr[1] = (int8_t)res11; dst_ptr += rhs_rows; - lhs_ptr += rhs_cols; + lhs_ptr -= rhs_cols; + lhs_ptr += 2 * rhs_cols_offset; lhs_rows_idx--; } @@ -256,87 +446,106 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, // Left-over rows if (lhs_rows % 2) { - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = lhs_offset_contribution0; - q31_t res01 = 
lhs_offset_contribution1; + int32_t res00 = lhs_offset_contribution0; + int32_t res01 = lhs_offset_contribution1; int32_t rhs_cols_idx = 0; - q31_t val0, val1, val2, val3, val4, val5; + int32_t val0, val1, val2, val3, val4, val5; for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) { - val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val5 = __SXTB16(val2); - val4 = __SXTB16(val1); - val0 = __SXTB16_RORn(val0, 8); - val2 = __SXTB16_RORn(val2, 8); - val1 = __SXTB16_RORn(val1, 8); + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val5, val3, res00); - res00 = __SMLAD(val2, val0, res00); - res01 = __SMLAD(val5, val4, res01); - res01 = __SMLAD(val2, val1, res01); - - val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val5 = __SXTB16(val2); - val4 = __SXTB16(val1); - val0 = __SXTB16_RORn(val0, 8); - val2 = __SXTB16_RORn(val2, 8); - val1 = __SXTB16_RORn(val1, 8); + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); + + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val5, val3, res00); - res00 = __SMLAD(val2, val0, res00); - res01 = __SMLAD(val5, val4, res01); - res01 = __SMLAD(val2, val1, res01); - - val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val5 = __SXTB16(val2); - val4 = __SXTB16(val1); - val0 = __SXTB16_RORn(val0, 8); - val2 = __SXTB16_RORn(val2, 8); - val1 = __SXTB16_RORn(val1, 8); + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); + + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val5, val3, res00); - res00 = __SMLAD(val2, val0, res00); - res01 = __SMLAD(val5, val4, res01); - res01 = __SMLAD(val2, val1, res01); - - val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); - val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); - val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); - val3 = __SXTB16(val0); - val5 = __SXTB16(val2); - val4 = __SXTB16(val1); - val0 = __SXTB16_RORn(val0, 8); - val2 = __SXTB16_RORn(val2, 8); - val1 = __SXTB16_RORn(val1, 8); + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, 
res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); + + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); + } + + for (; rhs_cols_idx <= (rhs_cols - 4); rhs_cols_idx += 4) + { + val0 = arm_nn_read_s8x4_ia((const int8_t **)&rhs_ptr); + val1 = arm_nn_read_s8x4((const int8_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_s8x4_ia((const int8_t **)&lhs_ptr); + val3 = SXTB16(val0); + val5 = SXTB16(val2); + val4 = SXTB16(val1); + val0 = SXTB16_RORn(val0, 8); + val2 = SXTB16_RORn(val2, 8); + val1 = SXTB16_RORn(val1, 8); // 4 x MAC res00, res01 - res00 = __SMLAD(val5, val3, res00); - res00 = __SMLAD(val2, val0, res00); - res01 = __SMLAD(val5, val4, res01); - res01 = __SMLAD(val2, val1, res01); + res00 = SMLAD(val5, val3, res00); + res00 = SMLAD(val2, val0, res00); + res01 = SMLAD(val5, val4, res01); + res01 = SMLAD(val2, val1, res01); } // Left-over accumulations for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q7_t rhs_value0 = rhs_ptr[0]; - q7_t rhs_value1 = rhs_ptr[rhs_cols]; - q7_t lhs_value = lhs_ptr[0]; + int8_t rhs_value0 = rhs_ptr[0]; + int8_t rhs_value1 = rhs_ptr[rhs_cols]; + int8_t lhs_value = lhs_ptr[0]; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; @@ -359,8 +568,8 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res01 = MAX(res01, activation_min); res01 = MIN(res01, activation_max); - dst_ptr[0] = (q7_t)res00; - dst_ptr[1] = (q7_t)res01; + dst_ptr[0] = (int8_t)res00; + dst_ptr[1] = (int8_t)res01; } rhs += 2 * rhs_cols; @@ -369,13 +578,13 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, if (rhs_rows % 2) { - const q7_t *lhs_ptr = &lhs[0]; - q7_t *dst_ptr = &dst[0]; + const int8_t *lhs_ptr = &lhs[0]; + int8_t *dst_ptr = &dst[0]; for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx) { - const q7_t *rhs_ptr = &rhs[0]; - q31_t res00 = 0; + const int8_t *rhs_ptr = &rhs[0]; + int32_t res00 = 0; if (bias) { res00 = bias[rhs_rows - 1]; @@ -383,14 +592,16 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q31_t rhs_value = rhs_ptr[0]; - q31_t lhs_value = lhs_ptr[0] + lhs_offset; + int32_t rhs_value = rhs_ptr[0]; + int32_t lhs_value = lhs_ptr[0] + lhs_offset; res00 += lhs_value * rhs_value; ++rhs_ptr; ++lhs_ptr; } + lhs_ptr -= rhs_cols; + lhs_ptr += rhs_cols_offset; // Quantize down res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]); @@ -402,18 +613,19 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res00 = MAX(res00, activation_min); res00 = MIN(res00, activation_max); - dst_ptr[0] = (q7_t)res00; + dst_ptr[0] = (int8_t)res00; dst_ptr += rhs_rows; } } #else + (void)rhs_cols_offset; for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2) { - const q7_t *lhs_ptr = &lhs[0]; - q7_t *dst_ptr = &dst[0]; + const int8_t *lhs_ptr = &lhs[0]; + int8_t *dst_ptr = &dst[0]; - q31_t lhs_offset_contribution0 = 0; - q31_t lhs_offset_contribution1 = 0; + int32_t lhs_offset_contribution0 = 0; + int32_t 
lhs_offset_contribution1 = 0; for (int32_t x = 0; x < rhs_cols; ++x) { @@ -433,23 +645,23 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, while (lhs_rows_idx) { - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = lhs_offset_contribution0; - q31_t res01 = lhs_offset_contribution1; - q31_t res10 = lhs_offset_contribution0; - q31_t res11 = lhs_offset_contribution1; + int32_t res00 = lhs_offset_contribution0; + int32_t res01 = lhs_offset_contribution1; + int32_t res10 = lhs_offset_contribution0; + int32_t res11 = lhs_offset_contribution1; for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) { - q7_t rhs_value0 = rhs_ptr[0]; - q7_t rhs_value1 = rhs_ptr[rhs_cols]; - q7_t lhs_value = lhs_ptr[0]; + int8_t rhs_value0 = rhs_ptr[0]; + int8_t rhs_value1 = rhs_ptr[rhs_cols]; + int8_t lhs_value = lhs_ptr[0]; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; - lhs_value = lhs_ptr[rhs_cols]; + lhs_value = lhs_ptr[rhs_cols_offset]; res10 += lhs_value * rhs_value0; res11 += lhs_value * rhs_value1; @@ -479,14 +691,15 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res11 = MAX(res11, activation_min); res11 = MIN(res11, activation_max); - dst_ptr[0] = (q7_t)res00; - dst_ptr[1] = (q7_t)res01; + dst_ptr[0] = (int8_t)res00; + dst_ptr[1] = (int8_t)res01; dst_ptr += rhs_rows; - dst_ptr[0] = (q7_t)res10; - dst_ptr[1] = (q7_t)res11; + dst_ptr[0] = (int8_t)res10; + dst_ptr[1] = (int8_t)res11; dst_ptr += rhs_rows; - lhs_ptr += rhs_cols; + lhs_ptr -= rhs_cols; + lhs_ptr += 2 * rhs_cols_offset; lhs_rows_idx--; } @@ -494,16 +707,16 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, // Left-over rows if (lhs_rows % 2) { - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = lhs_offset_contribution0; - q31_t res01 = lhs_offset_contribution1; + int32_t res00 = lhs_offset_contribution0; + int32_t res01 = lhs_offset_contribution1; for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) { - q7_t rhs_value0 = rhs_ptr[0]; - q7_t rhs_value1 = rhs_ptr[rhs_cols]; - q7_t lhs_value = lhs_ptr[0]; + int8_t rhs_value0 = rhs_ptr[0]; + int8_t rhs_value1 = rhs_ptr[rhs_cols]; + int8_t lhs_value = lhs_ptr[0]; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; @@ -526,8 +739,8 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res01 = MAX(res01, activation_min); res01 = MIN(res01, activation_max); - dst_ptr[0] = (q7_t)res00; - dst_ptr[1] = (q7_t)res01; + dst_ptr[0] = (int8_t)res00; + dst_ptr[1] = (int8_t)res01; } rhs += 2 * rhs_cols; @@ -536,13 +749,13 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, if (rhs_rows % 2) { - const q7_t *lhs_ptr = &lhs[0]; - q7_t *dst_ptr = &dst[0]; + const int8_t *lhs_ptr = &lhs[0]; + int8_t *dst_ptr = &dst[0]; for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx) { - const q7_t *rhs_ptr = &rhs[0]; - q31_t res00 = 0; + const int8_t *rhs_ptr = &rhs[0]; + int32_t res00 = 0; if (bias) { res00 = bias[rhs_rows - 1]; @@ -550,14 +763,16 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) { - q31_t rhs_value = rhs_ptr[0]; - q31_t lhs_value = lhs_ptr[0] + lhs_offset; + int32_t rhs_value = rhs_ptr[0]; + int32_t lhs_value = lhs_ptr[0] + lhs_offset; res00 += lhs_value * rhs_value; ++rhs_ptr; ++lhs_ptr; } + lhs_ptr -= rhs_cols; + lhs_ptr += rhs_cols_offset; // Quantize down res00 = arm_nn_requantize(res00, 
dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]); @@ -569,7 +784,7 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, res00 = MAX(res00, activation_min); res00 = MIN(res00, activation_max); - dst_ptr[0] = (q7_t)res00; + dst_ptr[0] = (int8_t)res00; dst_ptr += rhs_rows; } } diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s8.c new file mode 100644 index 00000000..b1427cca --- /dev/null +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s8.c @@ -0,0 +1,347 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_vec_mat_mul_result_acc_s8.c + * Description: Multiplies a matrix by a vector and accumulate with output. + * + * $Date: 20 January 2023 + * $Revision: V.1.2.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Refer to header file for details. 
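In scalar terms, each batch of this new support function computes, for every RHS row, the s8 dot product plus bias, rescales it with a single per-tensor multiplier/shift pair, adds dst_offset, accumulates onto the value already in the s16 destination, and saturates to the Q15 range; the MVE, DSP and pure-C paths below all implement this same arithmetic. A reference sketch (the helper name is ours; arm_nn_requantize, CLAMP and the NN_Q15_* limits come from arm_nnsupportfunctions.h):

    #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h"

    // One batch: dst[row] += requantize(lhs . rhs_row + bias[row]) + dst_offset,
    // saturated to [NN_Q15_MIN, NN_Q15_MAX].
    static void vec_mat_mul_result_acc_s8_ref(const int8_t *lhs, const int8_t *rhs,
                                              const int32_t *bias, int16_t *dst,
                                              const int32_t dst_offset,
                                              const int32_t dst_multiplier,
                                              const int32_t dst_shift,
                                              const int32_t rhs_cols,
                                              const int32_t rhs_rows)
    {
        for (int32_t row = 0; row < rhs_rows; ++row)
        {
            int32_t acc = bias[row];
            for (int32_t col = 0; col < rhs_cols; ++col)
            {
                acc += lhs[col] * rhs[row * rhs_cols + col];
            }
            acc = arm_nn_requantize(acc, dst_multiplier, dst_shift);
            acc += dst_offset + dst[row]; // accumulate with the existing output
            acc = CLAMP(acc, NN_Q15_MAX, NN_Q15_MIN);
            dst[row] = (int16_t)acc;
        }
    }

Accumulating rather than overwriting is what the supportLSTM group needs: the input and recurrent contributions of a gate can be summed into one buffer with two calls.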
+ */ +void arm_nn_vec_mat_mul_result_acc_s8(const int8_t *lhs_in, + const int8_t *rhs_in, + const int32_t *bias, + int16_t *dst, + const int32_t dst_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t batch) +{ + for (int i_batch = 0; i_batch < batch; ++i_batch) + { + const int8_t *rhs = rhs_in; + const int8_t *lhs = lhs_in + i_batch * rhs_cols; + +#if defined(ARM_MATH_MVEI) + const int32_t row_loop_cnt = rhs_rows / 4; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + int32_t acc_3 = 0; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + const int8_t *rhs_2 = rhs + 2 * rhs_cols; + const int8_t *rhs_3 = rhs + 3 * rhs_cols; + + int32_t col_cnt = rhs_cols; + + while (col_cnt > 0) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p); + acc_1 = vmladavaq_p_s8(acc_1, ker_1, input, p); + + const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p); + acc_2 = vmladavaq_p_s8(acc_2, ker_2, input, p); + + const int8x16_t ker_3 = vldrbq_z_s8(rhs_3, p); + acc_3 = vmladavaq_p_s8(acc_3, ker_3, input, p); + + lhs_vec += 16; + rhs_0 += 16; + rhs_1 += 16; + rhs_2 += 16; + rhs_3 += 16; + } + rhs += 4 * rhs_cols; + + int32x4_t acc = {acc_0, acc_1, acc_2, acc_3}; + int32x4_t b = vldrwq_s32(bias); + acc = vaddq_s32(acc, b); + bias += 4; + + acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); + acc = vaddq_s32(acc, vdupq_n_s32(dst_offset)); + + acc = vaddq_s32(acc, vldrhq_s32(dst)); + + acc = vmaxq_s32(acc, vdupq_n_s32(NN_Q15_MIN)); + acc = vminq_s32(acc, vdupq_n_s32(NN_Q15_MAX)); + + vstrhq_s32(dst, acc); + dst += 4; + } + + const int loop_cnt = rhs_rows % 4; + for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + int32_t col_cnt = rhs_cols; + + while (col_cnt > 0) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + lhs_vec += 16; + rhs_0 += 16; + } + rhs += rhs_cols; + + acc_0 += *bias; + bias++; + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_0 += dst_offset + *dst; + + // Clamp the result + acc_0 = CLAMP(acc_0, NN_Q15_MAX, NN_Q15_MIN); + *dst++ = (int16_t)acc_0; + } + +#elif defined(ARM_MATH_DSP) + const int32_t row_loop_cnt = rhs_rows / 2; + + for (int32_t i = 0; i < row_loop_cnt; i++) + { + int32_t acc_0 = *bias++; + int32_t acc_1 = *bias++; + + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + rhs += 2 * rhs_cols; + + for (int j = col_loop_cnt; j != 0; j--) + { + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTB16_RORn((uint32_t)vec_0, 8); + + vec_0 = SXTB16(vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + acc_1 = SMLAD(ker_1, vec_1, 
acc_1); + + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + } + + for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = *lhs_vec; + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0); + rhs_0++; + acc_1 += lhs_temp * (*rhs_1); + rhs_1++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset + *dst; + acc_1 += dst_offset + dst[1]; + // Clamp the result + acc_0 = CLAMP(acc_0, NN_Q15_MAX, NN_Q15_MIN); + acc_1 = CLAMP(acc_1, NN_Q15_MAX, NN_Q15_MIN); + + *dst++ = (int16_t)acc_0; + *dst++ = (int16_t)acc_1; + } + + if (rhs_rows & 0x1) + { + int32_t acc_0 = *bias++; + + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + + for (int i = col_loop_cnt; i != 0; i--) + { + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTB16_RORn((uint32_t)vec_0, 8); + vec_0 = SXTB16(vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + } + + for (int j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = *lhs_vec++; + acc_0 += lhs_temp * (*rhs_0++); + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset + *dst; + // Clamp the result + acc_0 = CLAMP(acc_0, NN_Q15_MAX, NN_Q15_MIN); + *dst++ = (int16_t)acc_0; + } + +#else + + const int32_t row_loop_cnt = rhs_rows / 3; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + const int8_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; + const int8_t *rhs_ptr_1 = &rhs[rhs_cols]; + const int8_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + int32_t res00 = *bias++; + int32_t res01 = *bias++; + int32_t res02 = *bias++; + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const int32_t rhs_value0 = (int8_t)*rhs_ptr_0; + const int32_t rhs_value1 = (int8_t)*rhs_ptr_1; + const int32_t rhs_value2 = (int8_t)*rhs_ptr_2; + const int32_t lhs_value = (int8_t)*lhs_ptr; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + res02 += lhs_value * rhs_value2; + + ++rhs_ptr_0; + ++rhs_ptr_1; + ++rhs_ptr_2; + ++lhs_ptr; + } + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift); + res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift); + + // Add offset + res00 += dst_offset + *dst; + res01 += dst_offset + dst[1]; + res02 += dst_offset + dst[2]; + + // Clamp the result + res00 = CLAMP(res00, NN_Q15_MAX, NN_Q15_MIN); + res01 = CLAMP(res01, NN_Q15_MAX, NN_Q15_MIN); + res02 = CLAMP(res02, NN_Q15_MAX, NN_Q15_MIN); + + dst[0] = (int16_t)res00; + dst[1] = (int16_t)res01; + dst[2] = (int16_t)res02; + dst += 3; + + rhs += 3 * rhs_cols; + } + + const int loop_cnt = rhs_rows % 3; + + for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) + { + const int8_t *lhs_ptr = &lhs[0]; + const int8_t *rhs_ptr = &rhs[0]; + + int32_t res00 = *bias++; + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + int32_t rhs_value0 = (int8_t)rhs_ptr[0]; + int32_t lhs_value = (int8_t)lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + + // Add offset + res00 += 
dst_offset + *dst; + + // Clamp the result + res00 = CLAMP(res00, NN_Q15_MAX, NN_Q15_MIN); + + *dst++ = (int16_t)res00; + rhs += rhs_cols; + } +#endif + } +} + +/** + * @} end of supportLSTM group + */ diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c index c273d3af..4f056739 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,14 +21,15 @@ * Title: arm_nn_vec_mat_mult_t_s16 * Description: s16 vector by matrix (transposed) multiplication * - * $Date: 11 August 2022 - * $Revision: V.2.1.0 + * $Date: 5 January 2023 + * $Revision: V.2.2.0 * - * Target Processor: Cortex-M + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" + #define MAX_COL_COUNT (512) /** @@ -46,10 +47,10 @@ * Refer header file for details. * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, - const q7_t *rhs, - const q63_t *bias, - q15_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const int16_t *lhs, + const int8_t *rhs, + const int64_t *bias, + int16_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, @@ -66,7 +67,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, rhs_cols_fast = MAX_COL_COUNT; } -#if defined(ARM_MATH_MVEI) + #if defined(ARM_MATH_MVEI) int32_t row_loop_cnt = rhs_rows / 4; int32_t col_loop_cnt = (rhs_cols_fast + 7) / 8; @@ -140,25 +141,25 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, tmp = arm_nn_requantize_s64(result_64_0, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; tmp = 0; tmp = arm_nn_requantize_s64(result_64_1, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; tmp = 0; tmp = arm_nn_requantize_s64(result_64_2, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; tmp = 0; tmp = arm_nn_requantize_s64(result_64_3, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; rhs += 4 * rhs_cols; } @@ -209,20 +210,20 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, tmp = arm_nn_requantize_s64(result_64, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; rhs += rhs_cols; } -#else // ARM_MATH_MVEI + #else // ARM_MATH_MVEI const int32_t row_loop_cnt = rhs_rows / 2; for (int32_t i = 0; i < row_loop_cnt; i++) { - q63_t acc_64_0 = 0; - q63_t acc_64_1 = 0; + int64_t acc_64_0 = 0; + int64_t acc_64_1 = 0; int32_t acc_0 = 0; int32_t acc_1 = 0; @@ -242,13 +243,13 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1); - acc_0 = __SMLAD(ker_0, vec_part_0, acc_0); - acc_0 = __SMLAD(ker_1, vec_part_1, acc_0); + acc_0 = 
SMLAD(ker_0, vec_part_0, acc_0); + acc_0 = SMLAD(ker_1, vec_part_1, acc_0); rhs_1 = read_and_pad(rhs_1, &ker_0, &ker_1); - acc_1 = __SMLAD(ker_0, vec_part_0, acc_1); - acc_1 = __SMLAD(ker_1, vec_part_1, acc_1); + acc_1 = SMLAD(ker_0, vec_part_0, acc_1); + acc_1 = SMLAD(ker_1, vec_part_1, acc_1); } acc_64_0 += acc_0; @@ -269,22 +270,22 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, acc_64_0 += *bias++; acc_64_1 += *bias++; } - q31_t tmp; + int32_t tmp; tmp = arm_nn_requantize_s64(acc_64_0, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; tmp = arm_nn_requantize_s64(acc_64_1, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; } if (rhs_rows & 0x1) { - q63_t acc_64_0 = 0; + int64_t acc_64_0 = 0; int32_t acc_0 = 0; const int32_t col_loop_cnt = rhs_cols_fast / 4; @@ -297,10 +298,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1); vec = arm_nn_read_q15x2_ia(&lhs_vec); - acc_0 = __SMLAD(ker_0, vec, acc_0); + acc_0 = SMLAD(ker_0, vec, acc_0); vec = arm_nn_read_q15x2_ia(&lhs_vec); - acc_0 = __SMLAD(ker_1, vec, acc_0); + acc_0 = SMLAD(ker_1, vec, acc_0); } acc_64_0 += acc_0; @@ -317,26 +318,26 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, { acc_64_0 += *bias++; } - q31_t tmp; + int32_t tmp; tmp = arm_nn_requantize_s64(acc_64_0, dst_multiplier, dst_shift); tmp = MAX(tmp, activation_min); tmp = MIN(tmp, activation_max); - *dst++ = (q15_t)tmp; + *dst++ = (int16_t)tmp; } -#endif // ARM_MATH_MVEI -#else // ARM_MATH_DSP + #endif // ARM_MATH_MVEI +#else // ARM_MATH_DSP for (int i_row_loop_cnt = 0; i_row_loop_cnt < rhs_rows; i_row_loop_cnt++) { - const q15_t *lhs_ptr = lhs; - const q7_t *rhs_ptr_0 = &rhs[0]; + const int16_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; - q63_t result = 0; + int64_t result = 0; for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - const q63_t rhs_value0 = (int8_t)*rhs_ptr_0; - const q63_t lhs_value = *lhs_ptr; + const int64_t rhs_value0 = (int8_t)*rhs_ptr_0; + const int64_t lhs_value = *lhs_ptr; result += lhs_value * rhs_value0; @@ -355,10 +356,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, result = ((result) > (activation_min) ? (result) : (activation_min)); result = ((result) < (activation_max) ? 
(result) : (activation_max)); - *dst++ = (q15_t)result; + *dst++ = (int16_t)result; rhs += rhs_cols; } -#endif // ARM_MATH_DSP +#endif // ARM_MATH_DSP return ARM_CMSIS_NN_SUCCESS; } diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c index 371a1830..938530a4 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_vec_mat_mult_t_s8 * Description: s8 vector by matrix (transposed) multiplication * - * $Date: 16 Aug 2022 - * $Revision: V.4.0.2 + * $Date: 26 January 2023 + * $Revision: V.5.3.0 * - * Target Processor: Cortex-M + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -52,12 +52,14 @@ * Refer header file for details. * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, - const q7_t *rhs, - const q31_t *bias, - q7_t *dst, +#if defined(ARM_MATH_DSP) && !defined(__ARMCC_VERSION) && !defined(__ICCARM__) + #pragma GCC optimize("unroll-loops") +#endif +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, + const int8_t *rhs, + const int32_t *bias, + int8_t *dst, const int32_t lhs_offset, - const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, @@ -67,7 +69,6 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, const int32_t activation_max, const int32_t address_offset) { - (void)rhs_offset; #if defined(ARM_MATH_MVEI) const int32_t row_loop_cnt = rhs_rows / 3; const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3}; @@ -188,7 +189,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, #elif defined(ARM_MATH_DSP) const int32_t row_loop_cnt = rhs_rows / 2; const int16_t lhs_offset_s16 = (int16_t)lhs_offset; - const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16); + const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16); for (int32_t i = 0; i < row_loop_cnt; i++) { @@ -209,24 +210,24 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int j = col_loop_cnt; j != 0; j--) { - int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); - int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); - vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); - int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); - int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); - ker_0 = __SXTB16(ker_0); + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); - acc_0 = __SMLAD(ker_1, vec_1, acc_0); - acc_0 = __SMLAD(ker_0, vec_0, acc_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); - ker_0 = arm_nn_read_q7x4_ia(&rhs_1); - ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); - ker_0 = __SXTB16(ker_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); - acc_1 = __SMLAD(ker_1, 
vec_1, acc_1); - acc_1 = __SMLAD(ker_0, vec_0, acc_1); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); } for (int k = col_loop_cnt * 4; k < rhs_cols; k++) @@ -269,16 +270,16 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int i = col_loop_cnt; i != 0; i--) { - int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); - int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); - vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); - int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); - int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); - ker_0 = __SXTB16(ker_0); + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); - acc_0 = __SMLAD(ker_1, vec_1, acc_0); - acc_0 = __SMLAD(ker_0, vec_0, acc_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); } for (int j = col_loop_cnt * 4; j < rhs_cols; j++) @@ -306,14 +307,14 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) { - const q7_t *lhs_ptr = lhs; - const q7_t *rhs_ptr_0 = &rhs[0]; - const q7_t *rhs_ptr_1 = &rhs[rhs_cols]; - const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; - - q31_t res00 = 0; - q31_t res01 = 0; - q31_t res02 = 0; + const int8_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; + const int8_t *rhs_ptr_1 = &rhs[rhs_cols]; + const int8_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + int32_t res00 = 0; + int32_t res01 = 0; + int32_t res02 = 0; if (bias) { res00 = *bias++; @@ -322,10 +323,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, } for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - const q31_t rhs_value0 = (int8_t)*rhs_ptr_0; - const q31_t rhs_value1 = (int8_t)*rhs_ptr_1; - const q31_t rhs_value2 = (int8_t)*rhs_ptr_2; - const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; + const int32_t rhs_value0 = (int8_t)*rhs_ptr_0; + const int32_t rhs_value1 = (int8_t)*rhs_ptr_1; + const int32_t rhs_value2 = (int8_t)*rhs_ptr_2; + const int32_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; @@ -354,9 +355,9 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, res02 = MAX(res02, activation_min); res02 = MIN(res02, activation_max); - *dst = (q7_t)res00; - *(dst + address_offset) = (q7_t)res01; - *(dst + 2 * address_offset) = (q7_t)res02; + *dst = (int8_t)res00; + *(dst + address_offset) = (int8_t)res01; + *(dst + 2 * address_offset) = (int8_t)res02; dst += 3 * address_offset; rhs += 3 * rhs_cols; @@ -366,10 +367,10 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) { - const q7_t *lhs_ptr = &lhs[0]; - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *lhs_ptr = &lhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = 0; + int32_t res00 = 0; if (bias) { res00 = *bias++; @@ -377,8 +378,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q31_t rhs_value0 = (int8_t)rhs_ptr[0]; - q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; + int32_t rhs_value0 = (int8_t)rhs_ptr[0]; + int32_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; res00 += lhs_value * rhs_value0; diff --git 
a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c index e22095c3..1e2f4f84 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,10 +22,10 @@ * Description: s8 vector by matrix (transposed) multiplication with * s16 output. Targetted at SVDF operator. * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 5 January 2023 + * $Revision: V.3.1.0 * - * Target Processor: Cortex-M + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -46,11 +46,10 @@ * Refer header file for details. * */ -arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, - const q7_t *rhs, - q15_t *dst, +arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const int8_t *lhs, + const int8_t *rhs, + int16_t *dst, const int32_t lhs_offset, - const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, @@ -59,13 +58,11 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, const int32_t activation_min, const int32_t activation_max) { - (void)rhs_offset; if (rhs_cols < 0 || (NN_Q31_MAX - rhs_cols) < 16 || dst_offset < 0) { return ARM_CMSIS_NN_ARG_ERROR; } - (void)rhs_offset; #if defined(ARM_MATH_MVEI) int32_t row_loop_cnt = rhs_rows / 3; @@ -153,12 +150,12 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, rhs += rhs_cols; const int32_t offsets = rhs_sum_0 * lhs_offset; - acc_0 = __QADD(acc_0, offsets); + acc_0 = QADD(acc_0, offsets); acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); // Clamp the result acc_0 = MAX(acc_0, activation_min); - *dst = (q15_t)MIN(acc_0, activation_max); + *dst = (int16_t)MIN(acc_0, activation_max); dst += dst_offset; } @@ -166,45 +163,126 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, int32_t row_loop_cnt = rhs_rows / 2; const int16_t lhs_offset_s16 = lhs_offset; - const int16_t rhs_offset_s16 = rhs_offset; - const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16); - const uint32_t rhs_offset_s16x2 = __PKHBT(rhs_offset_s16, rhs_offset_s16, 16); + const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16); for (int32_t i = 0; i < row_loop_cnt; i++) { int32_t acc_0 = 0; int32_t acc_1 = 0; - const int32_t col_loop_cnt = rhs_cols / 4; const int8_t *lhs_vec = lhs; const int8_t *rhs_0 = rhs; const int8_t *rhs_1 = rhs + rhs_cols; rhs += 2 * rhs_cols; - for (int j = col_loop_cnt; j != 0; j--) + + int32_t rhs_cols_idx = 0; + + int32_t vec_0, vec_1, ker_0, ker_1; + + #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang loop unroll(disable) + #endif + for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) { - int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); - int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); - vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); - int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); - int32_t ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); - ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); - acc_0 = __SMLAD(ker_1, vec_1, acc_0); - acc_0 = __SMLAD(ker_0, 
vec_0, acc_0); - ker_0 = arm_nn_read_q7x4_ia(&rhs_1); - ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); - ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); - acc_1 = __SMLAD(ker_1, vec_1, acc_1); - acc_1 = __SMLAD(ker_0, vec_0, acc_1); + // 4 x MAC acc_0, acc1 + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + + // 4 x MAC acc_0, acc1 + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + + // 4 x MAC acc_0, acc1 + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + + // 4 x MAC acc_0, acc1 + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + } + + #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang loop unroll(disable) + #endif + for (; rhs_cols_idx <= (rhs_cols - 4); rhs_cols_idx += 4) + { + vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + + ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + + ker_0 = arm_nn_read_s8x4_ia(&rhs_1); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); } - for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + + #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang loop unroll(disable) + #endif + for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { const int32_t lhs_temp = (*lhs_vec + lhs_offset); lhs_vec++; - acc_0 += lhs_temp * (*rhs_0 + rhs_offset); + acc_0 += lhs_temp * (*rhs_0); rhs_0++; - acc_1 += lhs_temp * (*rhs_1 + rhs_offset); + acc_1 += lhs_temp * (*rhs_1); rhs_1++; 
} + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); @@ -213,8 +291,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, acc_0 = MIN(acc_0, activation_max); acc_1 = MAX(acc_1, activation_min); acc_1 = MIN(acc_1, activation_max); - *dst = (q15_t)acc_0; - *(dst + dst_offset) = (q15_t)acc_1; + *dst = (int16_t)acc_0; + *(dst + dst_offset) = (int16_t)acc_1; dst += 2 * dst_offset; } if (rhs_rows & 0x1) @@ -225,20 +303,22 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, const int8_t *rhs_0 = rhs; for (int i = col_loop_cnt; i != 0; i--) { - int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); - int32_t vec_1 = __SXTAB16(lhs_offset_s16x2, __ROR((uint32_t)vec_0, 8)); - vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); - int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); - int32_t ker_1 = __SXTAB16(rhs_offset_s16x2, __ROR((uint32_t)ker_0, 8)); - ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); - acc_0 = __SMLAD(ker_1, vec_1, acc_0); - acc_0 = __SMLAD(ker_0, vec_0, acc_0); + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16(lhs_offset_s16x2, ROR((uint32_t)vec_0, 8)); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); } for (int j = col_loop_cnt * 4; j < rhs_cols; j++) { const int32_t lhs_temp = (*lhs_vec + lhs_offset); lhs_vec++; - acc_0 += lhs_temp * (*rhs_0 + rhs_offset); + acc_0 += lhs_temp * *rhs_0; rhs_0++; } acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); @@ -246,7 +326,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, // Clamp the result acc_0 = MAX(acc_0, activation_min); acc_0 = MIN(acc_0, activation_max); - *dst = (q15_t)acc_0; + *dst = (int16_t)acc_0; dst += dst_offset; } @@ -256,20 +336,20 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) { - const q7_t *lhs_ptr = lhs; - const q7_t *rhs_ptr_0 = &rhs[0]; - const q7_t *rhs_ptr_1 = &rhs[rhs_cols]; - const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; - - q31_t res00 = 0; - q31_t res01 = 0; - q31_t res02 = 0; + const int8_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; + const int8_t *rhs_ptr_1 = &rhs[rhs_cols]; + const int8_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + int32_t res00 = 0; + int32_t res01 = 0; + int32_t res02 = 0; for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - const q31_t rhs_value0 = (int8_t)*rhs_ptr_0; - const q31_t rhs_value1 = (int8_t)*rhs_ptr_1; - const q31_t rhs_value2 = (int8_t)*rhs_ptr_2; - const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; + const int32_t rhs_value0 = (int8_t)*rhs_ptr_0; + const int32_t rhs_value1 = (int8_t)*rhs_ptr_1; + const int32_t rhs_value2 = (int8_t)*rhs_ptr_2; + const int32_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; res00 += lhs_value * rhs_value0; res01 += lhs_value * rhs_value1; @@ -293,9 +373,9 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, res02 = MAX(res02, activation_min); res02 = MIN(res02, activation_max); - *dst = (q15_t)res00; - *(dst + dst_offset) = (q15_t)res01; - *(dst + 2 * dst_offset) = (q15_t)res02; + *dst = (int16_t)res00; + *(dst + dst_offset) = (int16_t)res01; + *(dst + 2 * dst_offset) = (int16_t)res02; dst += 3 * dst_offset; rhs += 3 * rhs_cols; } @@ -304,15 +384,15 @@ arm_cmsis_nn_status 
arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) { - const q7_t *lhs_ptr = &lhs[0]; - const q7_t *rhs_ptr = &rhs[0]; + const int8_t *lhs_ptr = &lhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - q31_t res00 = 0; + int32_t res00 = 0; for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) { - q31_t rhs_value0 = (int8_t)rhs_ptr[0] + rhs_offset; - q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; + int32_t rhs_value0 = (int8_t)rhs_ptr[0]; + int32_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; res00 += lhs_value * rhs_value0; @@ -327,7 +407,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, res00 = MAX(res00, activation_min); res00 = MIN(res00, activation_max); - *dst = (q15_t)res00; + *dst = (int16_t)res00; dst += dst_offset; rhs += rhs_cols; } diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nntables.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nntables.c index 1bbfcfb1..7ccb89e0 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nntables.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_nntables.c @@ -21,8 +21,8 @@ * Title: arm_nntables.c * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift * - * $Date: 30 September 2022 - * $Revision: V.2.0.0 + * $Date: 28 October 2022 + * $Revision: V.2.1.0 * * Target Processor: Cortex-M cores * @@ -38,3 +38,25 @@ * */ +// Table of sigmoid(i/24) at 0.16 format - 256 elements. +// Combined sigmoid and tanh look-up table, since +// tanh(x) = 2*sigmoid(2*x) -1. +// Both functions are symmetric, so the LUT table is only needed +// for the absolute value of the input. +const uint16_t sigmoid_table_uint16[256] = { + 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, 38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688, + 43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369, 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, + 51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755, 55125, 55485, 55834, 56174, 56503, 56823, 57133, 57433, + 57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519, 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, + 61279, 61441, 61599, 61750, 61896, 62036, 62172, 62302, 62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186, + 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835, 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, + 64357, 64405, 64450, 64494, 64536, 64576, 64614, 64652, 64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900, + 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, 65097, 65115, 65132, 65149, 65164, 65179, 65194, 65208, + 65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301, 65310, 65319, 65328, 65337, 65345, 65352, 65360, 65367, + 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, 65420, 65425, 65429, 65433, 65438, 65442, 65445, 65449, + 65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474, 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, + 65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, 65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513, + 65514, 65515, 65516, 65517, 65517, 65518, 65519, 65520, 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, + 65525, 65525, 65526, 65526, 65526, 65527, 65527, 65528, 65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530, + 65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532, 65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533, + 65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534, 65534, 65534, 65534, 65534, 
65534, 65534, 65534, 65535}; diff --git a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c index fd4bb8d4..1c4e87e7 100644 --- a/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c +++ b/src/third_party/cmsis_nn/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_q7_to_q15_with_offset.c * Description: Converts the elements of the Q7 vector to Q15 vector with an added offset * - * $Date: March 3, 2020 - * $Revision: V.2.0.2 + * $Date: 5 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -39,7 +39,7 @@ * @{ */ -void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset) +void arm_q7_to_q15_with_offset(const int8_t *src, int16_t *dst, uint32_t block_size, int16_t offset) { int block_cnt; @@ -63,28 +63,28 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, #elif defined(ARM_MATH_DSP) /* Run the below code for cores that support SIMD instructions */ - q31_t in_q7x4; - q31_t in_q15x2_1; - q31_t in_q15x2_2; - q31_t out_q15x2_1; - q31_t out_q15x2_2; + int32_t in_q7x4; + int32_t in_q15x2_1; + int32_t in_q15x2_2; + int32_t out_q15x2_1; + int32_t out_q15x2_2; /*loop unrolling */ block_cnt = block_size >> 2; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 
*/
-    const q31_t offset_q15x2 = __PKHBT(offset, offset, 16);
+    const int32_t offset_q15x2 = PKHBT(offset, offset, 16);
    while (block_cnt > 0)
    {
-        /* convert from q7 to q15 and then store the results in the destination buffer */
-        in_q7x4 = arm_nn_read_q7x4_ia(&src);
+        /* convert from s8 to s16 and then store the results in the destination buffer */
+        in_q7x4 = arm_nn_read_s8x4_ia(&src);
-        /* Extract and sign extend each of the four q7 values to q15 */
-        in_q15x2_1 = __SXTAB16(offset_q15x2, __ROR(in_q7x4, 8));
-        in_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4);
+        /* Extract and sign extend each of the four s8 values to s16 */
+        in_q15x2_1 = SXTAB16(offset_q15x2, ROR(in_q7x4, 8));
+        in_q15x2_2 = SXTAB16(offset_q15x2, in_q7x4);
-        out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16);
-        out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16);
+        out_q15x2_2 = PKHTB(in_q15x2_1, in_q15x2_2, 16);
+        out_q15x2_1 = PKHBT(in_q15x2_2, in_q15x2_1, 16);
        arm_nn_write_q15x2_ia(&dst, out_q15x2_1);
        arm_nn_write_q15x2_ia(&dst, out_q15x2_2);
@@ -102,7 +102,7 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size,
    while (block_cnt > 0)
    {
-        *dst++ = (q15_t)*src++ + offset;
+        *dst++ = (int16_t)*src++ + offset;
        /* Decrement the loop counter */
        block_cnt--;
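The DSP path above widens four s8 values per iteration: PKHBT packs the offset into both halfwords, SXTAB16 (with a ROR for the odd bytes) sign-extends and adds it, and PKHTB/PKHBT reorder the halfwords for the two q15x2 stores. A scalar sketch of the same transform, with a hypothetical helper name:

```c
#include <stdint.h>

/* Scalar equivalent of arm_q7_to_q15_with_offset(): widen each s8 element
 * to s16 and add a constant offset. The SXTAB16/PKHBT path above computes
 * exactly this, four elements per loop iteration. */
static void q7_to_q15_with_offset_ref(const int8_t *src, int16_t *dst,
                                      uint32_t block_size, int16_t offset)
{
    for (uint32_t i = 0; i < block_size; i++)
    {
        dst[i] = (int16_t)(src[i] + offset);
    }
}
```

diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s16.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s16.c
new file mode 100644
index 00000000..fae40824
--- /dev/null
+++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s16.c
@@ -0,0 +1,71 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_avgpool_get_buffer_sizes_s16.c
+ * Description: Collection of get buffer size functions for avgpool s16 layer function.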
+ *
+ * $Date: 13 January 2023
+ * $Revision: V.1.0.0
+ *
+ * Target : Arm(R) M-Profile Architecture
+ *
+ * -------------------------------------------------------------------- */
+
+#include "third_party/cmsis_nn/Include/arm_nnfunctions.h"
+
+/**
+ * @ingroup Pooling
+ */
+
+/**
+ * @addtogroup GetBufferSizePooling
+ * @{
+ */
+
+int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src)
+{
+#if defined(ARM_MATH_MVEI)
+    return arm_avgpool_s16_get_buffer_size_mve(output_x, ch_src);
+#elif defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+    return arm_avgpool_s16_get_buffer_size_dsp(output_x, ch_src);
+#else
+    (void)output_x;
+    (void)ch_src;
+    return 0;
+#endif
+}
+
+int32_t arm_avgpool_s16_get_buffer_size_dsp(const int output_x, const int ch_src)
+{
+    (void)output_x;
+    return (ch_src * sizeof(int32_t));
+}
+
+int32_t arm_avgpool_s16_get_buffer_size_mve(const int output_x, const int ch_src)
+{
+    (void)output_x;
+    (void)ch_src;
+
+    return 0;
+}
+
+/**
+ * @} end of GetBufferSizePooling group
+ */
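These new dispatcher files move the buffer-size query out of the pooling kernels, so a caller can size the scratch buffer without knowing whether the MVE, DSP, or pure-C path was compiled in (the s8 variant that follows has the same shape). A hypothetical caller sketch, not part of the library, using only the signatures shown in this diff:

```c
#include "third_party/cmsis_nn/Include/arm_nnfunctions.h"

/* Hypothetical usage sketch: query the scratch size first, then hand the
 * buffer to the kernel through cmsis_nn_context. Per the dispatcher above,
 * the query returns 0 on MVE and pure-C builds, ch_src * 4 bytes on DSP. */
arm_cmsis_nn_status run_avgpool_s16(const cmsis_nn_pool_params *pool_params,
                                    const cmsis_nn_dims *input_dims,
                                    const int16_t *input,
                                    const cmsis_nn_dims *filter_dims,
                                    const cmsis_nn_dims *output_dims,
                                    int16_t *output,
                                    void *scratch_buf /* sized by the caller */)
{
    cmsis_nn_context ctx;
    ctx.size = arm_avgpool_s16_get_buffer_size(output_dims->w, input_dims->c);
    ctx.buf = scratch_buf; /* unused when ctx.size is 0 */

    return arm_avgpool_s16(&ctx, pool_params, input_dims, input,
                           filter_dims, output_dims, output);
}
```

diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s8.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s8.c
new file mode 100644
index 00000000..62b75f32
--- /dev/null
+++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_get_buffer_sizes_s8.c
@@ -0,0 +1,71 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_avgpool_get_buffer_sizes_s8.c
+ * Description: Collection of get buffer size functions for avgpool s8 layer function.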
+ * + * $Date: 25 January 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "third_party/cmsis_nn/Include/arm_nnfunctions.h" + +/** + * @ingroup Pooling + */ + +/** + * @addtogroup GetBufferSizePooling + * @{ + */ + +int32_t arm_avgpool_s8_get_buffer_size(const int output_x, const int ch_src) +{ +#if defined(ARM_MATH_MVEI) + return arm_avgpool_s8_get_buffer_size_mve(output_x, ch_src); +#elif defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return arm_avgpool_s8_get_buffer_size_dsp(output_x, ch_src); +#else + (void)output_x; + (void)ch_src; + return 0; +#endif +} + +int32_t arm_avgpool_s8_get_buffer_size_dsp(const int output_x, const int ch_src) +{ + (void)output_x; + return (ch_src * sizeof(int32_t)); +} + +int32_t arm_avgpool_s8_get_buffer_size_mve(const int output_x, const int ch_src) +{ + (void)output_x; + (void)ch_src; + + return 0; +} + +/** + * @} end of GetBufferSizePooling group + */ diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s16.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s16.c index 0178d15f..3c38a515 100644 --- a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s16.c +++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s16.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_avgpool_s16.c * Description: Pooling function implementations * - * $Date: 27 July 2022 - * $Revision: V.2.2.0 + * $Date: 30 January 2023 + * $Revision: V.2.4.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -33,8 +33,8 @@ #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) -static void scale_q31_to_q15_and_clamp(const q31_t *buffer, - q15_t *target, +static void scale_q31_to_q15_and_clamp(const int32_t *buffer, + int16_t *target, int32_t length, const int32_t count, const int act_min, @@ -49,7 +49,7 @@ static void scale_q31_to_q15_and_clamp(const q31_t *buffer, sum = MAX(sum, act_min); sum = MIN(sum, act_max); - target[i] = (q15_t)sum; + target[i] = (int16_t)sum; } } #endif @@ -73,10 +73,10 @@ static void scale_q31_to_q15_and_clamp(const q31_t *buffer, arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q15_t *src, + const int16_t *src, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q15_t *dst) + int16_t *dst) { const int32_t input_y = input_dims->h; const int32_t input_x = input_dims->w; @@ -180,7 +180,7 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, } #elif defined(ARM_MATH_DSP) - q31_t *buffer = (q31_t *)ctx->buf; + int32_t *buffer = (int32_t *)ctx->buf; if (buffer == NULL) { @@ -209,7 +209,7 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, { for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++) { - const q15_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); + const int16_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); if (count == 0) { @@ -222,7 +222,7 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, { for (int i = 0; i < ch_src; i++) { - buffer[i] = __QADD(start[i], buffer[i]); + 
buffer[i] = QADD(start[i], buffer[i]); } } count++; @@ -291,17 +291,6 @@ arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, return ARM_CMSIS_NN_SUCCESS; } -int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src) -{ - (void)output_x; -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - return (ch_src * (int32_t)sizeof(int32_t)); -#else - (void)ch_src; -#endif - return 0; -} - /** * @} end of Pooling group */ diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s8.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s8.c index 1a61f376..0001b0ee 100644 --- a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s8.c +++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_avgpool_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_avgpool_s8.c * Description: Pooling function implementations * - * $Date: 7 July 2022 - * $Revision: V.3.0.2 + * $Date: 30 January 2023 + * $Revision: V.3.2.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -32,8 +32,8 @@ #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) -static void scale_q31_to_q7_and_clamp(const q31_t *buffer, - q7_t *target, +static void scale_q31_to_q7_and_clamp(const int32_t *buffer, + int8_t *target, int32_t length, const int32_t count, const int act_min, @@ -48,7 +48,7 @@ static void scale_q31_to_q7_and_clamp(const q31_t *buffer, sum = MAX(sum, act_min); sum = MIN(sum, act_max); - target[i] = (q7_t)sum; + target[i] = (int8_t)sum; } } #endif @@ -74,10 +74,10 @@ static void scale_q31_to_q7_and_clamp(const q31_t *buffer, arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q7_t *src, + const int8_t *src, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q7_t *dst) + int8_t *dst) { (void)ctx; const int32_t input_y = input_dims->h; @@ -220,10 +220,10 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q7_t *src, + const int8_t *src, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q7_t *dst) + int8_t *dst) { const int32_t input_y = input_dims->h; const int32_t input_x = input_dims->w; @@ -243,9 +243,9 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer = (q31_t *)ctx->buf; + int32_t *buffer = (int32_t *)ctx->buf; -#if defined(ARM_MATH_DSP) + #if defined(ARM_MATH_DSP) /* Run the following code for CPU's with DSP extension */ @@ -269,7 +269,7 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, { for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++) { - const q7_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); + const int8_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); if (count == 0) { @@ -282,7 +282,7 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, { for (int i = 0; i < ch_src; i++) { - buffer[i] = __QADD(start[i], buffer[i]); + buffer[i] = QADD(start[i], 
buffer[i]); } } count++; @@ -299,7 +299,7 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, dst += ch_src; } } -#else + #else /* Reference C code adapted from CMSIS-NN arm_avepool_q7_HWC. */ @@ -340,23 +340,12 @@ arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, } } -#endif + #endif return ARM_CMSIS_NN_SUCCESS; } #endif /* ARM_MATH_MVEI */ -int32_t arm_avgpool_s8_get_buffer_size(const int output_x, const int ch_src) -{ - (void)output_x; - -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - return (ch_src * sizeof(int32_t)); -#else - (void)ch_src; - return 0; -#endif -} /** * @} end of Pooling group */ diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s16.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s16.c index 8a0a01df..a306d05c 100644 --- a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s16.c +++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s16.c @@ -21,8 +21,8 @@ * Title: arm_max_pool_s16.c * Description: Pooling function implementations * - * $Date: 16 August 2022 - * $Revision: V.2.1.1 + * $Date: 26 October 2022 + * $Revision: V.2.1.2 * * Target Processor: Cortex-M CPUs * @@ -47,15 +47,15 @@ static void compare_and_replace_if_larger(int16_t *base, const int16_t *target, length -= 8; } #else - q15_t *dst = base; - const q15_t *src = target; + int16_t *dst = base; + const int16_t *src = target; union arm_nnword ref_max; union arm_nnword comp_max; int32_t cnt = length >> 1; while (cnt > 0l) { - ref_max.word = arm_nn_read_q15x2(dst); + ref_max.word = arm_nn_read_s16x2(dst); comp_max.word = arm_nn_read_q15x2_ia(&src); if (comp_max.half_words[0] > ref_max.half_words[0]) @@ -105,7 +105,7 @@ static void clamp_output(int16_t *source, int32_t length, const int16_t act_min, while (cnt > 0l) { - in.word = arm_nn_read_q15x2(source); + in.word = arm_nn_read_s16x2(source); in.half_words[0] = MAX(in.half_words[0], act_min); in.half_words[0] = MIN(in.half_words[0], act_max); diff --git a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s8.c b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s8.c index 3fcf64da..2afb704a 100644 --- a/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s8.c +++ b/src/third_party/cmsis_nn/Source/PoolingFunctions/arm_max_pool_s8.c @@ -21,8 +21,8 @@ * Title: arm_max_pool_s8.c * Description: Pooling function implementations * - * $Date: 16 August 2022 - * $Revision: V.3.0.1 + * $Date: 26 October 2022 + * $Revision: V.3.0.2 * * Target Processor: Cortex-M CPUs * @@ -31,7 +31,7 @@ #include "third_party/cmsis_nn/Include/arm_nnfunctions.h" #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" -static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int32_t length) +static void compare_and_replace_if_larger_q7(int8_t *base, const int8_t *target, int32_t length) { #if defined(ARM_MATH_MVEI) int32_t loop_count = (length + 15) / 16; @@ -47,16 +47,16 @@ static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int length -= 16; } #else - q7_t *dst = base; - const q7_t *src = target; + int8_t *dst = base; + const int8_t *src = target; union arm_nnword ref_max; union arm_nnword comp_max; int32_t cnt = length >> 2; while (cnt > 0l) { - ref_max.word = arm_nn_read_q7x4(dst); - comp_max.word = arm_nn_read_q7x4_ia(&src); + ref_max.word = arm_nn_read_s8x4(dst); + comp_max.word = arm_nn_read_s8x4_ia(&src); if (comp_max.bytes[0] > ref_max.bytes[0]) { @@ -75,7 +75,7 @@ static void 
compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int ref_max.bytes[3] = comp_max.bytes[3]; } - arm_nn_write_q7x4_ia(&dst, ref_max.word); + arm_nn_write_s8x4_ia(&dst, ref_max.word); cnt--; } @@ -94,7 +94,7 @@ static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int #endif } -static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, const int32_t act_max) +static void clamp_output(int8_t *source, int32_t length, const int32_t act_min, const int32_t act_max) { #if defined(ARM_MATH_MVEI) int32_t loop_count = (length + 15) / 16; @@ -117,7 +117,7 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co while (cnt > 0l) { - in.word = arm_nn_read_q7x4(source); + in.word = arm_nn_read_s8x4(source); in.bytes[0] = MAX(in.bytes[0], act_min); in.bytes[0] = MIN(in.bytes[0], act_max); @@ -128,7 +128,7 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co in.bytes[3] = MAX(in.bytes[3], act_min); in.bytes[3] = MIN(in.bytes[3], act_max); - arm_nn_write_q7x4_ia(&source, in.word); + arm_nn_write_s8x4_ia(&source, in.word); cnt--; } @@ -163,10 +163,10 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx, const cmsis_nn_pool_params *pool_params, const cmsis_nn_dims *input_dims, - const q7_t *src, + const int8_t *src, const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims, - q7_t *dst) + int8_t *dst) { const int32_t input_y = input_dims->h; const int32_t input_x = input_dims->w; @@ -182,7 +182,7 @@ arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx, const int32_t act_max = pool_params->activation.max; const int32_t channel_in = input_dims->c; (void)ctx; - q7_t *dst_base = dst; + int8_t *dst_base = dst; for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++) { @@ -202,11 +202,11 @@ arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx, { for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++) { - const q7_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x); + const int8_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x); if (count == 0) { - arm_memcpy_q7(dst, start, channel_in); + arm_memcpy_s8(dst, start, channel_in); count++; } else diff --git a/src/third_party/cmsis_nn/Source/ReshapeFunctions/arm_reshape_s8.c b/src/third_party/cmsis_nn/Source/ReshapeFunctions/arm_reshape_s8.c index 11aaf349..8d4ff3eb 100644 --- a/src/third_party/cmsis_nn/Source/ReshapeFunctions/arm_reshape_s8.c +++ b/src/third_party/cmsis_nn/Source/ReshapeFunctions/arm_reshape_s8.c @@ -21,8 +21,8 @@ * Title: arm_reshape_s8.c * Description: Reshape a s8 vector * - * $Date: 4 Aug 2022 - * $Revision: V.1.0.1 + * $Date: 26 October 2022 + * $Revision: V.1.0.2 * * Target Processor: Cortex-M cores * @@ -49,7 +49,7 @@ void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size) { - arm_memcpy_q7(output, input, total_size); + arm_memcpy_s8(output, input, total_size); } /** diff --git a/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_s8.c b/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_s8.c index 9b3457c4..e689f13a 100644 --- a/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_s8.c +++ b/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_svdf_s8.c * Description: S8 basic SVDF layer function * - * $Date: 4 May 2022 - * $Revision: V.4.0.1 + * $Date: 5 January 2023 + * $Revision: V.5.1.0 * - * Target Processor: Cortex-M processors + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -53,26 +53,26 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, const cmsis_nn_per_tensor_quant_params *input_quant_params, const cmsis_nn_per_tensor_quant_params *output_quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *state_dims, - q7_t *state_data, + int8_t *state_data, const cmsis_nn_dims *weights_feature_dims, - const q7_t *weights_feature_data, + const int8_t *weights_feature_data, const cmsis_nn_dims *weights_time_dims, - const q7_t *weights_time_data, + const int8_t *weights_time_data, const cmsis_nn_dims *bias_dims, - const q31_t *bias_data, + const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { (void)bias_dims; (void)state_dims; (void)output_dims; - const q31_t multiplier_in = input_quant_params->multiplier; - const q31_t shift_in = input_quant_params->shift; - const q31_t multiplier_out = output_quant_params->multiplier; - const q31_t shift_2 = output_quant_params->shift; + const int32_t multiplier_in = input_quant_params->multiplier; + const int32_t shift_in = input_quant_params->shift; + const int32_t multiplier_out = output_quant_params->multiplier; + const int32_t shift_2 = output_quant_params->shift; const int32_t zp_in = svdf_params->input_offset; const int32_t zp_out = svdf_params->output_offset; const int32_t in_activation_min = svdf_params->input_activation.min; @@ -91,13 +91,13 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer_a = (q31_t *)input_ctx->buf; + int32_t *buffer_a = (int32_t *)input_ctx->buf; if (output_ctx->buf == NULL) { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer_b = (q31_t *)output_ctx->buf; + int32_t *buffer_b = (int32_t *)output_ctx->buf; // Left shift state memmove((int8_t *)state_data, @@ -107,9 +107,9 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, // Matrix multiplication input * feature weight for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q7_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); - const q7_t *weight = weights_feature_data; - const q7_t *input = input_data + i_batch * input_height; + int8_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); + const int8_t *weight = weights_feature_data; + const int8_t *input = input_data + i_batch * input_height; arm_cmsis_nn_status res = arm_nn_vec_mat_mult_t_s8(input, weight, @@ -117,7 +117,6 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, res_ptr, -zp_in, 0, - 0, multiplier_in, shift_in, input_height, @@ -134,7 +133,7 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, // Matrix multiplicate time weight * state tensors { - q31_t *ptr_a = buffer_a; + int32_t *ptr_a = buffer_a; const int8_t *v2 = state_data; for (int i_batch = 0; i_batch < input_batches; i_batch++) { @@ -152,11 +151,11 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { j += 4; - 
q31_t r1_1, r1_2, r2_1, r2_2; + int32_t r1_1, r1_2, r2_1, r2_2; v1 = read_and_pad_reordered(v1, &r1_1, &r1_2); v2 = read_and_pad_reordered(v2, &r2_1, &r2_2); - sum = __SMLAD(r1_1, r2_1, sum); - sum = __SMLAD(r1_2, r2_2, sum); + sum = SMLAD(r1_1, r2_1, sum); + sum = SMLAD(r1_2, r2_2, sum); } // Process the remaining data @@ -187,8 +186,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { for (int i = 0; i < input_batches; i++) { - q31_t *output_temp = buffer_b + i * feature_batches; - const q31_t *ptr_a = buffer_a + i * feature_batches; + int32_t *output_temp = buffer_b + i * feature_batches; + const int32_t *ptr_a = buffer_a + i * feature_batches; const int32_t *bi = bias_data; for (int j = 0; j < feature_batches; j++) @@ -201,8 +200,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q31_t *output_data_temp = buffer_b + i_batch * unit_count; - q31_t *ptr_a = buffer_a + i_batch * feature_batches; + int32_t *output_data_temp = buffer_b + i_batch * unit_count; + int32_t *ptr_a = buffer_a + i_batch * feature_batches; for (int i = 0; i < unit_count; i++) { @@ -221,8 +220,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, { for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q31_t *output_data_temp = buffer_b + i_batch * unit_count; - q31_t *ptr_a = buffer_a + i_batch * feature_batches; + int32_t *output_data_temp = buffer_b + i_batch * unit_count; + int32_t *ptr_a = buffer_a + i_batch * feature_batches; for (int i = 0; i < unit_count; i++) { @@ -258,7 +257,7 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, #else for (int i = 0; i < input_batches * unit_count; i++) { - output_data[i] = (q7_t)CLAMP( + output_data[i] = (int8_t)CLAMP( arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min); } #endif diff --git a/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_state_s16_s8.c b/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_state_s16_s8.c index fc5d19e6..73b8fa13 100644 --- a/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_state_s16_s8.c +++ b/src/third_party/cmsis_nn/Source/SVDFunctions/arm_svdf_state_s16_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. 
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_svdf_s8.c * Description: S8 basic SVDF layer function with s16 state tensor * - * $Date: 4 May 2022 - * $Revision: V.2.0.1 + * $Date: 5 January 2023 + * $Revision: V.3.1.0 * - * Target Processor: Cortex-M processors + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -53,26 +53,26 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, const cmsis_nn_per_tensor_quant_params *input_quant_params, const cmsis_nn_per_tensor_quant_params *output_quant_params, const cmsis_nn_dims *input_dims, - const q7_t *input_data, + const int8_t *input_data, const cmsis_nn_dims *state_dims, - q15_t *state_data, + int16_t *state_data, const cmsis_nn_dims *weights_feature_dims, - const q7_t *weights_feature_data, + const int8_t *weights_feature_data, const cmsis_nn_dims *weights_time_dims, - const q15_t *weights_time_data, + const int16_t *weights_time_data, const cmsis_nn_dims *bias_dims, - const q31_t *bias_data, + const int32_t *bias_data, const cmsis_nn_dims *output_dims, - q7_t *output_data) + int8_t *output_data) { (void)bias_dims; (void)state_dims; (void)output_dims; - const q31_t multiplier_in = input_quant_params->multiplier; - const q31_t shift_in = input_quant_params->shift; - const q31_t multiplier_out = output_quant_params->multiplier; - const q31_t shift_2 = output_quant_params->shift; + const int32_t multiplier_in = input_quant_params->multiplier; + const int32_t shift_in = input_quant_params->shift; + const int32_t multiplier_out = output_quant_params->multiplier; + const int32_t shift_2 = output_quant_params->shift; const int32_t zp_in = svdf_params->input_offset; const int32_t zp_out = svdf_params->output_offset; const int32_t in_activation_min = svdf_params->input_activation.min; @@ -91,31 +91,30 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer_a = (q31_t *)input_ctx->buf; + int32_t *buffer_a = (int32_t *)input_ctx->buf; if (output_ctx->buf == NULL) { return ARM_CMSIS_NN_ARG_ERROR; } - q31_t *buffer_b = (q31_t *)output_ctx->buf; + int32_t *buffer_b = (int32_t *)output_ctx->buf; // Left shift state - memmove((q15_t *)state_data, - (q15_t *)state_data + 1, + memmove((int16_t *)state_data, + (int16_t *)state_data + 1, (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int16_t))); // Matrix multiplication input * feature weight for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); - const q7_t *weight = weights_feature_data; - const q7_t *input = input_data + i_batch * input_height; + int16_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); + const int8_t *weight = weights_feature_data; + const int8_t *input = input_data + i_batch * input_height; arm_cmsis_nn_status res = arm_nn_vec_mat_mult_t_svdf_s8(input, weight, res_ptr, -zp_in, - 0, time_batches, multiplier_in, shift_in, @@ -132,11 +131,11 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { // Matrix multiplication time weight * state tensors - q31_t *ptr_a = buffer_a; - const q15_t *v2 = state_data; + int32_t *ptr_a = buffer_a; + const int16_t *v2 = state_data; for (int i_batch = 0; i_batch < input_batches; 
i_batch++) { - const q15_t *v1 = weights_time_data; + const int16_t *v1 = weights_time_data; for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++) { @@ -149,10 +148,10 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, for (int i = 0; i < block_count; i++) { j += 2; - q31_t r1 = arm_nn_read_q15x2_ia(&v1); - q31_t r2 = arm_nn_read_q15x2_ia(&v2); + int32_t r1 = arm_nn_read_q15x2_ia(&v1); + int32_t r2 = arm_nn_read_q15x2_ia(&v2); - sum = __SMLAD(r1, r2, sum); + sum = SMLAD(r1, r2, sum); } // Process the remaining data @@ -183,8 +182,8 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { for (int i = 0; i < input_batches; i++) { - q31_t *output_temp = buffer_b + i * feature_batches; - const q31_t *ptr_a = buffer_a + i * feature_batches; + int32_t *output_temp = buffer_b + i * feature_batches; + const int32_t *ptr_a = buffer_a + i * feature_batches; const int32_t *bi = bias_data; for (int j = 0; j < feature_batches; j++) @@ -197,8 +196,8 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q31_t *output_data_temp = buffer_b + i_batch * unit_count; - q31_t *ptr_a = buffer_a + i_batch * feature_batches; + int32_t *output_data_temp = buffer_b + i_batch * unit_count; + int32_t *ptr_a = buffer_a + i_batch * feature_batches; for (int i = 0; i < unit_count; i++) { @@ -217,8 +216,8 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, { for (int i_batch = 0; i_batch < input_batches; i_batch++) { - q31_t *output_data_temp = buffer_b + i_batch * unit_count; - q31_t *ptr_a = buffer_a + i_batch * feature_batches; + int32_t *output_data_temp = buffer_b + i_batch * unit_count; + int32_t *ptr_a = buffer_a + i_batch * feature_batches; for (int i = 0; i < unit_count; i++) { @@ -254,7 +253,7 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, #else for (int i = 0; i < input_batches * unit_count; i++) { - output_data[i] = (q7_t)CLAMP( + output_data[i] = (int8_t)CLAMP( arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min); } #endif diff --git a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c index 22b9a1d3..6d73402d 100644 --- a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c +++ b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_nn_softmax_common_s8.c * Description: Softmax with s8 input and output of s8 or s16. * - * $Date: 17 March 2022 - * $Revision: V.1.0.1 + * $Date: 5 January 2023 + * $Revision: V.1.1.0 * - * Target Processor: Cortex-M processors + * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ #include "third_party/cmsis_nn/Include/arm_nnsupportfunctions.h" @@ -89,7 +89,7 @@ void arm_nn_softmax_common_s8(const int8_t *input, } } - const int32_t headroom = __CLZ(sum); + const int32_t headroom = CLZ(sum); const int32_t shifted_scale = ONE_OVER1((sum > 0 ? 
sum << headroom : 0) - (1 << 31)); int32_t bits_over_unit; diff --git a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s16.c b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s16.c index 31e27e81..a132e96c 100644 --- a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s16.c +++ b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s16.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Arm Limited or its affiliates. + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_softmax_s16.c * Description: S16 softmax function * - * $Date: 19 April 2022 - * $Revision: V.2.0.0 + * $Date: 5 January 2023 + * $Revision: V.2.1.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -84,7 +84,7 @@ arm_cmsis_nn_status arm_softmax_s16(const int16_t *input, sum += cached_exp_results[col]; } - const int32_t headroom = __CLZ(sum); + const int32_t headroom = CLZ(sum); // Compute the reciprocal 1/sum const int32_t shifted_sum = (((sum) << (headroom - 1)) + (1 << 13)) >> 14; diff --git a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s8.c b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s8.c index 671bb893..d49e0dc0 100644 --- a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s8.c +++ b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_softmax_s8.c * Description: S8 softmax function * - * $Date: 9 March 2022 - * $Revision: V.2.1.0 + * $Date: 5 January 2023 + * $Revision: V.2.2.0 * - * Target Processor: Cortex-M cores + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -33,10 +33,10 @@ #define ACCUM_BITS 12 -#ifdef ARM_MATH_MVEI +#if defined(ARM_MATH_MVEI) && !defined(ARM_GCC_12_2_ICE) static int32x4_t arm_exp_on_negative_values_mve_32x4(int32x4_t val) { -#define SHIFT_START (24) + #define SHIFT_START (24) int32_t shift = SHIFT_START; int32x4_t mask; @@ -49,12 +49,12 @@ static int32x4_t arm_exp_on_negative_values_mve_32x4(int32x4_t val) const int32x4_t op_2 = x + DIV_POW2_MVE(MUL_SAT_MVE(op_1, vdupq_n_s32(715827883)) + x2, 1); int32x4_t result = vdupq_n_s32(1895147668) + MUL_SAT_MVE(vdupq_n_s32(1895147668), op_2); -#define SELECT_IF_NON_ZERO(x) \ - { \ - mve_pred16_t p = vcmpneq_n_s32(remainder & vdupq_n_s32(1 << shift++), 0); \ - mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); \ - result = SELECT_USING_MASK(mask, MUL_SAT_MVE(result, vdupq_n_s32(x)), result); \ - } + #define SELECT_IF_NON_ZERO(x) \ + { \ + mve_pred16_t p = vcmpneq_n_s32(remainder & vdupq_n_s32(1 << shift++), 0); \ + mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); \ + result = SELECT_USING_MASK(mask, MUL_SAT_MVE(result, vdupq_n_s32(x)), result); \ + } SELECT_IF_NON_ZERO(1672461947) SELECT_IF_NON_ZERO(1302514674) @@ -64,7 +64,7 @@ static int32x4_t arm_exp_on_negative_values_mve_32x4(int32x4_t val) SELECT_IF_NON_ZERO(720401) SELECT_IF_NON_ZERO(242) -#undef SELECT_IF_NON_ZERO + #undef SELECT_IF_NON_ZERO mve_pred16_t p = vcmpeqq_n_s32(val, 0); mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); @@ -91,10 +91,10 @@ 
void arm_softmax_s8(const int8_t *input, const int32_t diff_min, int8_t *output) { -#ifdef ARM_MATH_MVEI +#if defined(ARM_MATH_MVEI) && !defined(ARM_GCC_12_2_ICE) -#define ACT_MIN ((int8_t)NN_Q7_MIN) -#define ACT_MAX ((int8_t)NN_Q7_MAX) + #define ACT_MIN ((int8_t)NN_Q7_MIN) + #define ACT_MAX ((int8_t)NN_Q7_MAX) const int32_t mask = (1 << shift); @@ -147,7 +147,7 @@ void arm_softmax_s8(const int8_t *input, } } - const int32_t headroom = __CLZ((uint32_t)sum); + const int32_t headroom = CLZ((uint32_t)sum); const int32_t bits_over_unit = ACCUM_BITS - headroom + 23; const int32_t shifted_scale = ONE_OVER1((sum > 0 ? sum << headroom : 0) - (1 << 31)); diff --git a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_u8.c b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_u8.c index 4a88930f..a7c1bff8 100644 --- a/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_u8.c +++ b/src/third_party/cmsis_nn/Source/SoftmaxFunctions/arm_softmax_u8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2020, 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,10 +21,10 @@ * Title: arm_softmax_u8.c * Description: U8 softmax function * - * $Date: 09. October 2020 - * $Revision: V.1.0.2 + * $Date: 5 January 2023 + * $Revision: V.1.1.0 * - * Target Processor: Cortex-M CPUs + * Target : Arm(R) M-Profile Architecture * * -------------------------------------------------------------------- */ @@ -76,7 +76,7 @@ void arm_softmax_u8(const uint8_t *input, } } - const int32_t headroom = __CLZ((uint32_t)sum); + const int32_t headroom = CLZ((uint32_t)sum); const int32_t bits_over_unit = ACCUM_BITS - headroom + 23; const int32_t shifted_scale = ONE_OVER1((sum << headroom) - (1 << 31));
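Throughout the softmax kernels this sync swaps the raw __CLZ intrinsic for the CLZ wrapper; the leading-zero count ("headroom") of the accumulated exponent sum drives the fixed-point normalization. A plain-C illustration of where headroom and bits_over_unit come from, using a portable clz stand-in; the reciprocal itself is computed by the library's ONE_OVER1() macro and is not reproduced here:

```c
#include <stdint.h>

#define ACCUM_BITS 12 /* same accumulator format as the kernels above */

/* Portable stand-in for the CLZ macro used in the hunks above. */
static int32_t clz_u32(uint32_t x)
{
    int32_t n = 0;
    if (x == 0)
    {
        return 32;
    }
    while ((x & 0x80000000u) == 0)
    {
        x <<= 1;
        n++;
    }
    return n;
}

/* Mirrors the normalization step in arm_softmax_s8/arm_softmax_u8:
 * headroom rescales the exp() sum so its reciprocal can be taken at full
 * precision, and bits_over_unit = ACCUM_BITS - headroom + 23 is the shift
 * that maps each scaled probability back to the 8-bit output range. */
static void softmax_norm_terms(int32_t sum, int32_t *headroom, int32_t *bits_over_unit)
{
    *headroom = clz_u32((uint32_t)sum);
    *bits_over_unit = ACCUM_BITS - *headroom + 23;
}
```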