Changes from 1 commit (of the 28 commits in this pull request):
c0c90d2  Run centos and debian workflows on push and PR (igchor, Nov 2, 2021)
dbe3fda  Adds createPutToken and switches findEviction (byrnedj, Feb 4, 2023)
9afcd64  Add memory usage statistics for allocation classes (igchor, Jul 6, 2022)
eca7d8c  Initial multi-tier support implementation (igchor, Sep 28, 2021)
664da8d  AC stats multi-tier (byrnedj, Jan 17, 2023)
3b7bb0c  Tests and fix tier sizing (byrnedj, Feb 8, 2023)
58e825b  This is the additional multi-tier support needed (guptask, Nov 14, 2022)
9fc705f  Rolling average alloc latency (guptask, Jul 21, 2022)
ce0e38a  Rolling average class latency (guptask, Jul 21, 2022)
e0a8006  MM2Q promotion iterator (byrnedj, Aug 9, 2022)
bcb2ae2  Multi-tier allocator patch (byrnedj, Feb 7, 2023)
d4cf1d4  basic multi-tier test based on numa bindings (igchor, Dec 30, 2021)
6d2fbef  Aadding new configs to hit_ratio/graph_cache_leader_fobj (vinser52, Jan 27, 2022)
5bfa1ff  Background data movement for the tiers (byrnedj, Oct 21, 2022)
1593291  dummy change to trigger container image rebuild (guptask, Mar 28, 2023)
a171f38  Updated the docker gcc version to 12 (#83) (guptask, May 9, 2023)
35a17e4  NUMA bindigs support for private memory (#82) (vinser52, May 17, 2023)
46d168c  Do not run cachelib-centos-8-5 on PRs (#85) (igchor, Jun 6, 2023)
7d06531  Add option to insert items to first free tier (#87) (igchor, Jun 8, 2023)
1521efe  Chained item movement between tiers - sync on the parent item (#84) (byrnedj, Jun 28, 2023)
3328e4e  edit dockerfile (byrnedj, Jul 24, 2023)
3c87c49  Track latency of per item eviction/promotion between memory tiers (guptask, Jul 28, 2023)
795f85b  Update dependencies (#95) (igchor, Aug 23, 2023)
96d948f  enable DTO build without memcpy changes to cachebench (byrnedj, Feb 28, 2024)
47d5034  Bckground eviction for multi-tier (byrnedj, Feb 28, 2024)
efea480  no online eviction option patch (byrnedj, Feb 28, 2024)
ebfca17  fixes cmake in latest test removal (upstream test build fails - need … (byrnedj, May 20, 2024)
52618b5  fixes commit for now (should drop once https://github.com/facebook/Ca… (byrnedj, May 28, 2024)

Rolling average alloc latency
Part 1 (single tier)
-----------------------------
Adds a per-pool, per-class rolling average allocation latency (upstream PR version).

Also fixes rolling stats on multi-tier; the multi-tier rolling stats
implementation follows in the next commit.

Note: an attempt was made to use the average alloc latency to guide the
background mover batch size. While the average alloc latency decreased,
throughput dropped as well, because the batch size became too large and
caused lock contention.

guptask authored and byrnedj committed May 20, 2024
commit 9fc705f990abd2d98864523903b9038c8092bd96
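
The rolling average added here is an incremental arithmetic mean: for the (n+1)-th sample x, trackValue (see the new RollingStats.h below) updates avg = avg * n/(n+1) + x/(n+1), so no per-sample history has to be stored. For example, with latency samples of 100 ns, 300 ns and 200 ns: after the first sample avg = 100; after the second, avg = 100 * 1/2 + 300/2 = 200; after the third, avg = 200 * 2/3 + 200/3 = 200, which is exactly the mean of the three samples.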
9 changes: 8 additions & 1 deletion cachelib/allocator/CacheAllocator.h
@@ -2782,6 +2782,8 @@ CacheAllocator<CacheTrait>::allocateInternalTier(TierId tid,

// the allocation class in our memory allocator.
const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize);
util::RollingLatencyTracker rollTracker{
(*stats_.classAllocLatency)[pid][cid]};

// TODO: per-tier
(*stats_.allocAttempts)[pid][cid].inc();
@@ -2892,6 +2894,9 @@ CacheAllocator<CacheTrait>::allocateChainedItemInternal(const Item& parent,

// TODO: per-tier? Right now stats_ are not used in any public periodic
// worker
util::RollingLatencyTracker rollTracker{
(*stats_.classAllocLatency)[pid][cid]};

(*stats_.allocAttempts)[pid][cid].inc();

void* memory = allocator_[tid]->allocate(pid, requiredSize);
@@ -4912,7 +4917,9 @@ ACStats CacheAllocator<CacheTrait>::getACStats(TierId tid,
ClassId classId) const {
const auto& pool = allocator_[tid]->getPool(poolId);
const auto& ac = pool.getAllocationClass(classId);
return ac.getStats();
auto stats = ac.getStats();
stats.allocLatencyNs = (*stats_.classAllocLatency)[poolId][classId];
return stats;
}

template <typename CacheTrait>
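
The latency measurement is scope-based: the util::RollingLatencyTracker is constructed right after the (pid, cid) lookup and records the elapsed nanoseconds into the per-class rolling stats when it goes out of scope, so everything from that point until the function returns is included in the sample. A minimal stand-alone sketch of the same RAII pattern, with hypothetical names and a placeholder for the real allocation work (not CacheLib code):

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <new>

// Simplified rolling mean (stand-in for util::RollingStats).
struct RollingAvg {
  double avg{0};
  uint64_t n{0};
  void add(double v) { avg += (v - avg) / static_cast<double>(++n); }
};

// RAII scope timer: starts timing at construction, records on destruction
// (stand-in for util::RollingLatencyTracker).
struct ScopeTimer {
  explicit ScopeTimer(RollingAvg& s)
      : stats(&s), begin(std::chrono::steady_clock::now()) {}
  ~ScopeTimer() {
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                  std::chrono::steady_clock::now() - begin)
                  .count();
    stats->add(static_cast<double>(ns));
  }
  RollingAvg* stats;
  std::chrono::time_point<std::chrono::steady_clock> begin;
};

RollingAvg gAllocLatency; // kept per (pool, class) in the real code

void* allocateLike(std::size_t bytes) {
  ScopeTimer timer{gAllocLatency}; // everything below is part of the sample
  return ::operator new(bytes);    // stand-in for allocator_[tid]->allocate(...); caller frees
}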
2 changes: 2 additions & 0 deletions cachelib/allocator/CacheStats.cpp
@@ -43,6 +43,8 @@ void Stats::init() {
initToZero(*fragmentationSize);
initToZero(*chainedItemEvictions);
initToZero(*regularItemEvictions);

classAllocLatency = std::make_unique<PerPoolClassRollingStats>();
}

template <int>
8 changes: 8 additions & 0 deletions cachelib/allocator/CacheStatsInternal.h
@@ -21,6 +21,7 @@
#include "cachelib/allocator/Cache.h"
#include "cachelib/allocator/memory/MemoryAllocator.h"
#include "cachelib/common/AtomicCounter.h"
#include "cachelib/common/RollingStats.h"

namespace facebook {
namespace cachelib {
@@ -229,6 +230,13 @@ struct Stats {
std::unique_ptr<PerPoolClassAtomicCounters> chainedItemEvictions{};
std::unique_ptr<PerPoolClassAtomicCounters> regularItemEvictions{};

using PerPoolClassRollingStats =
std::array<std::array<util::RollingStats, MemoryAllocator::kMaxClasses>,
MemoryPoolManager::kMaxPools>;

// rolling latency tracking for every alloc class in every pool
std::unique_ptr<PerPoolClassRollingStats> classAllocLatency{};

// Eviction failures due to parent cannot be removed from access container
AtomicCounter evictFailParentAC{0};

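
classAllocLatency follows the same layout as the existing per-pool, per-class counters: a fixed-size 2-D std::array indexed first by pool id and then by class id, heap-allocated behind a unique_ptr. A small sketch of that indexing pattern, using a plain double in place of util::RollingStats and illustrative (not CacheLib's) size constants:

#include <array>
#include <cstddef>
#include <memory>

constexpr std::size_t kMaxPools = 64;    // illustrative only
constexpr std::size_t kMaxClasses = 128; // illustrative only

using PerPoolClassDouble =
    std::array<std::array<double, kMaxClasses>, kMaxPools>;

int main() {
  // Heap-allocate the fixed-size table, as Stats::init() does for classAllocLatency.
  auto table = std::make_unique<PerPoolClassDouble>();
  const std::size_t pid = 3, cid = 7;  // hypothetical pool / class ids
  (*table)[pid][cid] = 1250.0;         // analogous to (*classAllocLatency)[pid][cid]
  return (*table)[pid][cid] > 0 ? 0 : 1;
}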
4 changes: 4 additions & 0 deletions cachelib/allocator/memory/MemoryAllocatorStats.h
@@ -22,6 +22,7 @@
#include <unordered_map>

#include "cachelib/allocator/memory/Slab.h"
#include "cachelib/common/RollingStats.h"

namespace facebook {
namespace cachelib {
@@ -49,6 +50,9 @@ struct ACStats {
// true if the allocation class is full.
bool full;

// Rolling allocation latency (in ns)
util::RollingStats allocLatencyNs;

constexpr unsigned long long totalSlabs() const noexcept {
return freeSlabs + usedSlabs;
}
6 changes: 4 additions & 2 deletions cachelib/cachebench/cache/CacheStats.h
@@ -223,8 +223,10 @@ struct Stats {
: stats.usageFraction();

out << folly::sformat(
"tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f}",
tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction)
"tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f} "
"rollingAvgAllocLatency: {:8.2f}ns",
tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction,
stats.allocLatencyNs.estimate())
<< std::endl;
});
}
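
With this change each per-class line in the cachebench stats output also reports the rolling average allocation latency for that class. An illustrative line with hypothetical values (spacing and size suffix approximate):

tid 0 pid 0 cid    5    72.00KB usageFraction: 0.83 rollingAvgAllocLatency:  1523.40ns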
90 changes: 90 additions & 0 deletions cachelib/common/RollingStats.h
@@ -0,0 +1,90 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <folly/Range.h>
#include <folly/logging/xlog.h>

#include <chrono>
#include <limits>

#include "cachelib/common/Utils.h"

namespace facebook {
namespace cachelib {
namespace util {

class RollingStats {
public:
// track latency by taking the value of duration directly.
void trackValue(double value) {
// In the highly unlikely case that cnt_ reaches its numerical limit,
// skip this update and reset the counter.
if (cnt_ == std::numeric_limits<uint64_t>::max()) {
cnt_ = 0;
return;
}
auto ratio = static_cast<double>(cnt_) / (cnt_ + 1);
avg_ *= ratio;
++cnt_;
avg_ += value / cnt_;
}

// Return the rolling average.
double estimate() { return avg_; }

private:
double avg_{0};
uint64_t cnt_{0};
};

class RollingLatencyTracker {
public:
explicit RollingLatencyTracker(RollingStats& stats)
: stats_(&stats), begin_(std::chrono::steady_clock::now()) {}
RollingLatencyTracker() {}
~RollingLatencyTracker() {
if (stats_) {
auto tp = std::chrono::steady_clock::now();
auto diffNanos =
std::chrono::duration_cast<std::chrono::nanoseconds>(tp - begin_)
.count();
stats_->trackValue(static_cast<double>(diffNanos));
}
}

RollingLatencyTracker(const RollingLatencyTracker&) = delete;
RollingLatencyTracker& operator=(const RollingLatencyTracker&) = delete;

RollingLatencyTracker(RollingLatencyTracker&& rhs) noexcept
: stats_(rhs.stats_), begin_(rhs.begin_) {
rhs.stats_ = nullptr;
}

RollingLatencyTracker& operator=(RollingLatencyTracker&& rhs) noexcept {
if (this != &rhs) {
this->~RollingLatencyTracker();
new (this) RollingLatencyTracker(std::move(rhs));
}
return *this;
}

private:
RollingStats* stats_{nullptr};
std::chrono::time_point<std::chrono::steady_clock> begin_;
};
} // namespace util
} // namespace cachelib
} // namespace facebook
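
A short usage sketch of the two classes above, outside the allocator: the tracker wraps a timed scope (here simulated with a sleep) and feeds the elapsed nanoseconds into a RollingStats instance, just as the allocation paths in CacheAllocator.h do with stats_.classAllocLatency:

#include <chrono>
#include <iostream>
#include <thread>

#include "cachelib/common/RollingStats.h"

int main() {
  facebook::cachelib::util::RollingStats stats;
  for (int i = 0; i < 5; ++i) {
    // Starts timing at construction...
    facebook::cachelib::util::RollingLatencyTracker tracker{stats};
    // ...stand-in for the allocation work being measured...
    std::this_thread::sleep_for(std::chrono::microseconds(100));
    // ...and records the elapsed nanoseconds when it leaves scope.
  }
  std::cout << "rolling avg latency: " << stats.estimate() << " ns\n";
  return 0;
}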