Commits (24)
2003ccb feat: Ray integration (tongyuantongyu, Aug 18, 2025)
fd171ef Gracefully exit. (joyang-nv, Aug 18, 2025)
f560d01 fix single node disagg random failure (hchings, Aug 19, 2025)
b7d77ac revert cpu req (hchings, Aug 19, 2025)
c05b72c cleanup ray process rendering in process tree report (tongyuantongyu, Aug 18, 2025)
6c689d5 Use weakref for results and make ray shared queue exit gracefully. (joyang-nv, Aug 19, 2025)
f4c4113 Use different way to get weak ref of actor handle. (joyang-nv, Aug 19, 2025)
57892a1 Cache transceiver refactor (shuyixiong, Aug 19, 2025)
5fd3097 Remove unused code for ray. (joyang-nv, Aug 19, 2025)
955436f Refine code (shuyixiong, Aug 19, 2025)
c6310d0 update run_cluster.sh for multinode disagg (hchings, Aug 19, 2025)
f2ec920 WAR ProcessGroup not pickleable. (tongyuantongyu, Aug 20, 2025)
5004104 fix ci failure, fix abort_request() (hchings, Aug 20, 2025)
9f2e709 Add heartbeat to prevent trtllm-serve timeout due to gloo. Fix pre-co… (hchings, Aug 20, 2025)
115b971 ccacheTransceiver nits (tongyuantongyu, Aug 21, 2025)
9b76654 fix build (tongyuantongyu, Aug 21, 2025)
3abdec9 minor fix and code cleanup (hchings, Aug 21, 2025)
dc9e398 skip build mesh for single rank world (tongyuantongyu, Aug 22, 2025)
e0fd152 Warm up ray queue actors and optimize pickle out. (joyang-nv, Aug 21, 2025)
4bb1ef8 Clean up. (joyang-nv, Aug 24, 2025)
ddde721 nanobind fix and a few cleanup (hchings, Aug 24, 2025)
d87dce3 revert working extension; some renamings. (hchings, Aug 25, 2025)
599db47 Remove update_weights and reset_kv_cache and collective_rpc api (shuyixiong, Aug 25, 2025)
c5b5b33 Cleanup (shuyixiong, Aug 26, 2025)
Remove update_weights and reset_kv_cache and collective_rpc api
shuyixiong committed Aug 26, 2025
commit 599db4708e0c4e8bf47da5fe4117d2c73190a3da
20 changes: 0 additions & 20 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -737,12 +737,6 @@ class WindowBlockManager
return 0;
}

void resetReuseState()
{
mCachedBlocksRoot
= std::make_shared<KVCacheBlock>(KVCacheBlock::kCachedBlocksRootId, tensorrt_llm::kernels::KVCacheIndex{0});
}

private:
//! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -1126,14 +1120,6 @@ class BlockManager
//! \brief Update cache offsets for block at index
void updateCacheBlockOffsetsAtIdx(GenerationRequest& seq, SizeType32 windowSize, SizeType32 blockIdx);

void resetReuseState()
{
for (auto& [windowSize, manager] : mWindowBlockManagers)
{
manager.resetReuseState();
}
}

private:
[[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const
{
@@ -1304,7 +1290,6 @@ class BaseKVCacheManager

virtual void refreshBlocks() = 0;
virtual void flushIterationEvents() = 0;
virtual void resetReuseState() = 0;

[[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);

@@ -1654,11 +1639,6 @@ class KVCacheManager : public BaseKVCacheManager
mBlockManager.flushIterationEvents();
}

void resetReuseState() override
{
mBlockManager.resetReuseState();
}

/// @brief Finds the maximum attention window that can be used on a sequence, given some kv-cache block capacity.
///
/// @param inputLength The number of input tokens in the sequence.
3 changes: 1 addition & 2 deletions cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
@@ -423,8 +423,7 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m)
.def("get_cache_block_ids", &BaseKVCacheManager::getCacheBlockIds)
.def("get_batch_cache_block_ids", &BaseKVCacheManager::getBatchCacheBlockIds)
.def("get_newly_allocated_block_ids", &BaseKVCacheManager::getNewlyAllocatedBlockIds)
.def("flush_iteration_events", &BaseKVCacheManager::flushIterationEvents)
.def("reset_reuse_state", &BaseKVCacheManager::resetReuseState);
.def("flush_iteration_events", &BaseKVCacheManager::flushIterationEvents);

py::enum_<tbk::CacheType>(m, "CacheType")
.value("SELF", tbk::CacheType::kSELF)
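For reference, a short Python sketch of what this binding change means for callers. Here `kv_cache_manager` stands in for an instance of the bound BaseKVCacheManager; how it is constructed is outside this diff, so treat the snippet as an illustration rather than the library's documented API.

def flush_kv_cache_events(kv_cache_manager):
    # Assumes `kv_cache_manager` is a Python binding of BaseKVCacheManager as
    # bound in the file above. After this commit, flush_iteration_events() is
    # still exposed, while reset_reuse_state() is no longer bound.
    kv_cache_manager.flush_iteration_events()
    # Any remaining caller of the removed binding would now raise AttributeError:
    # kv_cache_manager.reset_reuse_state()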
5 changes: 0 additions & 5 deletions examples/llm-api/ray/run_ray_tests.sh
@@ -233,11 +233,6 @@ if [ -f "llm_inference_async_ray.py" ]; then
run_python_file "llm_inference_async_ray.py"
fi

# 4. test_update_weight_from_ipc.py
if [ -f "test_update_weight_from_ipc.py" ]; then
run_python_file "test_update_weight_from_ipc.py"
fi

# Run MPI guarding tests
run_python_file "../llm_inference.py"

231 changes: 0 additions & 231 deletions examples/llm-api/ray/test_update_weight_from_ipc.py

This file was deleted.

33 changes: 12 additions & 21 deletions tensorrt_llm/_torch/models/modeling_utils.py
@@ -849,9 +849,6 @@ def load_single_module(name, module):
for new_name in params_map[names[-1]]:
fw = filter_weights('.'.join(names[:-1] + [new_name]),
weights)
# tmp fixes to enable partial updates in old path
if not fw:
continue
if new_name in ['k_proj', 'v_proj']:
num_kv_heads_list = [num_kv_heads
] * len(fw) if isinstance(
@@ -868,27 +865,24 @@ def load_single_module(name, module):
}

module_weights.append(fw)
if module_weights:
module.load_weights(weights=module_weights)
module.load_weights(weights=module_weights)

else:
module_weights = filter_weights(name, weights)
if module_weights:
if hasattr(module, 'load_weights'):
module.load_weights(weights=[module_weights])
else:
for n, p in module._parameters.items():
if p is not None:
p.data.copy_(module_weights[n][:])
if hasattr(module, 'load_weights'):
module.load_weights(weights=[module_weights])
else:
for n, p in module._parameters.items():
if p is not None:
p.data.copy_(module_weights[n][:])

if os.environ.get("TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL",
False) in ["True", "true", "1", "yes", "y"]:
for name, module in tqdm(list(
model.named_modules(remove_duplicate=False)),
for name, module in tqdm(list(model.named_modules()),
desc="Loading weights"):
load_single_module(name, module)
else:
all_modules = dict(model.named_modules(remove_duplicate=False))
all_modules = dict(model.named_modules())
serial_load_modules = []
if preload_weight_modules is not None:
for module in preload_weight_modules:
@@ -904,13 +898,10 @@ def load_single_module(name, module):
del all_modules[module]
pbar.close()

pbar = tqdm(list(model.named_modules(remove_duplicate=False)),
pbar = tqdm(list(model.named_modules()),
desc="Loading weights concurrently")
args_list = [
(name, module)
for name, module in model.named_modules(remove_duplicate=False)
if name not in serial_load_modules
]
args_list = [(name, module) for name, module in model.named_modules()
if name not in serial_load_modules]
run_concurrently(load_single_module, args_list, pbar=pbar)


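For context on the concurrent path retained above: `run_concurrently` is invoked with `(name, module)` pairs and a tqdm progress bar. The sketch below shows one way such a helper could be implemented with a thread pool; it is an illustration under stated assumptions, not the actual TensorRT-LLM helper, and its signature is inferred only from the call site in this diff.

from concurrent.futures import ThreadPoolExecutor

def run_concurrently(func, args_list, pbar=None, max_workers=8):
    # Illustrative sketch: run func(*args) for every tuple in args_list on a
    # thread pool, updating the optional tqdm progress bar as each call finishes.
    # The real helper may differ in signature, worker count, and error handling.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(func, *args) for args in args_list]
        for future in futures:
            future.result()  # surface any exception raised inside a worker
            if pbar is not None:
                pbar.update(1)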