Commits (24)

2003ccb
feat: Ray integration
tongyuantongyu Aug 18, 2025
fd171ef
Gracefully exit.
joyang-nv Aug 18, 2025
f560d01
fix single node disagg random failure
hchings Aug 19, 2025
b7d77ac
revert cpu req
hchings Aug 19, 2025
c05b72c
cleanup ray process rendering in process tree report
tongyuantongyu Aug 18, 2025
6c689d5
Use weakref for results and make ray shared queue exit gracefully.
joyang-nv Aug 19, 2025
f4c4113
Use different way to get weak ref of actor handle.
joyang-nv Aug 19, 2025
57892a1
Cache transceiver refactor
shuyixiong Aug 19, 2025
5fd3097
Remove unused code for ray.
joyang-nv Aug 19, 2025
955436f
Refine code
shuyixiong Aug 19, 2025
c6310d0
update run_cluster.sh for multinode disagg
hchings Aug 19, 2025
f2ec920
WAR ProcessGroup not pickleable.
tongyuantongyu Aug 20, 2025
5004104
fix ci failure, fix abort_request()
hchings Aug 20, 2025
9f2e709
Add heartbeat to prevent trtllm-serve timeout due to gloo. Fix pre-co…
hchings Aug 20, 2025
115b971
ccacheTransceiver nits
tongyuantongyu Aug 21, 2025
9b76654
fix build
tongyuantongyu Aug 21, 2025
3abdec9
minor fix and code cleanup
hchings Aug 21, 2025
dc9e398
skip build mesh for single rank world
tongyuantongyu Aug 22, 2025
e0fd152
Warm up ray queue actors and optimize pickle out.
joyang-nv Aug 21, 2025
4bb1ef8
Clean up.
joyang-nv Aug 24, 2025
ddde721
nanobind fix and a few cleanup
hchings Aug 24, 2025
d87dce3
revert working extension; some renamings.
hchings Aug 25, 2025
599db47
Remove update_weights and reset_kv_cache and collective_rpc api
shuyixiong Aug 25, 2025
c5b5b33
Cleanup
shuyixiong Aug 26, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -46,6 +46,7 @@ tensorrt_llm/deep_ep_cpp_tllm.pyi
tensorrt_llm/deep_gemm/
tensorrt_llm/deep_gemm_cpp_tllm.*.so
tensorrt_llm/deep_gemm_cpp_tllm.pyi
tensorrt_llm/pg_utils_bindings.*.so
*docs/cpp_docs*
*docs/source/_cpp_gen*
docs/source/**/*.rst
144 changes: 140 additions & 4 deletions cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
@@ -23,9 +23,17 @@
#include "tensorrt_llm/executor/cacheCommunicator.h"
#include "tensorrt_llm/executor/dataTransceiverState.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tensorrt_llm/runtime/utils/pgUtils.h"
#include <future>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <pybind11/pybind11.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <torch/custom_class.h>
#include <torch/python.h>
#include <type_traits>
#include <vector>

using SizeType32 = tensorrt_llm::runtime::SizeType32;

@@ -37,6 +45,131 @@ class BaseCacheTransceiver
class DataResponder;
class DataRequester;

class CacheTransceiverComm
{
public:
    // Construct from a non-owning raw pointer; the aliasing shared_ptr
    // constructor (empty owner) guarantees the pointer is never deleted.
    explicit CacheTransceiverComm(mpi::MpiComm const* mpiComm)
        : mMpiComm(std::shared_ptr<mpi::MpiComm const>(nullptr), mpiComm)
    {
    }

    // Construct from a shared_ptr with shared ownership
    explicit CacheTransceiverComm(std::shared_ptr<mpi::MpiComm const> mpiComm)
        : mMpiComm(std::move(mpiComm))
    {
    }

    // Construct from a ProcessGroup communicator
    explicit CacheTransceiverComm(c10::intrusive_ptr<c10d::ProcessGroup> pgComm)
        : mPgComm(std::move(pgComm))
    {
    }

    ~CacheTransceiverComm() = default;

    bool isMpi() const noexcept
    {
        return mMpiComm != nullptr;
    }

    int getRank() const
    {
        if (isMpi())
        {
            return mMpiComm->getRank();
        }
        return mPgComm->getRank();
    }

    int getSize() const
    {
        if (isMpi())
        {
            return mMpiComm->getSize();
        }
        return mPgComm->getSize();
    }

    void allgather(void const* sendbuf, void* recvbuf, int count, mpi::MpiType dtype) const
    {
        if (isMpi())
        {
            mMpiComm->allgather(sendbuf, recvbuf, count, dtype);
            return;
        }
        TLLM_THROW("Input arguments only supported in mpi");
    }

    template <typename Input, typename Output>
    bool allgather(Input input, Output output, c10d::AllgatherOptions options = c10d::AllgatherOptions()) const
    {
        if (isMpi())
        {
            TLLM_THROW("Input arguments only supported in pg");
        }
        tensorrt_llm::pg_utils::PgHelper pgh{mPgComm};

        PGCHECK_THROW(pgh.allgather(input, output, options));
        return true;
    }

    template <typename Input, typename Output>
    bool allgatherv(Input input, Output output, std::vector<int> const& sizes,
        c10d::AllgatherOptions options = c10d::AllgatherOptions()) const
    {
        if (isMpi())
        {
            TLLM_THROW("Input arguments only supported in pg");
        }
        tensorrt_llm::pg_utils::PgHelper pgh{mPgComm};
        PGCHECK_THROW(pgh.allgatherv(input, output, sizes, options));
        return true;
    }

    bool allgatherv(void const* sendbuf, int sendcount, mpi::MpiType sendtype, void* recvbuf,
        std::vector<int> const& recvcounts, std::vector<int> const& displs, mpi::MpiType recvtype) const
    {
        if (isMpi())
        {
            mMpiComm->allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype);
            return true;
        }
        TLLM_THROW("Input arguments only supported in mpi");
    }

    // Split into subcommunicators with MPI_Comm_split semantics: ranks passing
    // the same color form one subgroup, ordered by key.
    CacheTransceiverComm split(int color, int key)
    {
        if (isMpi())
        {
            auto subgroup = mMpiComm->split(color, key);
            return CacheTransceiverComm(std::make_shared<mpi::MpiComm const>(std::move(subgroup)));
        }
        bool const initialized = Py_IsInitialized();
        TLLM_CHECK_WITH_INFO(initialized, "Trying to use ProcessGroup communicator but Python is not initialized");
        try
        {
            pybind11::gil_scoped_acquire gil;
            auto const m = pybind11::module::import("tensorrt_llm._torch.distributed.pg_utils");
            // Properly box the existing intrusive_ptr ProcessGroup into an IValue
            // and convert to a Python object without constructing a new instance.
            auto const py_pg = torch::jit::toPyObject(c10::IValue(mPgComm));

            auto const py_sub_pg = m.attr("split")(color, key, py_pg);
            auto pgSub = torch::jit::toCustomClass<c10d::ProcessGroup>(py_sub_pg);
            return CacheTransceiverComm(pgSub);
        }
        catch (...)
        {
            TLLM_THROW("Failed to split process group");
        }
    }

private:
    std::shared_ptr<mpi::MpiComm const> mMpiComm;
    c10::intrusive_ptr<c10d::ProcessGroup> mPgComm;
};

class CacheTransceiverFactory
{
public:
@@ -114,9 +247,12 @@ class CacheTransceiver : public BaseCacheTransceiver
    std::unique_ptr<DataRequester> mDataRequester;
    std::vector<std::pair<LlmRequest*, std::future<void>>> mResponderFutures;
    std::vector<std::pair<LlmRequest*, std::future<void>>> mRequesterFutures;
    mpi::MpiComm const *mMpiGroupComm{nullptr}, *mMpiWorldComm{nullptr};
    std::shared_ptr<mpi::MpiComm> mMpiGroupTensorParaComm, mMpiGroupPipeParaComm, mMpiGroupDataComm,
        mMpiGroupTPInDPComm;
    // only for mpi backend, don't need it for ucx backend
    mpi::MpiComm const* mMpiWorldComm{nullptr};

    std::shared_ptr<CacheTransceiverComm> mGroupComm;
    std::shared_ptr<CacheTransceiverComm> mGroupTensorParaComm, mGroupPipeParaComm, mGroupDataComm, mGroupTPInDPComm;

    executor::kv_cache::CommState const* mCommState;
    std::unique_ptr<executor::kv_cache::CacheState> mCacheState;
    std::unique_ptr<executor::kv_cache::ConnectionManager> mManager;
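As a brief aside, here is a minimal, hypothetical usage sketch of the new CacheTransceiverComm wrapper. The enclosing namespace (tensorrt_llm::batch_manager) and the MpiComm::world() accessor are assumptions based on the surrounding code, and the color/key values are arbitrary; only the wrapper's API itself comes from the diff above.

#include "tensorrt_llm/batch_manager/cacheTransceiver.h"

// Sketch only: non-owning wrap of a process-wide MPI communicator,
// a rank query, and an MPI_Comm_split-style subgroup split.
void cacheTransceiverCommSketch()
{
    using tensorrt_llm::batch_manager::CacheTransceiverComm;

    // The raw-pointer constructor does not take ownership (aliasing shared_ptr).
    CacheTransceiverComm comm(&tensorrt_llm::mpi::MpiComm::world());

    int const rank = comm.getRank();

    // Ranks passing the same color form one subgroup, ordered by key.
    CacheTransceiverComm sub = comm.split(/*color=*/rank % 2, /*key=*/rank);
    (void) sub; // silence unused-variable warning in this sketch

    // The ProcessGroup path instead constructs from a
    // c10::intrusive_ptr<c10d::ProcessGroup>; its split() calls back into
    // Python via tensorrt_llm._torch.distributed.pg_utils.split.
}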
19 changes: 19 additions & 0 deletions cpp/include/tensorrt_llm/runtime/utils/mpiUtils.h
@@ -35,6 +35,7 @@
#include <cstdlib>
#include <memory>
#include <mutex>
#include <optional>
#include <thread>

#if ENABLE_MULTI_DEVICE
@@ -425,7 +426,25 @@ class MpiComm
        return !(rhs == *this);
    }

    bool couldUseMPI() const
    {
        if (!mDisableMPI.has_value())
        {
            char* val = std::getenv("TLLM_DISABLE_MPI");
            ;
            bool disable_mpi = false;
            if (val != NULL && std::string(val) == "1")
            {
                throw std::runtime_error("MPI is disabled, DON\'T USE MPI");
            }
            mDisableMPI = disable_mpi;
        }

        return mDisableMPI.value();
    }
Review comment from a contributor on lines +429 to +444:

⚠️ Potential issue

couldUseMPI() returns the wrong value and throws where a pure query is expected

  • The function name implies a pure check, but it throws when disabled; this makes if (couldUseMPI()) branches unusable.
  • It returns mDisableMPI.value() (i.e., “disabled?”) instead of “could use?”, causing it to return false when MPI is enabled.
  • It also relies on std::string without including <string> and uses std::runtime_error without including <stdexcept>.

This will break the intended backend gating and can cause spurious exceptions.

Apply this fix to make the API boolean and non-throwing; let callers decide behavior:

-    bool couldUseMPI() const
+    [[nodiscard]] bool couldUseMPI() const
     {
         if (!mDisableMPI.has_value())
         {
-            char* val = std::getenv("TLLM_DISABLE_MPI");
-            ;
-            bool disable_mpi = false;
-            if (val != NULL && std::string(val) == "1")
-            {
-                throw std::runtime_error("MPI is disabled, DON\'T USE MPI");
-            }
-            mDisableMPI = disable_mpi;
+            char const* val = std::getenv("TLLM_DISABLE_MPI");
+            bool const disableMpi = (val != nullptr) && (std::strcmp(val, "1") == 0);
+            mDisableMPI = disableMpi;
         }
-
-        return mDisableMPI.value();
+        // couldUse == !disabled
+        return !mDisableMPI.value();
     }

And add the missing headers (including <cstring> for the std::strcmp used in the fix):

+#include <cstring>
 #include <memory>
 #include <mutex>
+#include <optional>
+#include <stdexcept>
+#include <string>
 #include <thread>

If you prefer to raise on misuse, add a separate ensureMPIEnabled() that throws, while couldUseMPI() remains a pure query.
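A minimal sketch of that separate throwing variant, assuming couldUseMPI() has been rewritten as the pure query above (the placement inside MpiComm and the message text are illustrative):

    //! \brief Throwing counterpart to the pure couldUseMPI() query; sketch only.
    void ensureMPIEnabled() const
    {
        if (!couldUseMPI())
        {
            throw std::runtime_error("MPI is disabled via TLLM_DISABLE_MPI=1");
        }
    }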



private:
    mutable std::optional<bool> mDisableMPI;
    //! \brief Corresponds to `world()` by default, but can be overridden per process.
    static MpiComm& mutableSession();

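Finally, with the reviewer's fixed query semantics, backend selection could be gated in one place. A hedged sketch follows; the makeGroupComm helper and its construction details are illustrative assumptions, not part of this PR (CacheTransceiverComm is the wrapper from cacheTransceiver.h above):

// Sketch: choose the communicator backend once, driven by TLLM_DISABLE_MPI.
std::shared_ptr<CacheTransceiverComm> makeGroupComm(c10::intrusive_ptr<c10d::ProcessGroup> pg)
{
    auto const& world = tensorrt_llm::mpi::MpiComm::world();
    if (world.couldUseMPI())
    {
        // Non-owning wrap: the world communicator outlives the wrapper.
        return std::make_shared<CacheTransceiverComm>(&world);
    }
    // MPI disabled: fall back to the torch.distributed ProcessGroup path.
    return std::make_shared<CacheTransceiverComm>(std::move(pg));
}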