NVIDIA · Funatiq · Aug 24, 2025 · Aug 8, 2025 · Aug 8, 2025 · Aug 8, 2025
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
@@ -69,32 +69,6 @@ function(add_gtest test_name test_src)
   add_dependencies(google-tests ${test_name})
 endfunction()
 
-add_subdirectory(unit_tests)
-
-add_gtest(mpiUtilsTest runtime/mpiUtilsTest.cpp)
-
-add_gtest(gptDecoderTest runtime/gptDecoderTest.cpp)
-add_gtest(gptDecoderBatchedTest runtime/gptDecoderBatchedTest.cpp)
-add_gtest(medusaModuleTest runtime/medusaModuleTest.cpp)
-
-add_gtest(moeLoadBalancerTest runtime/moeLoadBalancerTest.cpp)
-
-add_gtest(sanitizerTest runtime/sanitizerTest.cpp)
-
-add_gtest(eaglePackDataTest kernels/eaglePackDataTest.cpp)
-
-add_gtest(medusaDecodeLayerTest layers/medusaDecodeLayerTest.cpp)
-
-add_gtest(moeLoadBalanceKernelTest kernels/moeLoadBalanceKernelTest.cpp)
-
-add_gtest(eagleLayerTest layers/eagleLayerTest.cpp)
-
 add_subdirectory(utils)
-
-if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager)
-  add_subdirectory(batch_manager)
-endif()
-
-if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/executor)
-  add_subdirectory(executor)
-endif()
+add_subdirectory(unit_tests)
+add_subdirectory(e2e_tests)
diff --git a/cpp/tests/e2e_tests/CMakeLists.txt b/cpp/tests/e2e_tests/CMakeLists.txt
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION &
+# AFFILIATES. All rights reserved. SPDX-License-Identifier: NVIDIA TensorRT
+# Source Code License Agreement
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this material and related documentation without an express
+# license agreement from NVIDIA CORPORATION or its affiliates is strictly
+# prohibited.
+
+add_subdirectory(batch_manager)
+add_subdirectory(executor)
diff --git a/cpp/tests/batch_manager/CMakeLists.txt b/cpp/tests/e2e_tests/batch_manager/CMakeLists.txt
@@ -9,13 +9,10 @@
 # license agreement from NVIDIA CORPORATION or its affiliates is strictly
 # prohibited.
 
-add_gtest(cacheTransceiverTest cacheTransceiverTest.cpp)
-
+# guidedDecoderTest requires model tokenizer info, so it's easier to run it with
+# e2e tests instead of unit tests.
+add_gtest(guidedDecoderTest guidedDecoderTest.cpp)
+add_gtest(trtEncoderModelTest trtEncoderModelTest.cpp)
 add_gtest(trtGptModelTest trtGptModelTest.cpp)
 add_gtest(trtGptModelRealDecoderTest trtGptModelRealDecoderTest.cpp)
 target_link_libraries(trtGptModelRealDecoderTest PRIVATE testingUtils)
-
-add_gtest(peftCacheManagerTest peftCacheManagerTest.cpp)
-add_gtest(trtEncoderModelTest trtEncoderModelTest.cpp)
-add_gtest(guidedDecoderTest guidedDecoderTest.cpp)
-add_gtest(blockKeyTest blockKeyTest.cpp)
diff --git a/cpp/tests/batch_manager/guidedDecoderTest.cpp b/cpp/tests/e2e_tests/batch_manager/guidedDecoderTest.cpp
diff --git a/cpp/tests/batch_manager/trtEncoderModelTest.cpp b/cpp/tests/e2e_tests/batch_manager/trtEncoderModelTest.cpp
diff --git a/cpp/tests/batch_manager/trtGptModelRealDecoderTest.cpp b/cpp/tests/e2e_tests/batch_manager/trtGptModelRealDecoderTest.cpp
diff --git a/cpp/tests/batch_manager/trtGptModelTest.cpp b/cpp/tests/e2e_tests/batch_manager/trtGptModelTest.cpp
diff --git a/cpp/tests/executor/CMakeLists.txt b/cpp/tests/e2e_tests/executor/CMakeLists.txt
diff --git a/cpp/tests/executor/disaggExecutor.h b/cpp/tests/e2e_tests/executor/disaggExecutor.h
diff --git a/cpp/tests/executor/disaggExecutorTest.cpp b/cpp/tests/e2e_tests/executor/disaggExecutorTest.cpp
@@ -1405,7 +1405,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaCon2TP1Gen1TP2PP2DisaaggOrchestrator, DisaggOrches
         ),
     generateTestNameDisaggParams);
 
-INSTANTIATE_TEST_SUITE_P(LlamaCon2TP2Gen2TP1DisaaggSpawnOrchestrator, DisaggOrchestratorParamsTest,
+INSTANTIATE_TEST_SUITE_P(LlamaCon2TP2Gen2TP1DisaggSpawnOrchestrator, DisaggOrchestratorParamsTest,
     testing::Combine(                                                                                      //
         testing::Values(1),                                                                                // processNum
         testing::Values(
@@ -1418,7 +1418,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaCon2TP2Gen2TP1DisaaggSpawnOrchestrator, DisaggOrch
         ),
     generateTestNameDisaggParams);
 
-INSTANTIATE_TEST_SUITE_P(LlamaCon2TP1Gen2PP2DisaaggSpawnOrchestrator, DisaggOrchestratorParamsTest,
+INSTANTIATE_TEST_SUITE_P(LlamaCon2TP1Gen2PP2DisaggSpawnOrchestrator, DisaggOrchestratorParamsTest,
     testing::Combine(                                                                                      //
         testing::Values(1),                                                                                // processNum
         testing::Values(

diff --git a/cpp/tests/executor/encDecTest.cpp b/cpp/tests/e2e_tests/executor/encDecTest.cpp
diff --git a/cpp/tests/executor/executorMockTest.cpp b/cpp/tests/e2e_tests/executor/executorMockTest.cpp
diff --git a/cpp/tests/executor/executorTest.cpp b/cpp/tests/e2e_tests/executor/executorTest.cpp
diff --git a/cpp/tests/executor/executorTest.h b/cpp/tests/e2e_tests/executor/executorTest.h
diff --git a/cpp/tests/unit_tests/CMakeLists.txt b/cpp/tests/unit_tests/CMakeLists.txt
@@ -19,6 +19,7 @@ endif()
 
 add_subdirectory(common)
 add_subdirectory(kernels)
 add_subdirectory(layers)
+add_subdirectory(multi_gpu)
 add_subdirectory(runtime)
 add_subdirectory(thop)

diff --git a/cpp/tests/unit_tests/batch_manager/CMakeLists.txt b/cpp/tests/unit_tests/batch_manager/CMakeLists.txt
@@ -9,12 +9,14 @@
 # license agreement from NVIDIA CORPORATION or its affiliates is strictly
 # prohibited.
 
+add_gtest(blockKeyTest blockKeyTest.cpp)
+add_gtest(cacheTransBufferTest cacheTransBufferTest.cpp)
 add_gtest(capacitySchedulerTest capacitySchedulerTest.cpp)
 add_gtest(contextProgressTest contextProgressTest.cu)
 add_gtest(evictionPolicyTest evictionPolicyTest.cpp)
 add_gtest(kvCacheManagerTest kvCacheManagerTest.cpp)
 add_gtest(kvCacheUtilsTest kvCacheUtilsTest.cpp)
 add_gtest(llmRequestTest llmRequestTest.cpp)
 add_gtest(microBatchSchedulerTest microBatchSchedulerTest.cpp)
+add_gtest(peftCacheManagerTest peftCacheManagerTest.cpp)
 add_gtest(staticThreadPoolTest staticThreadPoolTest.cpp)
-add_gtest(cacheTransBufferTest cacheTransBufferTest.cpp)
diff --git a/cpp/tests/batch_manager/blockKeyTest.cpp b/cpp/tests/unit_tests/batch_manager/blockKeyTest.cpp
diff --git a/cpp/tests/batch_manager/peftCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/peftCacheManagerTest.cpp
diff --git a/cpp/tests/unit_tests/kernels/CMakeLists.txt b/cpp/tests/unit_tests/kernels/CMakeLists.txt
@@ -44,17 +44,6 @@ add_gtest(mlaChunkedPrefillTest mlaChunkedPrefillTest.cu)
 
 add_gtest(fusedMoeCommKernelTest fusedMoeCommKernelTest.cpp)
 
-if(NOT ENABLE_MULTI_DEVICE EQUAL 0)
-  add_gtest(allReduceKernelTest allReduce/allReduceKernelTest.cu)
-  add_gtest(allReduceFusionTest allReduce/allReduceFusionTest.cu)
-  add_gtest(gemmAllReduceTest allReduce/gemmAllReduceTest.cu)
-  if(USING_OSS_CUTLASS_ALLREDUCE_GEMM)
-    target_link_libraries(gemmAllReduceTest PRIVATE ar_gemm_src)
-    target_compile_definitions(gemmAllReduceTest
-                               PRIVATE USING_OSS_CUTLASS_ALLREDUCE_GEMM)
-  endif()
-endif()
-
 add_gtest(
   gemmSwigluRunnerTest
   fused_gated_gemm/gemmSwigluRunnerTest.cu
@@ -88,11 +77,13 @@ set(SAMPLING_KERNEL_TEST_SRC
     sampling/samplingTest.cpp sampling/samplingTopKTest.cpp
     sampling/samplingTopPTest.cpp sampling/samplingAirTopPTest.cpp
     sampling/samplingPenaltyTest.cpp sampling/samplingUtilsTest.cu)
-
 add_gtest(samplingKernelsTest "${SAMPLING_KERNEL_TEST_SRC}")
 
 set(ROUTING_KERNEL_TEST_SRC
     routing/routingTest.cpp routing/routingLlama4Test.cpp
     routing/routingRenormalizeTest.cpp routing/routingDeepSeekTest.cpp)
-
 add_gtest(routingKernelsTest "${ROUTING_KERNEL_TEST_SRC}")
+
+add_gtest(moeLoadBalanceKernelTest moeLoadBalanceKernelTest.cpp)
+
+add_gtest(eaglePackDataTest eaglePackDataTest.cpp)
diff --git a/cpp/tests/kernels/eaglePackDataTest.cpp b/cpp/tests/unit_tests/kernels/eaglePackDataTest.cpp
diff --git a/cpp/tests/kernels/moeLoadBalanceKernelTest.cpp b/cpp/tests/unit_tests/kernels/moeLoadBalanceKernelTest.cpp
diff --git a/cpp/tests/unit_tests/layers/CMakeLists.txt b/cpp/tests/unit_tests/layers/CMakeLists.txt
@@ -31,5 +31,7 @@ set(LOOKAHEAD_DECODING_TEST_SRC randomLlm.cpp lookaheadDecodingLayerTest.cpp)
 add_gtest(lookaheadDecodingLayerTest "${LOOKAHEAD_DECODING_TEST_SRC}")
 
 add_gtest(dynamicDecodeLayerTest dynamicDecodeLayerTest.cpp)
+add_gtest(eagleLayerTest eagleLayerTest.cpp)
 add_gtest(explicitDraftTokensLayerTest explicitDraftTokensLayerTest.cpp)
 add_gtest(layerUtilsTest layerUtilsTest.cpp)
+add_gtest(medusaDecodeLayerTest medusaDecodeLayerTest.cpp)
diff --git a/cpp/tests/layers/eagleLayerTest.cpp b/cpp/tests/unit_tests/layers/eagleLayerTest.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "tests/layers/eagleLayerTest.h"
+#include "eagleLayerTest.h"
 #include "tensorrt_llm/common/memoryUtils.h"
 #include "tensorrt_llm/kernels/decodingCommon.h"
 #include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h"

diff --git a/cpp/tests/layers/eagleLayerTest.h b/cpp/tests/unit_tests/layers/eagleLayerTest.h
diff --git a/cpp/tests/layers/medusaDecodeLayerTest.cpp b/cpp/tests/unit_tests/layers/medusaDecodeLayerTest.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "tests/layers/medusaDecodeLayerTest.h"
+#include "medusaDecodeLayerTest.h"
 #include "tensorrt_llm/kernels/decodingCommon.h"
 #include "tensorrt_llm/runtime/medusaModule.h"
 #include "tensorrt_llm/runtime/runtimeKernels.h"

diff --git a/cpp/tests/layers/medusaDecodeLayerTest.h b/cpp/tests/unit_tests/layers/medusaDecodeLayerTest.h
diff --git a/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt b/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION &
+# AFFILIATES. All rights reserved. SPDX-License-Identifier: NVIDIA TensorRT
+# Source Code License Agreement
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this material and related documentation without an express
+# license agreement from NVIDIA CORPORATION or its affiliates is strictly
+# prohibited.
+
+add_subdirectory(kernels)
+
+add_gtest(cacheTransceiverTest cacheTransceiverTest.cpp)
+add_gtest(mpiUtilsTest mpiUtilsTest.cpp)
+add_gtest(userBufferTest userBufferTest.cpp)
diff --git a/cpp/tests/batch_manager/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
diff --git a/cpp/tests/unit_tests/multi_gpu/kernels/CMakeLists.txt b/cpp/tests/unit_tests/multi_gpu/kernels/CMakeLists.txt
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION &
+# AFFILIATES. All rights reserved. SPDX-License-Identifier: NVIDIA TensorRT
+# Source Code License Agreement
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this material and related documentation without an express
+# license agreement from NVIDIA CORPORATION or its affiliates is strictly
+# prohibited.
+
+# Register the all-reduce kernel tests only for multi-device builds. The guard
+# checks the option's truthiness rather than `EQUAL 0`: EQUAL is a numeric
+# comparison, so OFF/FALSE/NO never compared equal to 0 and the tests were
+# still added. An undefined variable keeps the previous behaviour (enabled).
+if(NOT DEFINED ENABLE_MULTI_DEVICE OR ENABLE_MULTI_DEVICE)
+  add_gtest(allReduceKernelTest allReduce/allReduceKernelTest.cu)
+  add_gtest(allReduceFusionTest allReduce/allReduceFusionTest.cu)
+  add_gtest(gemmAllReduceTest allReduce/gemmAllReduceTest.cu)
+  if(USING_OSS_CUTLASS_ALLREDUCE_GEMM)
+    target_link_libraries(gemmAllReduceTest PRIVATE ar_gemm_src)
+    target_compile_definitions(gemmAllReduceTest
+                               PRIVATE USING_OSS_CUTLASS_ALLREDUCE_GEMM)
+  endif()
+endif()
diff --git a/cpp/tests/unit_tests/kernels/allReduce/allReduceFusionTest.cu b/cpp/tests/unit_tests/multi_gpu/kernels/allReduce/allReduceFusionTest.cu
@@ -501,11 +501,8 @@ TEST(Kernel_AllReduceFusion, AllReduceAccuracyRandomTokenNum)
     auto& comm = mpi::MpiComm::world();
     auto world_size = comm.getSize();
     auto rank = comm.getRank();
-    if (world_size % 2)
-    {
-        TLLM_LOG_WARNING("world size is not a multiple of 2, return");
-        return;
-    }
+    ASSERT_EQ(world_size % 2, 0) << "Requires even world size (got " << world_size << ")";
+
     int iter = 100;
     std::vector<int> candidate_hidden_dim{1024, 2048, 4096, 7168, 8192};
     int min_token_num = 1;
@@ -537,11 +534,8 @@ TEST(Kernel_AllReduceFusion, AllReduceAccuracyFixedTokenNum)
     auto& comm = mpi::MpiComm::world();
     auto world_size = comm.getSize();
     auto rank = comm.getRank();
-    if (world_size % 2)
-    {
-        TLLM_LOG_WARNING("world size is not a multiple of 2, return");
-        return;
-    }
+    ASSERT_EQ(world_size % 2, 0) << "Requires even world size (got " << world_size << ")";
+
     int iter = 10;
     std::vector<int> candidate_hidden_dim{1024, 2048, 4096, 7168, 8192};
     int min_token_num = 1;
@@ -603,11 +597,8 @@ TEST(Kernel_AllReduceFusion, AllReduceFusionAccuracyDifferentHiddenDim)
     auto& comm = mpi::MpiComm::world();
     auto world_size = comm.getSize();
     auto rank = comm.getRank();
-    if (world_size % 2)
-    {
-        TLLM_LOG_WARNING("world size is not a multiple of 2, return");
-        return;
-    }
+    ASSERT_EQ(world_size % 2, 0) << "Requires even world size (got " << world_size << ")";
+
     int const arch = tensorrt_llm::common::getSMVersion();
     if (arch >= 100)
     {
@@ -647,11 +638,8 @@ TEST(Kernel_AllReduceFusion, AllReduceFusionAccuracyDifferentDType)
     auto& comm = mpi::MpiComm::world();
     auto world_size = comm.getSize();
     auto rank = comm.getRank();
-    if (world_size % 2)
-    {
-        TLLM_LOG_WARNING("world size is not a multiple of 2, return");
-        return;
-    }
+    ASSERT_EQ(world_size % 2, 0) << "Requires even world size (got " << world_size << ")";
+
     std::vector<int> candidate_hidden_dim{1024, 2048, 4096, 7168, 8192};
     int min_token_num = 1;
     int max_token_num = 2048;
@@ -683,53 +671,52 @@ TEST(Kernel_AllReduceFusion, AllReduceFusionAccuracyDifferentDType)
 TEST(Kernel_AllReduceFusion, Perf)
 {
     int const arch = tensorrt_llm::common::getSMVersion();
-    if (arch >= 100)
+    if (arch < 100)
+    {
+        GTEST_SKIP() << "Skipping test for SM < 100";
+    }
+
+    using Runner = TestRunner<half, ar_fusion::AllReduceFusionPattern::kARResidualRMSNormFP4Quant>;
+    auto& comm = mpi::MpiComm::world();
+    auto world_size = comm.getSize();
+    auto rank = comm.getRank();
+    ASSERT_EQ(world_size % 2, 0) << "Requires even world size (got " << world_size << ")";
+
+    int warmup = 100, iter = 300;
+    int hidden_dim = 7168;
+    std::vector<int> candidate_token_num{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};
+    int max_token_num = 2048;
+    Runner runner(max_token_num, hidden_dim);
+    for (auto token_num : candidate_token_num)
     {
-        using Runner = TestRunner<half, ar_fusion::AllReduceFusionPattern::kARResidualRMSNormFP4Quant>;
-        auto& comm = mpi::MpiComm::world();
-        auto world_size = comm.getSize();
-        auto rank = comm.getRank();
-        if (world_size % 2)
+        auto latency = runner.benchmark(&Runner::run_kernel, warmup, iter, token_num, hidden_dim);
+        if (rank == 0)
         {
-            TLLM_LOG_WARNING("world size is not a multiple of 2, return");
-            return;
+            TLLM_LOG_INFO(
+                "token_num %-4d, hidden_dim %-4d, fusion kernel latency %4.4fus", token_num, hidden_dim, latency);
         }
-        int warmup = 100, iter = 300;
-        int hidden_dim = 7168;
-        std::vector<int> candidate_token_num{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};
-        int max_token_num = 2048;
-        Runner runner(max_token_num, hidden_dim);
-        for (auto token_num : candidate_token_num)
+        auto nccl_latency = runner.benchmark(&Runner::run_nccl_allreduce, warmup, iter, token_num, hidden_dim);
+        if (rank == 0)
         {
-            auto latency = runner.benchmark(&Runner::run_kernel, warmup, iter, token_num, hidden_dim);
-            if (rank == 0)
-            {
-                TLLM_LOG_INFO(
-                    "token_num %-4d, hidden_dim %-4d, fusion kernel latency %4.4fus", token_num, hidden_dim, latency);
-            }
-            auto nccl_latency = runner.benchmark(&Runner::run_nccl_allreduce, warmup, iter, token_num, hidden_dim);
-            if (rank == 0)
-            {
-                TLLM_LOG_INFO("nccl allreduce latency %4.4fus", nccl_latency);
-            }
-            auto residual_latency = runner.benchmark(&Runner::run_residual_add, warmup, iter, token_num, hidden_dim);
-            if (rank == 0)
-            {
-                TLLM_LOG_INFO("residual add latency %4.4fus", residual_latency);
-            }
-            auto rms_latency = runner.benchmark(&Runner::run_rms_norm, warmup, iter, token_num, hidden_dim);
-            if (rank == 0)
-            {
-                TLLM_LOG_INFO("rms norm latency %4.4fus", rms_latency);
-            }
-            auto quant_latency = runner.benchmark(&Runner::run_fp4_quant, warmup, iter, token_num, hidden_dim);
-            if (rank == 0)
-            {
-                TLLM_LOG_INFO("fp4 quant latency %4.4fus", quant_latency);
-                auto tot_latency = nccl_latency + residual_latency + rms_latency + quant_latency;
-                TLLM_LOG_INFO("fusion kernel latency %4.4fus, nccl + ops latency %4.4fus, total speedup %2.4fx",
-                    latency, tot_latency, tot_latency / latency);
-            }
+            TLLM_LOG_INFO("nccl allreduce latency %4.4fus", nccl_latency);
+        }
+        auto residual_latency = runner.benchmark(&Runner::run_residual_add, warmup, iter, token_num, hidden_dim);
+        if (rank == 0)
+        {
+            TLLM_LOG_INFO("residual add latency %4.4fus", residual_latency);
+        }
+        auto rms_latency = runner.benchmark(&Runner::run_rms_norm, warmup, iter, token_num, hidden_dim);
+        if (rank == 0)
+        {
+            TLLM_LOG_INFO("rms norm latency %4.4fus", rms_latency);
+        }
+        auto quant_latency = runner.benchmark(&Runner::run_fp4_quant, warmup, iter, token_num, hidden_dim);
+        if (rank == 0)
+        {
+            TLLM_LOG_INFO("fp4 quant latency %4.4fus", quant_latency);
+            auto tot_latency = nccl_latency + residual_latency + rms_latency + quant_latency;
+            TLLM_LOG_INFO("fusion kernel latency %4.4fus, nccl + ops latency %4.4fus, total speedup %2.4fx", latency,
+                tot_latency, tot_latency / latency);
         }
     }
 }