Commit dbd4a64
[TRTLLM-7963][feat] Cold L2 cache when doing autotune benchmarking.
When running ops in models, the L2 cache is usually cold, but during autotuning we do not clear the L2 cache between benchmark iterations. Some kernels are easily influenced by a warm versus cold L2 cache. To make kernel selection more accurate, this PR clears the L2 cache (via a circular-buffer method) during autotune benchmarking.

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
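For readers unfamiliar with the technique, the sketch below illustrates the circular-buffer idea in isolation: enough distinct copies of the inputs are rotated through that consecutive timed calls never read data still resident in L2. This is a minimal standalone sketch, not the autotuner's implementation; `profile_with_cold_l2`, `run_kernel`, and the hard-coded `l2_bytes` (the commit queries the real size from the CUDA driver) are illustrative names and assumptions.

import torch


def profile_with_cold_l2(run_kernel, inputs, repeat=30, l2_bytes=50 * 1024 * 1024):
    """Average kernel time (ms) with an approximately cold L2 before each call."""
    # Size of one full set of input tensors, in bytes.
    one_set_bytes = sum(t.numel() * t.element_size() for t in inputs)
    # Enough rotating copies to cover ~3x the L2 capacity, capped at repeat + 1.
    num_sets = min(l2_bytes * 3 // max(one_set_bytes, 1) + 1, repeat + 1)
    input_sets = [inputs] + [[t.clone() for t in inputs]
                             for _ in range(num_sets - 1)]

    stream = torch.cuda.current_stream()
    # Warm up on the last copy so the first timed call still sees a cold cache.
    for _ in range(3):
        run_kernel(input_sets[-1])
    stream.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record(stream=stream)
    for r in range(repeat):
        run_kernel(input_sets[r % num_sets])  # each call reads a different copy
    end.record(stream=stream)
    stream.synchronize()
    return start.elapsed_time(end) / repeat

The same reasoning explains one detail in the diff below: warm-up runs use input_tensor_batches[-1], so the first timed iteration (which indexes batch 0) starts on a buffer the warm-up never touched.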
1 parent 6b755fd commit dbd4a64

File tree

1 file changed (+81 −6 lines)

tensorrt_llm/_torch/autotuner.py

Lines changed: 81 additions & 6 deletions
@@ -13,6 +13,7 @@
 from typing import Any, Callable, Dict, List, Set, Tuple, Union
 
 import torch
+from cuda.bindings import driver
 
 import tensorrt_llm
 from tensorrt_llm.bindings.internal.runtime import delay_kernel
@@ -99,6 +100,7 @@ class TuningConfig:
     constraint_specs: Tuple[ConstraintSpec, ...] = ()
     tune_max_num_tokens: int = None
     inputs_pre_hook: Callable = None
+    use_cold_l2_cache: bool = True
 
 
 @dataclass(unsafe_hash=True)
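The new flag defaults to on. A hedged opt-out sketch (assuming, as the visible fields suggest, that every other TuningConfig field has a default):

from tensorrt_llm._torch.autotuner import TuningConfig

# Keep the previous warm-cache timing for an op whose production inputs
# really do stay resident in L2 between calls.
config = TuningConfig(use_cold_l2_cache=False)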
@@ -524,7 +526,7 @@ class AutoTuner:
     """
     _instance = None
 
-    def __init__(self, warmup=3, repeat=10, stream_delay_micro_secs=1000):
+    def __init__(self, warmup=3, repeat=30, stream_delay_micro_secs=1000):
        self.repeat = repeat
        self.warmup = warmup
        self.stream_delay_micro_secs = stream_delay_micro_secs
@@ -622,7 +624,7 @@ def choose_one(
                 self.stats.tuned_op_successful_configs[
                     custom_op] = self.stats.tuned_op_successful_configs.get(
                         custom_op, 0) + 1
-                logger.debug(
+                logger.info(
                     f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}."
                 )
             else:
@@ -685,7 +687,7 @@ def _profile_runners(
             for tac in valid_tactics:
                 try:
                     time_measured = self._profile_single_kernel(
-                        runner, input_tensors, tac, **kwargs)
+                        runner, input_tensors, tac, tuning_config, **kwargs)
                 except Exception as e:
                     # Handle None tensors for optional inputs
                     shapes = self._get_input_sizes(input_tensors)
@@ -727,6 +729,7 @@ def _profile_single_kernel(
         runner: TunableRunner,
         inputs: List[torch.Tensor],
         tactic: Any,
+        tuning_config: TuningConfig,
         **kwargs,
     ) -> float:
         """Profile a single kernel implementation for performance measurement.
@@ -744,10 +747,13 @@ def _profile_single_kernel(
         to get an average execution time. Stream synchronization and delays
         are used to ensure accurate timing.
         """
+        input_tensor_batches = self._prepare_input_tensors_with_batches(
+            inputs, tuning_config)
+
         stream = torch.cuda.current_stream()
         # warm up, no timing
         for _ in range(self.warmup):
-            runner(inputs, tactic=tactic, **kwargs)
+            runner(input_tensor_batches[-1], tactic=tactic, **kwargs)
         stream.synchronize()
 
         # Delay the profiled kernel launch to eliminate affects of host time overhead in profiling.
@@ -758,8 +764,12 @@
         end = torch.cuda.Event(enable_timing=True)
 
         start.record(stream=stream)
-        for _ in range(self.repeat):
-            runner(inputs, tactic=tactic, **kwargs)
+        for r in range(self.repeat):
+            runner(
+                input_tensor_batches[r % len(input_tensor_batches)],
+                tactic=tactic,
+                **kwargs,
+            )
         end.record(stream=stream)
         stream.synchronize()
 
@@ -939,6 +949,39 @@ def _prepare_input_tensors(
             tensors.append(tensor)
         return tensors
 
+    def _prepare_input_tensors_with_batches(
+        self,
+        inputs: List[torch.Tensor],
+        tuning_config: TuningConfig,
+    ) -> List[List[torch.Tensor]]:
+        if not tuning_config.use_cold_l2_cache:
+            return [inputs]
+
+        one_buffer_bytes = sum(
+            input.numel() *
+            input.element_size() if isinstance(input, torch.Tensor) else 0
+            for input in inputs)
+        if one_buffer_bytes <= 0:
+            logger.info(
+                "[Autotuner] No tensor inputs or zero-sized tensors; falling back to single-batch profiling."
+            )
+            return [inputs]
+
+        num_buffers = self._get_l2_cache_size_in_bytes(
+        ) * 3 // one_buffer_bytes + 1
+        num_buffers = min(num_buffers, self.repeat + 1)
+
+        inputs_list = [inputs]
+        for _ in range(num_buffers - 1):
+            inputs_list.append(
+                list(t.clone() if isinstance(t, torch.Tensor) else t
+                     for t in inputs))
+
+        logger.info(
+            f"[Autotuner] use_cold_l2_cache={tuning_config.use_cold_l2_cache}, use {num_buffers} different tensors for profiling"
+        )
+        return inputs_list
+
     def clear_cache(self) -> None:
         """Clear the profiling cache."""
         self.profiling_cache.clear()
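To put the buffer count in perspective (hypothetical numbers, not taken from the commit): with a 50 MB L2 cache and input tensors totaling 8 MB, num_buffers = 50 MB * 3 // 8 MB + 1 = 19 rotating copies, comfortably under the repeat + 1 = 31 cap, so about 144 MB of extra device memory (18 clones × 8 MB) is held only for the duration of profiling.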
@@ -957,3 +1000,35 @@ def print_profiling_cache(self):
             logger.debug(
                 f"[Autotuner] {key}: (runner_id={runner_id}, tactic={tactic}, min_time={min_time})"
             )
+
+    def _get_l2_cache_size_in_bytes(self, device_id: int = 0) -> int:
+        device = self._checkCudaErrors(driver.cuDeviceGet(device_id))
+        return self._checkCudaErrors(
+            driver.cuDeviceGetAttribute(
+                driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
+                device,
+            ))
+
+    def _checkCudaErrors(self, result) -> Any:
+        status = result[0]
+        if status != driver.CUresult.CUDA_SUCCESS:
+            code = getattr(status, "value", status)
+            raise RuntimeError(
+                f"CUDA error code={code}({self._cudaGetErrorEnum(status)})")
+        # CUDA APIs always return the status as the first element of the result tuple
+        if len(result) == 1:
+            return None
+        elif len(result) == 2:
+            return result[1]
+        else:
+            return result[1:]
+
+    def _cudaGetErrorEnum(self, error) -> str:
+        from cuda.bindings import nvrtc
+        if isinstance(error, driver.CUresult):
+            err, name = driver.cuGetErrorName(error)
+            return name if err == driver.CUresult.CUDA_SUCCESS else "<unknown>"
+        elif isinstance(error, nvrtc.nvrtcResult):
+            return nvrtc.nvrtcGetErrorString(error)[1]
+        else:
+            raise RuntimeError("Unknown error type: {}".format(error))
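For reference, the raw tuple-unpacking pattern that _checkCudaErrors wraps looks like this when used directly. A standalone sketch, assuming the cuda-python bindings are installed; l2_cache_bytes is a hypothetical helper name:

from cuda.bindings import driver


def l2_cache_bytes(device_id: int = 0) -> int:
    # Each driver call returns (status, *values); unpack and check the status.
    (err,) = driver.cuInit(0)
    assert err == driver.CUresult.CUDA_SUCCESS
    err, device = driver.cuDeviceGet(device_id)
    assert err == driver.CUresult.CUDA_SUCCESS
    err, size = driver.cuDeviceGetAttribute(
        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, device)
    assert err == driver.CUresult.CUDA_SUCCESS
    return size


print(f"L2 cache: {l2_cache_bytes() / (1 << 20):.1f} MiB")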
