6 changes: 6 additions & 0 deletions tensorrt_llm/_torch/autotuner.py
@@ -92,10 +92,13 @@ class TuningConfig:
any value is provided to the choose_one function, the input tensor will be saturated
with the provided value.
If not provided, the autotuner will not consider the max num tokens.
inputs_pre_hook (Callable): A function that takes a list of input tensors and returns a list of modified input tensors.
It is called on the prepared input tensors before profiling, so that the tuning inputs match the real data distribution.
"""
dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...] = ()
constraint_specs: Tuple[ConstraintSpec, ...] = ()
tune_max_num_tokens: int = None
inputs_pre_hook: Callable = None


@dataclass(unsafe_hash=True)
@@ -660,6 +663,9 @@ def _profile_runners(
min_time = float('inf')
has_tuning_failure_occured = False
best_runner_id, best_tactic = None, None
# If the inputs_pre_hook is provided, it will be called before profiling.
if tuning_config.inputs_pre_hook is not None:
input_tensors = tuning_config.inputs_pre_hook(input_tensors)
for runner_id, runner in enumerate(runners):
# TODO: use FakeTensor here.
runner_arg_names = {
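For context, a minimal sketch (not part of this PR) of how a caller might attach the new inputs_pre_hook to a TuningConfig; the hook name and body are illustrative only, the import path follows the file changed above:

```python
from typing import List

import torch

from tensorrt_llm._torch.autotuner import TuningConfig


def scale_activations(inputs: List[torch.Tensor]) -> List[torch.Tensor]:
    # Hypothetical hook: reshape the randomly generated activations into a
    # distribution closer to what the kernel sees at runtime.
    inputs[0] = inputs[0].abs() + 1.0
    return inputs


# The autotuner applies the hook to the prepared input tensors right before
# profiling the runners and tactics.
tuning_config = TuningConfig(inputs_pre_hook=scale_activations)
```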
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
@@ -29,8 +29,8 @@ class MoERunner(TunableRunner):
runner_dict = dict()
tuning_config = TuningConfig(
dynamic_tensor_specs=(DynamicTensorSpec(
0, 0, get_last_power_of_2_num_tokens_buckets(8192),
lambda x: min(last_positive_power_of_2(x), 8192)), ),
0, 0, get_last_power_of_2_num_tokens_buckets,
last_positive_power_of_2), ),
tune_max_num_tokens=8192,
)

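The change above passes the bucket-generating and bucket-mapping functions themselves instead of a pre-built bucket tuple and a clamping lambda, so the 8192 cap is expressed once via tune_max_num_tokens. As a rough illustration of the assumed semantics of the two helpers (reimplemented here only for explanation; the real implementations live in TensorRT-LLM):

```python
from typing import List


def last_positive_power_of_2(x: int) -> int:
    # Assumed semantics: largest power of two <= x, with a floor of 1.
    p = 1
    while p * 2 <= x:
        p *= 2
    return p


def get_last_power_of_2_num_tokens_buckets(max_num_tokens: int) -> List[int]:
    # Assumed semantics: descending power-of-two token buckets up to max_num_tokens.
    buckets = []
    m = last_positive_power_of_2(max_num_tokens)
    while m >= 1:
        buckets.append(m)
        m //= 2
    return buckets


# Before this change the cap appeared twice in the spec:
#   get_last_power_of_2_num_tokens_buckets(8192)
#   lambda x: min(last_positive_power_of_2(x), 8192)
# Now the functions are passed as-is and tune_max_num_tokens=8192 caps the buckets.
print(get_last_power_of_2_num_tokens_buckets(8))  # [8, 4, 2, 1]
print(last_positive_power_of_2(100))              # 64
```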
44 changes: 37 additions & 7 deletions tests/unittest/_torch/misc/test_autotuner.py
@@ -322,12 +322,24 @@ def test_multiple_dynamic_shapes_cache():
f"Expected 12 cache entries for 3x4 shape combinations, got {len(cache_entries)}"


class GemmRunnerWithTacticConfigs(TunableRunner):
class GemmRunnerComplexTuningConfigs(TunableRunner):
valid_tactic_ids = [-1, 0, 1]
tune_max_num_tokens = 32

def get_valid_tactics(
self,
inputs: List[FakeTensor],
profile: OptimizationProfile,
**kwargs,
) -> List[Dict[str, int]]:
# During the tuning process, verify that the tuning config behaves as expected

assert inputs[0].shape[0] <= self.tune_max_num_tokens, \
f"Input shape {inputs[0].shape[0]} is larger than the max num tokens {self.tune_max_num_tokens}"

assert inputs[0][-1, 0] == inputs[0].shape[0], \
f"Marker value {inputs[0][-1, 0]} does not match the token count {inputs[0].shape[0]}; inputs_pre_hook was not applied correctly"

def get_valid_tactics(self, inputs: List[FakeTensor],
profile: OptimizationProfile,
**kwargs) -> List[Dict[str, int]]:
# The simulated delay is not deterministic, so we need to return specific tactics here
return [{
"block_size": block_size,
@@ -350,12 +362,30 @@ def forward(
assert tactic_id in self.valid_tactic_ids
return [gemm_0, gemm_1, gemm_fallback][tactic_id](*inputs)

@staticmethod
def inputs_pre_hook(inputs: List[torch.Tensor]):
# Zero the activations and stamp the token count into x[-1, 0] so the hook's effect is observable during tuning
x, w = inputs
x_hooked = torch.zeros_like(x)
x_hooked[-1, 0] = x.shape[0]
return [x_hooked, w]


def test_autotuner_tactic_configs():
runner_0 = GemmRunnerWithTacticConfigs()
def test_autotuner_tuning_configs():
runner_0 = GemmRunnerComplexTuningConfigs()
runners = [runner_0]
x, w = torch.randn(64, 64), torch.randn(64, 128)
tuning_config = TuningConfig()
tuning_config = TuningConfig(
dynamic_tensor_specs=(DynamicTensorSpec(
input_idx=0,
dim_idx=0,
gen_tuning_buckets=get_power_of_2_num_tokens_buckets,
map_to_tuning_buckets=next_positive_power_of_2,
), ),
# Test if the number of tuning tokens is clipped to 32
tune_max_num_tokens=GemmRunnerComplexTuningConfigs.tune_max_num_tokens,
inputs_pre_hook=GemmRunnerComplexTuningConfigs.inputs_pre_hook,
)
with autotune():
tuner = AutoTuner.get()
runner, tactic = tuner.choose_one("test_autotuner_tactic_configs",
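To make the new test's intent concrete, here is a standalone illustration (independent of the autotuner) of the marker trick used by GemmRunnerComplexTuningConfigs.inputs_pre_hook: the hook stamps the token count into the tensor, and get_valid_tactics later checks that marker to confirm the hook was applied during tuning.

```python
from typing import List

import torch


def inputs_pre_hook(inputs: List[torch.Tensor]) -> List[torch.Tensor]:
    x, w = inputs
    x_hooked = torch.zeros_like(x)
    # Stamp the token count into a fixed position of x.
    x_hooked[-1, 0] = x.shape[0]
    return [x_hooked, w]


x, w = torch.randn(16, 64), torch.randn(64, 128)
hooked_x, _ = inputs_pre_hook([x, w])
# This is the check get_valid_tactics performs while tuning:
assert hooked_x[-1, 0] == x.shape[0]
```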