Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ATTRIBUTIONS-Python.md
Original file line number Diff line number Diff line change
Expand Up @@ -25250,7 +25250,7 @@ License: `NVIDIA Proprietary Software`
- `Homepage`: https://developer.nvidia.com/cusparselt


## nvidia-cutlass-dsl (4.2.1)
## nvidia-cutlass-dsl (4.3.0)

### Licenses
License: `None`
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ triton==3.5.0; platform_machine == "x86_64"
tiktoken
blobfile
openai-harmony==0.0.4
nvidia-cutlass-dsl==4.3.0.dev0; python_version >= "3.10"
nvidia-cutlass-dsl==4.3.0; python_version >= "3.10"
plotly
numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing
partial_json_parser
Original file line number Diff line number Diff line change
Expand Up @@ -1552,6 +1552,8 @@ def kernel(
epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs
)

tTR_rC = cute.make_rmem_tensor(tTR_rAcc.shape, self.out_dtype)

copy_atom_r2s = sm100_utils.get_smem_store_op(
self.gemm_output_layout, self.out_dtype, self.acc_dtype, tiled_copy_t2r
)
Expand Down Expand Up @@ -1641,8 +1643,6 @@ def kernel(
layout = cute.make_layout(shape=(cute.size(tTR_rAcc),), stride=(1,))
loop_size = cute.size(tTR_rAcc)

rOut_epi = cute.make_rmem_tensor(layout, self.out_dtype)

for subtile_idx in cutlass.range(subtile_cnt):
#
# Load accumulator from tensor memory buffer to register
Expand All @@ -1657,7 +1657,8 @@ def kernel(
# Apply router scale to the entire row (broadcast scalar to vector)
acc_vec_finalized = token_scale * acc_vec_scaled

rOut_epi.store(acc_vec_finalized.to(self.out_dtype))
tTR_rC.store(acc_vec_finalized.to(self.out_dtype))
rOut_epi = cute.make_tensor(tTR_rC.iterator, layout)

if permuted_row < tile_mn_limit:
coord_n = mma_tile_coord_mnl[1] * self.cta_tile_shape_mnk[
Expand Down
1 change: 1 addition & 0 deletions tensorrt_llm/commands/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,

profiler.start("trtllm init")
if backend == 'pytorch':
llm_args.pop("build_config", None)
llm = PyTorchLLM(**llm_args)
elif backend == 'tensorrt':
llm = LLM(**llm_args)
Expand Down