Skip to content

Commit abfdfbf

Browse files
committed
support for more accurate AR calculation
Signed-off-by: binghanc <176802681+binghanc@users.noreply.github.com>
1 parent 0f77fec commit abfdfbf

File tree

1 file changed

+73
-27
lines changed

1 file changed

+73
-27
lines changed

tensorrt_llm/serve/scripts/benchmark_serving.py

Lines changed: 73 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,13 @@ class BenchmarkMetrics:
8080
std_e2el_ms: float
8181
percentiles_e2el_ms: list[tuple[float, float]]
8282
tput_user: list[float]
83-
avg_decoded_tokens_per_iter: float
83+
# Statistics for avg_decoded_tokens_per_iter across all requests
84+
mean_avg_decoded_tokens_per_iter: float
85+
min_avg_decoded_tokens_per_iter: float
86+
max_avg_decoded_tokens_per_iter: float
87+
median_avg_decoded_tokens_per_iter: float
88+
std_avg_decoded_tokens_per_iter: float
89+
percentiles_avg_decoded_tokens_per_iter: list[tuple[float, float]]
8490

8591

8692
async def get_request(
@@ -144,7 +150,7 @@ def calculate_metrics(
144150
ttfts: list[float] = []
145151
e2els: list[float] = []
146152
tput_user: list[float] = []
147-
latest_avg_decoded_tokens_per_iter: float = 0.0
153+
avg_decoded_tokens_per_iter_list: list[float] = []
148154
error_counts: dict[str, int] = {}
149155
for i in range(len(outputs)):
150156
if outputs[i].exception_type:
@@ -177,11 +183,11 @@ def calculate_metrics(
177183
tput_user.append(output_len / (outputs[i].latency))
178184
completed += 1
179185

180-
# Track the latest avg_decoded_tokens_per_iter if available
186+
# Collect avg_decoded_tokens_per_iter for all requests
181187
if hasattr(outputs[i], 'avg_decoded_tokens_per_iter'
182188
) and outputs[i].avg_decoded_tokens_per_iter is not None:
183-
latest_avg_decoded_tokens_per_iter = outputs[
184-
i].avg_decoded_tokens_per_iter
189+
avg_decoded_tokens_per_iter_list.append(
190+
outputs[i].avg_decoded_tokens_per_iter)
185191
else:
186192
actual_output_lens.append(0)
187193

@@ -247,7 +253,13 @@ def calculate_metrics(
247253
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
248254
for p in selected_percentiles],
249255
tput_user=np.mean(tput_user or 0),
250-
avg_decoded_tokens_per_iter=latest_avg_decoded_tokens_per_iter,
256+
mean_avg_decoded_tokens_per_iter=np.mean(avg_decoded_tokens_per_iter_list or 0),
257+
min_avg_decoded_tokens_per_iter=np.min(avg_decoded_tokens_per_iter_list) if avg_decoded_tokens_per_iter_list else 0.0,
258+
max_avg_decoded_tokens_per_iter=np.max(avg_decoded_tokens_per_iter_list) if avg_decoded_tokens_per_iter_list else 0.0,
259+
median_avg_decoded_tokens_per_iter=np.median(avg_decoded_tokens_per_iter_list or 0),
260+
std_avg_decoded_tokens_per_iter=np.std(avg_decoded_tokens_per_iter_list or 0),
261+
percentiles_avg_decoded_tokens_per_iter=[(p, np.percentile(avg_decoded_tokens_per_iter_list or 0, p))
262+
for p in selected_percentiles],
251263
)
252264
return metrics, actual_output_lens
253265

@@ -466,10 +478,6 @@ async def limited_request_func(request_func_input, streaming, pbar,
466478
print("{:<40} {:<10.2f}".format("User throughput (tok/s):",
467479
metrics.tput_user))
468480

469-
# Print last avg_decoded_tokens_per_iter value if available
470-
if metrics.avg_decoded_tokens_per_iter > 0.0:
471-
print("{:<40} {:<10.2f}".format("Avg Decoded Tokens per Iter:",
472-
metrics.avg_decoded_tokens_per_iter))
473481
if len(outputs) - metrics.completed > 0:
474482
print(
475483
f"=======================!FAILED REQUESTS!=======================")
@@ -488,7 +496,14 @@ async def limited_request_func(request_func_input, streaming, pbar,
488496
"output_throughput": metrics.output_throughput,
489497
"total_token_throughput": metrics.total_token_throughput,
490498
"user_throughput": metrics.tput_user,
491-
"avg_decoded_tokens_per_iter": metrics.avg_decoded_tokens_per_iter,
499+
"avg_decoded_tokens_per_iter": {
500+
"mean": metrics.mean_avg_decoded_tokens_per_iter,
501+
"min": metrics.min_avg_decoded_tokens_per_iter,
502+
"max": metrics.max_avg_decoded_tokens_per_iter,
503+
"median": metrics.median_avg_decoded_tokens_per_iter,
504+
"std": metrics.std_avg_decoded_tokens_per_iter,
505+
"percentiles": {f"p{p}": v for p, v in metrics.percentiles_avg_decoded_tokens_per_iter}
506+
},
492507
"input_lens": [output.prompt_len for output in outputs],
493508
"output_lens": actual_output_lens,
494509
"ttfts": [output.ttft for output in outputs],
@@ -504,30 +519,61 @@ def process_one_metric(
504519
metric_name: str,
505520
# E.g., "Time to First Token"
506521
metric_header: str,
522+
# E.g., "ms" or "" for no unit
523+
unit_suffix: str = "ms",
507524
):
508-
# This function prints and adds statistics of the specified
509-
# metric.
510-
if metric_attribute_name not in selected_percentile_metrics:
525+
# This function prints and adds statistics of the specified metric.
526+
# Skip if not in selected metrics (except avg_decoded_tokens_per_iter which has its own condition)
527+
if (metric_attribute_name not in selected_percentile_metrics and metric_attribute_name != "avg_decoded_tokens_per_iter"):
511528
return
529+
530+
# Build attribute suffix (e.g., "_ms" or "")
531+
attr_suffix = f"_{unit_suffix}" if unit_suffix else ""
532+
# Build display unit (e.g., " (ms)" or "")
533+
display_unit = f" ({unit_suffix})" if unit_suffix else ""
534+
512535
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
513536
print("{:<40} {:<10.2f}".format(
514-
f"Mean {metric_name} (ms):",
515-
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
537+
f"Mean {metric_name}{display_unit}:",
538+
getattr(metrics, f"mean_{metric_attribute_name}{attr_suffix}")))
516539
print("{:<40} {:<10.2f}".format(
517-
f"Median {metric_name} (ms):",
518-
getattr(metrics, f"median_{metric_attribute_name}_ms")))
519-
result[f"mean_{metric_attribute_name}_ms"] = getattr(
520-
metrics, f"mean_{metric_attribute_name}_ms")
521-
result[f"median_{metric_attribute_name}_ms"] = getattr(
522-
metrics, f"median_{metric_attribute_name}_ms")
523-
result[f"std_{metric_attribute_name}_ms"] = getattr(
524-
metrics, f"std_{metric_attribute_name}_ms")
540+
f"Median {metric_name}{display_unit}:",
541+
getattr(metrics, f"median_{metric_attribute_name}{attr_suffix}")))
542+
if hasattr(metrics, f"std_{metric_attribute_name}{attr_suffix}"):
543+
print("{:<40} {:<10.2f}".format(
544+
f"Std Dev {metric_name}{display_unit}:",
545+
getattr(metrics, f"std_{metric_attribute_name}{attr_suffix}")))
546+
result[f"std_{metric_attribute_name}{attr_suffix}"] = getattr(
547+
metrics, f"std_{metric_attribute_name}{attr_suffix}")
548+
if hasattr(metrics, f"min_{metric_attribute_name}{attr_suffix}"):
549+
print("{:<40} {:<10.2f}".format(
550+
f"Min {metric_name}{display_unit}:",
551+
getattr(metrics, f"min_{metric_attribute_name}{attr_suffix}")))
552+
result[f"min_{metric_attribute_name}{attr_suffix}"] = getattr(
553+
metrics, f"min_{metric_attribute_name}{attr_suffix}")
554+
if hasattr(metrics, f"max_{metric_attribute_name}{attr_suffix}"):
555+
print("{:<40} {:<10.2f}".format(
556+
f"Max {metric_name}{display_unit}:",
557+
getattr(metrics, f"max_{metric_attribute_name}{attr_suffix}")))
558+
result[f"max_{metric_attribute_name}{attr_suffix}"] = getattr(
559+
metrics, f"max_{metric_attribute_name}{attr_suffix}")
560+
561+
result[f"mean_{metric_attribute_name}{attr_suffix}"] = getattr(
562+
metrics, f"mean_{metric_attribute_name}{attr_suffix}")
563+
result[f"median_{metric_attribute_name}{attr_suffix}"] = getattr(
564+
metrics, f"median_{metric_attribute_name}{attr_suffix}")
565+
525566
for p, value in getattr(metrics,
526-
f"percentiles_{metric_attribute_name}_ms"):
567+
f"percentiles_{metric_attribute_name}{attr_suffix}"):
527568
p_word = str(int(p)) if int(p) == p else str(p)
528-
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
569+
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}{display_unit}:",
529570
value))
530-
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
571+
result[f"p{p_word}_{metric_attribute_name}{attr_suffix}"] = value
572+
573+
# Print avg_decoded_tokens_per_iter statistics if available
574+
if metrics.mean_avg_decoded_tokens_per_iter > 0.0:
575+
process_one_metric("avg_decoded_tokens_per_iter", "Avg Decoded Tokens per Iter",
576+
"Avg Decoded Tokens per Iter", unit_suffix="")
531577

532578
process_one_metric("ttft", "TTFT", "Time to First Token")
533579
process_one_metric("tpot", "TPOT",

0 commit comments

Comments (0)