Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -3690,8 +3690,14 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
let last_timestamp = null;
let first_timestamp = timestamp_begin;

// Track chunk length for clamping token timestamps (issue #1357)
// This prevents timestamps from exceeding the actual audio duration
// when the model outputs timestamps near the 30s boundary
let current_chunk_len = null;

if ("stride" in output) {
const [chunk_len, stride_left, stride_right] = output.stride;
current_chunk_len = chunk_len;

// Offset the timings to account for the other `model_outputs`.
time_offset -= stride_left;
Expand Down Expand Up @@ -3821,11 +3827,21 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
current_tokens.push(token)

if (returnWordTimestamps) {
let start_time = round(token_timestamps[i] + time_offset, 2);
// Clamp token timestamps to chunk length to prevent exceeding audio duration (issue #1357)
let raw_start = token_timestamps[i];
let raw_end = (i + 1 < token_timestamps.length) ? token_timestamps[i + 1] : null;
if (current_chunk_len !== null) {
raw_start = Math.min(raw_start, current_chunk_len);
if (raw_end !== null) {
raw_end = Math.min(raw_end, current_chunk_len);
}
}

let start_time = round(raw_start + time_offset, 2);

let end_time;
if (i + 1 < token_timestamps.length) {
end_time = round(token_timestamps[i + 1] + time_offset, 2);
if (raw_end !== null) {
end_time = round(raw_end + time_offset, 2);

// Do not allow punctuation-only tokens to have a duration.
// This prevents long pauses from messing up the timestamps.
Expand Down
57 changes: 57 additions & 0 deletions tests/models/whisper/test_tokenization_whisper.js
Original file line number Diff line number Diff line change
Expand Up @@ -774,5 +774,62 @@ export const CUSTOM_TESTS = () => {
},
MAX_EXECUTION_TIME,
);

it(
  "should clamp timestamps to chunk_len when model outputs exceed chunk boundary (issue #1357)",
  async () => {
    // Regression test for issue #1357: when the final chunk is shorter than
    // 30 s, the model may still emit token timestamps near the 30 s boundary
    // (e.g. 29.98). Those raw timestamps must be clamped to the chunk's real
    // length so decoded word timestamps never exceed the audio duration.

    const tokenizer = await WhisperTokenizer.from_pretrained("onnx-community/whisper-tiny.en_timestamped");

    // Simulate a 65 s audio processed with chunk_length_s=30, stride_length_s=5.
    const model_outputs = [
      {
        stride: [30, 0, 5], // full 30 s chunk
        tokens: [50258n, 50364n, 1000n, 50257n],
        token_timestamps: [0, 0, 25.0, 29.98], // near boundary
      },
      {
        stride: [30, 5, 5], // full 30 s chunk
        tokens: [50258n, 50364n, 2000n, 50257n],
        token_timestamps: [0, 0, 15.0, 29.98], // near boundary
      },
      {
        stride: [15, 5, 0], // final chunk is only 15 s long
        tokens: [50258n, 50364n, 3000n, 50257n],
        // Bug case: model outputs a timestamp near 29.98 s even though
        // this chunk covers only 15 s of audio.
        token_timestamps: [0, 0, 10.0, 29.98],
      },
    ];

    const [, options] = tokenizer._decode_asr(model_outputs, {
      return_timestamps: "word",
      time_precision: 0.02,
      force_full_sequences: false,
    });

    // Without clamping, the accumulated time_offset before the last chunk is
    // ~45 s, so 45 + 29.98 ≈ 75 s — beyond the 65 s audio. With clamping the
    // raw 29.98 is capped at chunk_len=15, giving at most ~60 s.
    //
    // Assert unconditionally: if no chunks were produced at all, the test
    // must fail rather than pass vacuously.
    expect(options.chunks).toBeDefined();
    expect(options.chunks.length).toBeGreaterThan(0);

    const maxTimestamp = Math.max(
      // `?? 0` (not `|| 0`): only fall back when the end timestamp is
      // null/undefined, not when it is a legitimate 0.
      ...options.chunks.map((c) => c.timestamp[1] ?? 0),
    );

    // Timestamps must not exceed the audio duration (65 s).
    expect(maxTimestamp).toBeLessThanOrEqual(65);
  },
  MAX_EXECUTION_TIME,
);
});
};