diff --git a/src/tokenizers.js b/src/tokenizers.js index 51c9d23dc..2aaa601db 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -3690,8 +3690,14 @@ export class WhisperTokenizer extends PreTrainedTokenizer { let last_timestamp = null; let first_timestamp = timestamp_begin; + // Track chunk length for clamping token timestamps (issue #1357) + // This prevents timestamps from exceeding the actual audio duration + // when the model outputs timestamps near the 30s boundary + let current_chunk_len = null; + if ("stride" in output) { const [chunk_len, stride_left, stride_right] = output.stride; + current_chunk_len = chunk_len; // Offset the timings to account for the other `model_outputs`. time_offset -= stride_left; @@ -3821,11 +3827,21 @@ export class WhisperTokenizer extends PreTrainedTokenizer { current_tokens.push(token) if (returnWordTimestamps) { - let start_time = round(token_timestamps[i] + time_offset, 2); + // Clamp token timestamps to chunk length to prevent exceeding audio duration (issue #1357) + let raw_start = token_timestamps[i]; + let raw_end = (i + 1 < token_timestamps.length) ? token_timestamps[i + 1] : null; + if (current_chunk_len !== null) { + raw_start = Math.min(raw_start, current_chunk_len); + if (raw_end !== null) { + raw_end = Math.min(raw_end, current_chunk_len); + } + } + + let start_time = round(raw_start + time_offset, 2); let end_time; - if (i + 1 < token_timestamps.length) { - end_time = round(token_timestamps[i + 1] + time_offset, 2); + if (raw_end !== null) { + end_time = round(raw_end + time_offset, 2); // Do not allow punctuation-only tokens to have a duration. // This prevents long pauses from messing up the timestamps. diff --git a/tests/models/whisper/test_tokenization_whisper.js b/tests/models/whisper/test_tokenization_whisper.js index bb3b1c685..0dc9f35a3 100644 --- a/tests/models/whisper/test_tokenization_whisper.js +++ b/tests/models/whisper/test_tokenization_whisper.js @@ -774,5 +774,62 @@ export const CUSTOM_TESTS = () => { }, MAX_EXECUTION_TIME, ); + + it( + "should clamp timestamps to chunk_len when model outputs exceed chunk boundary (issue #1357)", + async () => { + // This test verifies the fix for issue #1357 + // When the last chunk is shorter than 30s but model outputs timestamps near 29.98s, + // the timestamps should be clamped to the actual chunk_len + + const tokenizer = await WhisperTokenizer.from_pretrained("onnx-community/whisper-tiny.en_timestamped"); + + // Simulate a 65s audio with chunk_length_s=30, stride_length_s=5 + const model_outputs = [ + { + stride: [30, 0, 5], // Full 30s chunk + tokens: [50258n, 50364n, 1000n, 50257n], + token_timestamps: [0, 0, 25.0, 29.98], // Near boundary + }, + { + stride: [30, 5, 5], // Full 30s chunk + tokens: [50258n, 50364n, 2000n, 50257n], + token_timestamps: [0, 0, 15.0, 29.98], // Near boundary + }, + { + stride: [15, 5, 0], // Only 15s chunk! But model might output 29.98s + tokens: [50258n, 50364n, 3000n, 50257n], + // BUG CASE: model outputs timestamp near 29.98s even though chunk is only 15s + token_timestamps: [0, 0, 10.0, 29.98], + }, + ]; + + const [text, options] = tokenizer._decode_asr(model_outputs, { + return_timestamps: "word", + time_precision: 0.02, + force_full_sequences: false, + }); + + // Without the fix, the last timestamp would be: + // time_offset = (30-5) + (30-5) - 5 = 45s (after both chunks) + // 29.98 + 45 = ~75s (WRONG - exceeds 65s audio!) + // + // With the fix, raw timestamp 29.98 is clamped to chunk_len=15: + // 15 + 45 = 60s (within audio duration) + + if (options.chunks && options.chunks.length > 0) { + const maxTimestamp = Math.max( + ...options.chunks.map((c) => c.timestamp[1] || 0) + ); + + console.log("Issue #1357 test - Max timestamp:", maxTimestamp); + + // The key assertion: timestamps should not exceed audio duration + // With fix, max should be around 60s (45 + 15), not 75s (45 + 29.98) + expect(maxTimestamp).toBeLessThanOrEqual(65); + } + }, + MAX_EXECUTION_TIME, + ); }); };