Zero-pad speech utterances to bring closer training (with zero padding from batching) and testing conditions

PiperOrigin-RevId: 195565730
tensorflow · lukaszkaiser · May 8, 2018 · May 4, 2018 · May 4, 2018 · May 6, 2018
commit ed0dc39af97c174525e2a7ece09197aec2252b32
diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py
@@ -251,6 +251,7 @@ def hparams(self, defaults, model_hparams):
     p.add_hparam("audio_upper_edge_hertz", 8000.0)
     p.add_hparam("audio_num_mel_bins", 80)
     p.add_hparam("audio_add_delta_deltas", True)
+    p.add_hparam("num_zeropad_frames", 250)
 
     p = defaults
     # p.stop_at_eos = int(False)
@@ -319,8 +320,9 @@ def preprocess_example(self, example, mode, hparams):
 
       # Later models like to flatten the two spatial dims. Instead, we add a
       # unit spatial dim and flatten the frequencies and channels.
-      example["inputs"] = tf.reshape(
-          mel_fbanks, [fbank_size[1], fbank_size[2], fbank_size[3]])
+      example["inputs"] = tf.concat([
+          tf.reshape(mel_fbanks, [fbank_size[1], fbank_size[2], fbank_size[3]]),
+          tf.zeros((p.num_zeropad_frames, fbank_size[2], fbank_size[3]))], 0)
 
     if not p.audio_keep_example_waveforms:
       del example["waveforms"]