Merged
Changes from 1 commit
Commits (61)
e2b569a
Add phi3
abuelnasr0 Apr 25, 2024
c963f7b
Add phi3 to init
abuelnasr0 Apr 25, 2024
0f30b5f
layer naming and some nits
abuelnasr0 Apr 25, 2024
1d18572
Decoder layers naming
abuelnasr0 Apr 25, 2024
3d24cb2
Remove bias from einsumdense
abuelnasr0 Apr 26, 2024
a98369a
nit fix for layernorm
abuelnasr0 Apr 26, 2024
aecd9e2
Add SuRotary embedding
abuelnasr0 Apr 26, 2024
368e5a2
Remove print()
abuelnasr0 Apr 26, 2024
4267257
Add conversion script
abuelnasr0 Apr 27, 2024
e946503
Nit fix in script
abuelnasr0 Apr 27, 2024
145864b
Add phi3_4k as default preset
abuelnasr0 Apr 27, 2024
c5c78ed
Fix Doc and nit changes
abuelnasr0 Apr 27, 2024
7b0def0
Nit in test
abuelnasr0 Apr 27, 2024
c78c482
Doc fix
abuelnasr0 Apr 29, 2024
cd0381a
Add length check for rope scaling factors
abuelnasr0 Apr 29, 2024
6f9108d
Calculate the mean of the absolute difference in conversion script
abuelnasr0 Apr 29, 2024
8e99e04
Fix typo
abuelnasr0 Apr 29, 2024
b3ca8a3
Add tokenizer and preprocessor
abuelnasr0 May 2, 2024
b53a326
Format fix
abuelnasr0 May 2, 2024
0e37c9a
Fix dtype and device in conversion script
abuelnasr0 May 2, 2024
45ab340
Batch the input
abuelnasr0 May 2, 2024
a459038
Batch the input
abuelnasr0 May 2, 2024
9c38dec
Nit
abuelnasr0 May 2, 2024
07832e5
Add notify for upload
abuelnasr0 May 2, 2024
fc1cf0b
Add causal_lm preprocessor
abuelnasr0 May 2, 2024
aac962d
Add causal lm
abuelnasr0 May 2, 2024
49103f3
Fix format
abuelnasr0 May 2, 2024
49a5495
small fixes
abuelnasr0 May 2, 2024
dcead36
Add phi3 to the new api
abuelnasr0 May 2, 2024
0d990f5
Api gen
abuelnasr0 May 2, 2024
ac8770d
Public named sublayers
abuelnasr0 May 6, 2024
1c2a70e
Public named sublayers in decoder layer
abuelnasr0 May 6, 2024
e63430a
Simplify dropout
abuelnasr0 May 6, 2024
9ac44b6
Fix tokenizer tests
abuelnasr0 May 6, 2024
6355e67
Fix conversion script
abuelnasr0 May 6, 2024
a206480
use preprocessor
abuelnasr0 May 6, 2024
968a220
use preprocessor
abuelnasr0 May 6, 2024
7c17bd1
Fix keras input
abuelnasr0 May 6, 2024
0c165a0
Fix keras model input
abuelnasr0 May 6, 2024
074ea69
Only validate with validate_dtype
abuelnasr0 May 6, 2024
d5c7fab
Only validate with validate_dtype
abuelnasr0 May 6, 2024
0368483
Change seq length
abuelnasr0 May 6, 2024
1eed34b
Change text
abuelnasr0 May 6, 2024
d048f2d
Set pad token id to 0
abuelnasr0 May 7, 2024
3af4096
Default stop at EOS and EOT
abuelnasr0 May 7, 2024
c9f0ad9
Add presets
abuelnasr0 May 7, 2024
5c5e4ef
Add presets and tests to tokenizer
abuelnasr0 May 7, 2024
5225c6f
Add preprocessor preset tests
abuelnasr0 May 7, 2024
b02c0b4
Add preset tests to causal_lm
abuelnasr0 May 7, 2024
4ab0d32
Add backbone preset tests
abuelnasr0 May 7, 2024
b2b7c55
Naming nits
abuelnasr0 May 7, 2024
0dff9f1
Clean SuRotaryEmbedding
abuelnasr0 May 7, 2024
9552750
Lower case file name
abuelnasr0 May 9, 2024
b76f314
Save SuScaled rope factors as python lists
abuelnasr0 May 9, 2024
ad585b0
Rename original_max_seq_length to training_seq_length
abuelnasr0 May 9, 2024
9f10b63
Format
abuelnasr0 May 9, 2024
55e15bf
Remove placeholders tokens from spm
abuelnasr0 May 9, 2024
19fc9ca
Edit examples
abuelnasr0 May 9, 2024
f0a4236
Nit in generate
abuelnasr0 May 10, 2024
c205c20
Change training_seq_length to pretraining_seq_length
abuelnasr0 May 14, 2024
b735170
Update links
mattdangerw May 17, 2024
Change training_seq_length to pretraining_seq_length
abuelnasr0 committed May 14, 2024
commit c205c2070fbf3653981497985b58c1d37535c5b3
8 changes: 4 additions & 4 deletions keras_nlp/src/models/phi3/phi3_attention.py
@@ -30,7 +30,7 @@ def __init__(
         kernel_initializer="glorot_uniform",
         dropout=0,
         max_sequence_length=4096,
-        training_sequence_length=4096,
+        pretraining_sequence_length=4096,
         rope_max_wavelength=10000,
         rope_scaling_type=None,
         rope_scaling_short_factor=None,
@@ -44,7 +44,7 @@ def __init__(
         self.dropout = dropout
 
         self.max_sequence_length = max_sequence_length
-        self.training_sequence_length = training_sequence_length
+        self.pretraining_sequence_length = pretraining_sequence_length
         self.rope_max_wavelength = rope_max_wavelength
         self.rope_scaling_type = rope_scaling_type
         self.rope_scaling_short_factor = rope_scaling_short_factor
@@ -148,7 +148,7 @@ def build(self, inputs_shape):
                 inverese_freq_short_factor=self.rope_scaling_short_factor,
                 inverese_freq_long_factor=self.rope_scaling_long_factor,
                 max_sequence_length=self.max_sequence_length,
-                training_sequence_length=self.training_sequence_length,
+                pretraining_sequence_length=self.pretraining_sequence_length,
                 max_wavelength=self.rope_max_wavelength,
                 dtype=self.dtype_policy,
             )
@@ -249,7 +249,7 @@ def get_config(self):
                ),
                "dropout": self.dropout,
                "max_sequence_length": self.max_sequence_length,
-               "training_sequence_length": self.training_sequence_length,
+               "pretraining_sequence_length": self.pretraining_sequence_length,
                "rope_max_wavelength": self.rope_max_wavelength,
                "rope_scaling_type": self.rope_scaling_type,
                "rope_scaling_short_factor": self.rope_scaling_short_factor,
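Because the rename also flows through `get_config()` (last hunk above), a config dict serialized before this commit still carries the old key. Below is a minimal migration sketch, assuming plain-dict configs; the helper name is hypothetical, not part of keras_nlp.

# Hypothetical helper (not part of keras_nlp): remap the old config key
# to the new one before re-creating the layer from its config dict.
def migrate_phi3_config(config):
    if "training_sequence_length" in config:
        config = dict(config)  # copy so the caller's dict is untouched
        config["pretraining_sequence_length"] = config.pop(
            "training_sequence_length"
        )
    return config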
12 changes: 6 additions & 6 deletions keras_nlp/src/models/phi3/phi3_backbone.py
@@ -55,8 +55,8 @@ class Phi3Backbone(Backbone):
            decoder.
        max_sequence_length (int, optional): The maximum sequence length
            that this model might ever be used with. Defaults to `4096`.
-       training_sequence_length (int, optional): The maximum sequence length
-           that the model was trained with. Defaults to `4096`.
+       pretraining_sequence_length (int, optional): The maximum sequence length
+           that the model was pretrained with. Defaults to `4096`.
        rope_max_wavelength (int, optional): The maximum angular wavelength of
            the sine/cosine curves, for rotary embeddings. Defaults to `10000`.
        rope_scaling_type (str, optional): The type of the rope scaling. Can be
@@ -118,7 +118,7 @@ def __init__(
        layer_norm_epsilon=1e-6,
        dropout=0.0,
        max_sequence_length=4096,
-       training_sequence_length=4096,
+       pretraining_sequence_length=4096,
        rope_max_wavelength=10000,
        rope_scaling_type=None,
        rope_scaling_short_factor=None,
@@ -148,7 +148,7 @@ def __init__(
                kernel_initializer=_phi3_kernel_initializer(stddev=0.02),
                dropout=dropout,
                max_sequence_length=max_sequence_length,
-               training_sequence_length=training_sequence_length,
+               pretraining_sequence_length=pretraining_sequence_length,
                rope_scaling_type=rope_scaling_type,
                rope_scaling_short_factor=rope_scaling_short_factor,
                rope_scaling_long_factor=rope_scaling_long_factor,
@@ -194,7 +194,7 @@ def __init__(
        self.layer_norm_epsilon = layer_norm_epsilon
        self.dropout = dropout
        self.max_sequence_length = max_sequence_length
-       self.training_sequence_length = training_sequence_length
+       self.pretraining_sequence_length = pretraining_sequence_length
        self.rope_max_wavelength = rope_max_wavelength
        self.rope_scaling_type = rope_scaling_type
        self.rope_scaling_short_factor = rope_scaling_short_factor
@@ -213,7 +213,7 @@ def get_config(self):
                "layer_norm_epsilon": self.layer_norm_epsilon,
                "dropout": self.dropout,
                "max_sequence_length": self.max_sequence_length,
-               "training_sequence_length": self.training_sequence_length,
+               "pretraining_sequence_length": self.pretraining_sequence_length,
                "rope_max_wavelength": self.rope_max_wavelength,
                "rope_scaling_type": self.rope_scaling_type,
                "rope_scaling_short_factor": self.rope_scaling_short_factor,
2 changes: 1 addition & 1 deletion keras_nlp/src/models/phi3/phi3_backbone_test.py
@@ -36,7 +36,7 @@ def setUp(self):
            "hidden_dim": 8,
            "intermediate_dim": 12,
            "max_sequence_length": 10,
-           "training_sequence_length": 5,
+           "pretraining_sequence_length": 5,
            "rope_scaling_type": "su",
            "rope_scaling_short_factor": [1.2, 1.4],
            "rope_scaling_long_factor": [0.8, 0.6],
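For orientation, a hedged instantiation sketch built from the test config above. The first four argument names and values are assumptions not visible in this diff (chosen so the head dimension of 4 is consistent with the two rope scaling factors); the remaining arguments mirror the test config verbatim.

from keras_nlp.src.models.phi3.phi3_backbone import Phi3Backbone

# Sketch only: the first four arguments are assumed names/values not shown
# in this diff; the rest are taken from the test config above.
backbone = Phi3Backbone(
    vocabulary_size=10,            # assumed
    num_layers=2,                  # assumed
    num_query_heads=2,             # assumed; hidden_dim / 2 heads = head_dim 4
    num_key_value_heads=1,         # assumed
    hidden_dim=8,
    intermediate_dim=12,
    max_sequence_length=10,
    pretraining_sequence_length=5,
    rope_scaling_type="su",
    rope_scaling_short_factor=[1.2, 1.4],
    rope_scaling_long_factor=[0.8, 0.6],
)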
8 changes: 4 additions & 4 deletions keras_nlp/src/models/phi3/phi3_decoder.py
@@ -38,7 +38,7 @@ def __init__(
        kernel_initializer="glorot_uniform",
        dropout=0,
        max_sequence_length=4096,
-       training_sequence_length=4096,
+       pretraining_sequence_length=4096,
        rope_max_wavelength=10000,
        rope_scaling_type=None,
        rope_scaling_short_factor=None,
@@ -52,7 +52,7 @@ def __init__(
        self.num_key_value_heads = num_key_value_heads
 
        self.max_sequence_length = max_sequence_length
-       self.training_sequence_length = training_sequence_length
+       self.pretraining_sequence_length = pretraining_sequence_length
        self.rope_max_wavelength = rope_max_wavelength
        self.rope_scaling_type = rope_scaling_type
        self.rope_scaling_short_factor = rope_scaling_short_factor
@@ -81,7 +81,7 @@ def build(self, decoder_sequence_shape):
            kernel_initializer=clone_initializer(self.kernel_initializer),
            dropout=self.dropout,
            max_sequence_length=self.max_sequence_length,
-           training_sequence_length=self.training_sequence_length,
+           pretraining_sequence_length=self.pretraining_sequence_length,
            rope_max_wavelength=self.rope_max_wavelength,
            rope_scaling_type=self.rope_scaling_type,
            rope_scaling_short_factor=self.rope_scaling_short_factor,
@@ -249,7 +249,7 @@ def get_config(self):
                ),
                "dropout": self.dropout,
                "max_sequence_length": self.max_sequence_length,
-               "training_sequence_length": self.training_sequence_length,
+               "pretraining_sequence_length": self.pretraining_sequence_length,
                "rope_max_wavelength": self.rope_max_wavelength,
                "rope_scaling_type": self.rope_scaling_type,
                "rope_scaling_short_factor": self.rope_scaling_short_factor,
16 changes: 8 additions & 8 deletions keras_nlp/src/models/phi3/phi3_rotary_embedding.py
@@ -31,8 +31,8 @@ class Phi3SuScaledRotaryEmbedding(RotaryEmbedding):
            `sequence_length` is larger than `original_max_sequence_length`.
        max_sequence_length: int. The maximum sequence length that this
            model might ever be used with.
-       training_sequence_length: int. The maximum sequence length that
-           this model was trained with.
+       pretraining_sequence_length: int. The maximum sequence length that
+           this model was pretrained with.
        max_wavelength: int. The maximum angular wavelength of the sine/cosine
            curves.
 
@@ -53,24 +53,24 @@ def __init__(
        inverese_freq_short_factor,
        inverese_freq_long_factor,
        max_sequence_length=4096,
-       training_sequence_length=4096,
+       pretraining_sequence_length=4096,
        max_wavelength=10000,
        **kwargs
    ):
        super().__init__(max_wavelength=max_wavelength, **kwargs)
        self.max_sequence_length = max_sequence_length
-       self.training_sequence_length = training_sequence_length
+       self.pretraining_sequence_length = pretraining_sequence_length
 
        scaling_factor = (
-           self.max_sequence_length / self.training_sequence_length
+           self.max_sequence_length / self.pretraining_sequence_length
        )
        if scaling_factor <= 1.0:
            self.embedding_scaling_factor = 1.0
        else:
            self.embedding_scaling_factor = math.sqrt(
                1
                + math.log(scaling_factor)
-               / math.log(self.training_sequence_length)
+               / math.log(self.pretraining_sequence_length)
            )
 
        self.inverese_freq_short_factor = inverese_freq_short_factor
@@ -84,7 +84,7 @@ def _compute_cos_sin_embedding(self, inputs, start_index=0, positions=None):
        inverse_freq = self._get_inverse_freq(rotary_dim)
 
        # Multiply inverse_freq by a factor.
-       if ops.shape(inputs)[sequence_axis] > self.training_sequence_length:
+       if ops.shape(inputs)[sequence_axis] > self.pretraining_sequence_length:
            inverse_freq = ops.divide(
                inverse_freq,
                ops.convert_to_tensor(self.inverese_freq_long_factor),
@@ -128,7 +128,7 @@ def get_config(self):
        config.update(
            {
                "max_sequence_length": self.max_sequence_length,
-               "training_sequence_length": self.training_sequence_length,
+               "pretraining_sequence_length": self.pretraining_sequence_length,
                "inverese_freq_short_factor": self.inverese_freq_short_factor,
                "inverese_freq_long_factor": self.inverese_freq_long_factor,
            }
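To make the arithmetic in `__init__` above concrete, here is a standalone sketch of the su-scaling attention factor with illustrative lengths (131072 / 4096 mirrors a long-context Phi-3 setup; the variable names are local to the sketch, not the layer's attributes).

import math

# Standalone sketch of the embedding scaling factor computed above.
max_sequence_length = 131072           # illustrative long-context length
pretraining_sequence_length = 4096     # illustrative pretraining length

scaling_factor = max_sequence_length / pretraining_sequence_length  # 32.0
if scaling_factor <= 1.0:
    # No extension beyond pretraining length: leave embeddings unscaled.
    embedding_scaling_factor = 1.0
else:
    embedding_scaling_factor = math.sqrt(
        1 + math.log(scaling_factor) / math.log(pretraining_sequence_length)
    )
print(round(embedding_scaling_factor, 4))  # 1.1902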