12 changes: 11 additions & 1 deletion keras_nlp/models/t5/t5_backbone.py
@@ -11,12 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.backend import keras
from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding
from keras_nlp.models.backbone import Backbone
from keras_nlp.models.t5.t5_layer_norm import T5LayerNorm
from keras_nlp.models.t5.t5_presets import backbone_presets
from keras_nlp.models.t5.t5_transformer_layer import T5TransformerLayer
from keras_nlp.utils.python_utils import classproperty

@@ -50,6 +52,9 @@ class T5Backbone(Backbone):
hidden_dim: int. The hidden size of the Transformer layers.
intermediate_dim: int. The output dimension of the first Dense layer in
a two-layer feedforward network for each Transformer layer.
key_value_dim: int. The dimension of each head of the key/value
projections in the multi-head attention layers. Defaults to
hidden_dim / num_heads.
dropout: float. Dropout probability for the Transformer layers.
activation: activation function (or activation string name). The
activation to be used in the inner dense blocks of the
@@ -73,6 +78,7 @@ def __init__(
num_heads,
hidden_dim,
intermediate_dim,
key_value_dim=None,
dropout=0.1,
activation="gelu",
use_gated_activation=True,
@@ -122,6 +128,7 @@ def __init__(
is_decoder=False,
hidden_dim=hidden_dim,
intermediate_dim=intermediate_dim,
key_value_dim=key_value_dim or hidden_dim // num_heads,
dropout=dropout,
activation=activation,
layer_norm_epsilon=layer_norm_epsilon,
@@ -165,6 +172,7 @@ def __init__(
is_decoder=True,
hidden_dim=hidden_dim,
intermediate_dim=intermediate_dim,
key_value_dim=key_value_dim or hidden_dim // num_heads,
dropout=dropout,
activation=activation,
layer_norm_epsilon=layer_norm_epsilon,
@@ -213,6 +221,7 @@ def __init__(
self.num_layers = num_layers
self.num_heads = num_heads
self.activation = keras.activations.get(activation)
self.key_value_dim = key_value_dim
self.dropout = dropout
self.layer_norm_epsilon = layer_norm_epsilon
self.tie_embedding_weights = tie_embedding_weights
@@ -228,6 +237,7 @@ def get_config(self):
"num_layers": self.num_layers,
"num_heads": self.num_heads,
"activation": keras.activations.serialize(self.activation),
"key_value_dim": self.key_value_dim,
"dropout": self.dropout,
"layer_norm_epsilon": self.layer_norm_epsilon,
"tie_embedding_weights": self.tie_embedding_weights,
@@ -237,4 +247,4 @@ def get_config(self):

@classproperty
def presets(cls):
return {}
return copy.deepcopy(backbone_presets)
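
Taken together, the backbone changes above add an optional key_value_dim argument that falls back to hidden_dim // num_heads when left unset. A minimal usage sketch, assuming the public keras_nlp.models.T5Backbone export and illustrative hyperparameters (not a real preset):

from keras_nlp.models import T5Backbone

# Explicit per-head key/value projection size.
backbone = T5Backbone(
    vocabulary_size=32128,
    num_layers=6,
    num_heads=8,
    hidden_dim=512,
    intermediate_dim=2048,
    key_value_dim=64,  # decoupled from hidden_dim // num_heads
)

# Omitting key_value_dim keeps the previous behavior: each transformer
# layer receives hidden_dim // num_heads (512 // 8 = 64 here).
backbone_default = T5Backbone(
    vocabulary_size=32128,
    num_layers=6,
    num_heads=8,
    hidden_dim=512,
    intermediate_dim=2048,
)
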
3 changes: 2 additions & 1 deletion keras_nlp/models/t5/t5_multi_head_attention.py
@@ -25,6 +25,7 @@ def __init__(
self,
is_decoder,
hidden_dim,
key_value_dim,
num_heads,
dropout,
use_relative_attention_bias=False,
@@ -33,7 +34,7 @@
super().__init__(**kwargs)
self.is_decoder = is_decoder
self.hidden_dim = hidden_dim
self.key_value_dim = hidden_dim // num_heads
self.key_value_dim = key_value_dim
self.num_heads = num_heads
self.use_relative_attention_bias = use_relative_attention_bias

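
For reference, T5MultiHeadAttention now takes the per-head key/value size as an explicit constructor argument instead of deriving it from hidden_dim. A hedged instantiation sketch based only on the signature visible in this diff (this is an internal layer, so the import path and arguments may change):

from keras_nlp.models.t5.t5_multi_head_attention import T5MultiHeadAttention

# key_value_dim is no longer forced to hidden_dim // num_heads.
attention = T5MultiHeadAttention(
    is_decoder=False,
    hidden_dim=512,
    key_value_dim=64,
    num_heads=8,
    dropout=0.1,
    use_relative_attention_bias=True,
)
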
163 changes: 163 additions & 0 deletions keras_nlp/models/t5/t5_presets.py
@@ -0,0 +1,163 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""XLM-RoBERTa model preset configurations."""

backbone_presets = {
"t5_small_multi": {
"metadata": {
"description": (
"8-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 6,
"num_heads": 8,
"hidden_dim": 512,
"intermediate_dim": 2048,
"key_value_dim": 64,
"dropout": 0.1,
"activation": "relu",
"use_gated_activation": False,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"t5_base_multi": {
"metadata": {
"description": (
"12-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 12,
"num_heads": 12,
"hidden_dim": 768,
"intermediate_dim": 3072,
"dropout": 0.1,
"activation": "relu",
"use_gated_activation": False,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"t5_large_multi": {
"metadata": {
"description": (
"24-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 24,
"num_heads": 16,
"hidden_dim": 1024,
"intermediate_dim": 4096,
"dropout": 0.1,
"activation": "relu",
"use_gated_activation": False,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"flan_small_multi": {
"metadata": {
"description": (
"8-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 8,
"num_heads": 6,
"hidden_dim": 512,
"intermediate_dim": 1024,
"key_value_dim": 64,
"dropout": 0.1,
"activation": "gelu",
"use_gated_activation": True,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"flan_base_multi": {
"metadata": {
"description": (
"12-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 12,
"num_heads": 12,
"hidden_dim": 768,
"intermediate_dim": 2048,
"dropout": 0.1,
"activation": "gelu",
"use_gated_activation": True,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"flan_large_multi": {
"metadata": {
"description": (
"24-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 24,
"num_heads": 16,
"hidden_dim": 1024,
"intermediate_dim": 2816,
"dropout": 0.1,
"activation": "gelu",
"use_gated_activation": True,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
}
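
With backbone_presets defined and T5Backbone.presets returning a deep copy of it, the new configurations should be reachable through the usual KerasNLP preset API. A hedged usage sketch, assuming the preset assets are published under the names above:

import keras_nlp

# List the registered T5 presets.
print(keras_nlp.models.T5Backbone.presets.keys())

# Build a backbone from one of the new configurations.
backbone = keras_nlp.models.T5Backbone.from_preset("t5_small_multi")
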
24 changes: 16 additions & 8 deletions keras_nlp/models/t5/t5_transformer_layer.py
@@ -27,6 +27,7 @@ def __init__(
is_decoder,
hidden_dim,
intermediate_dim,
key_value_dim,
dropout,
activation,
layer_norm_epsilon,
@@ -40,10 +41,11 @@
self.use_gated_activation = use_gated_activation

self.self_attention = T5MultiHeadAttention(
is_decoder,
hidden_dim,
num_heads,
dropout,
is_decoder=is_decoder,
hidden_dim=hidden_dim,
key_value_dim=key_value_dim,
num_heads=num_heads,
dropout=dropout,
use_relative_attention_bias=use_relative_attention_bias,
name="self_attention",
)
@@ -52,16 +54,22 @@

if self.is_decoder:
self.cross_attention = T5MultiHeadAttention(
is_decoder,
hidden_dim,
num_heads,
dropout,
is_decoder=is_decoder,
hidden_dim=hidden_dim,
key_value_dim=key_value_dim,
num_heads=num_heads,
dropout=dropout,
use_relative_attention_bias=False,
name="cross_attention",
)
self.cross_attention_layer_norm = T5LayerNorm(layer_norm_epsilon)
self.cross_attention_dropout = keras.layers.Dropout(dropout)

if activation == "gelu":
activation = keras.activations.get("keras_nlp>gelu_approximate")
else:
activation = keras.activations.get(activation)

self.input_projector = keras.layers.Dense(
intermediate_dim,
use_bias=False,
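
The transformer layer also resolves the string "gelu" to KerasNLP's registered approximate GELU before building its feedforward projections, so presets that declare activation="gelu" (the Flan configs above) pick up the tanh-approximated variant. A small standalone sketch of the same resolution pattern, for illustration only:

from keras_nlp.backend import keras

def resolve_activation(activation):
    # "gelu" maps to the approximate (tanh-based) GELU registered by
    # KerasNLP; every other identifier goes through the normal Keras lookup.
    if activation == "gelu":
        return keras.activations.get("keras_nlp>gelu_approximate")
    return keras.activations.get(activation)

print(resolve_activation("gelu"))  # approximate GELU callable
print(resolve_activation("relu"))  # standard Keras relu
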