12 changes: 11 additions & 1 deletion keras_nlp/models/t5/t5_backbone.py
@@ -11,12 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.backend import keras
from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding
from keras_nlp.models.backbone import Backbone
from keras_nlp.models.t5.t5_layer_norm import T5LayerNorm
from keras_nlp.models.t5.t5_presets import backbone_presets
from keras_nlp.models.t5.t5_transformer_layer import T5TransformerLayer
from keras_nlp.utils.python_utils import classproperty

@@ -50,6 +52,9 @@ class T5Backbone(Backbone):
hidden_dim: int. The hidden size of the Transformer layers.
intermediate_dim: int. The output dimension of the first Dense layer in
a two-layer feedforward network for each Transformer layer.
key_value_dim: int. The dimension of each head of the key/value
projections in the multi-head attention layers. Defaults to
hidden_dim / num_heads.
dropout: float. Dropout probability for the Transformer layers.
activation: activation function (or activation string name). The
activation to be used in the inner dense blocks of the
@@ -73,6 +78,7 @@ def __init__(
num_heads,
hidden_dim,
intermediate_dim,
key_value_dim=None,
dropout=0.1,
activation="gelu",
use_gated_activation=True,
@@ -122,6 +128,7 @@ def __init__(
is_decoder=False,
hidden_dim=hidden_dim,
intermediate_dim=intermediate_dim,
key_value_dim=key_value_dim or hidden_dim // num_heads,
dropout=dropout,
activation=activation,
layer_norm_epsilon=layer_norm_epsilon,
@@ -165,6 +172,7 @@ def __init__(
is_decoder=True,
hidden_dim=hidden_dim,
intermediate_dim=intermediate_dim,
key_value_dim=key_value_dim or hidden_dim // num_heads,
dropout=dropout,
activation=activation,
layer_norm_epsilon=layer_norm_epsilon,
@@ -213,6 +221,7 @@ def __init__(
self.num_layers = num_layers
self.num_heads = num_heads
self.activation = keras.activations.get(activation)
self.key_value_dim = key_value_dim
self.dropout = dropout
self.layer_norm_epsilon = layer_norm_epsilon
self.tie_embedding_weights = tie_embedding_weights
@@ -228,6 +237,7 @@ def get_config(self):
"num_layers": self.num_layers,
"num_heads": self.num_heads,
"activation": keras.activations.serialize(self.activation),
"key_value_dim": self.key_value_dim,
"dropout": self.dropout,
"layer_norm_epsilon": self.layer_norm_epsilon,
"tie_embedding_weights": self.tie_embedding_weights,
@@ -237,4 +247,4 @@ def get_config(self):

@classproperty
def presets(cls):
return {}
return copy.deepcopy(backbone_presets)
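
Taken together, the backbone changes above add an optional key_value_dim argument that falls back to hidden_dim // num_heads when left unset. A minimal usage sketch, assuming the public keras_nlp.models.T5Backbone export and illustrative hyperparameters (not a real preset):

from keras_nlp.models import T5Backbone

# Explicit per-head key/value projection size.
backbone = T5Backbone(
    vocabulary_size=32128,
    num_layers=6,
    num_heads=8,
    hidden_dim=512,
    intermediate_dim=2048,
    key_value_dim=64,  # decoupled from hidden_dim // num_heads
)

# Omitting key_value_dim keeps the previous behavior: each transformer
# layer receives hidden_dim // num_heads (512 // 8 = 64 here).
backbone_default = T5Backbone(
    vocabulary_size=32128,
    num_layers=6,
    num_heads=8,
    hidden_dim=512,
    intermediate_dim=2048,
)
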
3 changes: 2 additions & 1 deletion keras_nlp/models/t5/t5_multi_head_attention.py
@@ -25,6 +25,7 @@ def __init__(
self,
is_decoder,
hidden_dim,
key_value_dim,
num_heads,
dropout,
use_relative_attention_bias=False,
@@ -33,7 +34,7 @@
super().__init__(**kwargs)
self.is_decoder = is_decoder
self.hidden_dim = hidden_dim
self.key_value_dim = hidden_dim // num_heads
self.key_value_dim = key_value_dim
self.num_heads = num_heads
self.use_relative_attention_bias = use_relative_attention_bias

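
For reference, T5MultiHeadAttention now takes the per-head key/value size as an explicit constructor argument instead of deriving it from hidden_dim. A hedged instantiation sketch based only on the signature visible in this diff (this is an internal layer, so the import path and arguments may change):

from keras_nlp.models.t5.t5_multi_head_attention import T5MultiHeadAttention

# key_value_dim is no longer forced to hidden_dim // num_heads.
attention = T5MultiHeadAttention(
    is_decoder=False,
    hidden_dim=512,
    key_value_dim=64,
    num_heads=8,
    dropout=0.1,
    use_relative_attention_bias=True,
)
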
163 changes: 163 additions & 0 deletions keras_nlp/models/t5/t5_presets.py
@@ -0,0 +1,163 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""XLM-RoBERTa model preset configurations."""

backbone_presets = {
"t5_small_multi": {
"metadata": {
"description": (
"8-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 6,
"num_heads": 8,
"hidden_dim": 512,
"intermediate_dim": 2048,
"key_value_dim": 64,
"dropout": 0.1,
"activation": "relu",
"use_gated_activation": False,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"t5_base_multi": {
"metadata": {
"description": (
"12-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 12,
"num_heads": 12,
"hidden_dim": 768,
"intermediate_dim": 3072,
"dropout": 0.1,
"activation": "relu",
"use_gated_activation": False,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"t5_large_multi": {
"metadata": {
"description": (
"24-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 24,
"num_heads": 16,
"hidden_dim": 1024,
"intermediate_dim": 4096,
"dropout": 0.1,
"activation": "relu",
"use_gated_activation": False,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"flan_small_multi": {
"metadata": {
"description": (
"8-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 8,
"num_heads": 6,
"hidden_dim": 512,
"intermediate_dim": 1024,
"key_value_dim": 64,
"dropout": 0.1,
"activation": "gelu",
"use_gated_activation": True,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"flan_base_multi": {
"metadata": {
"description": (
"12-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 12,
"num_heads": 12,
"hidden_dim": 768,
"intermediate_dim": 2048,
"dropout": 0.1,
"activation": "gelu",
"use_gated_activation": True,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
"flan_large_multi": {
"metadata": {
"description": (
"24-layer T5 model. Trained on the Colossal Clean Crawled "
"Corpus (C4)."
),
"params": 0,
"official_name": "T5",
"path": "t5",
"model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md",
},
"config": {
"vocabulary_size": 32128,
"num_layers": 24,
"num_heads": 16,
"hidden_dim": 1024,
"intermediate_dim": 2816,
"dropout": 0.1,
"activation": "gelu",
"use_gated_activation": True,
"layer_norm_epsilon": 1e-06,
},
"preprocessor_config": {},
},
}
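
With backbone_presets defined and T5Backbone.presets returning a deep copy of it, the new configurations should be reachable through the usual KerasNLP preset API. A hedged usage sketch, assuming the preset assets are published under the names above:

import keras_nlp

# List the registered T5 presets.
print(keras_nlp.models.T5Backbone.presets.keys())

# Build a backbone from one of the new configurations.
backbone = keras_nlp.models.T5Backbone.from_preset("t5_small_multi")
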
24 changes: 16 additions & 8 deletions keras_nlp/models/t5/t5_transformer_layer.py
@@ -27,6 +27,7 @@ def __init__(
is_decoder,
hidden_dim,
intermediate_dim,
key_value_dim,
dropout,
activation,
layer_norm_epsilon,
@@ -40,10 +41,11 @@
self.use_gated_activation = use_gated_activation

self.self_attention = T5MultiHeadAttention(
is_decoder,
hidden_dim,
num_heads,
dropout,
is_decoder=is_decoder,
hidden_dim=hidden_dim,
key_value_dim=key_value_dim,
num_heads=num_heads,
dropout=dropout,
use_relative_attention_bias=use_relative_attention_bias,
name="self_attention",
)
@@ -52,16 +54,22 @@

if self.is_decoder:
self.cross_attention = T5MultiHeadAttention(
is_decoder,
hidden_dim,
num_heads,
dropout,
is_decoder=is_decoder,
hidden_dim=hidden_dim,
key_value_dim=key_value_dim,
num_heads=num_heads,
dropout=dropout,
use_relative_attention_bias=False,
name="cross_attention",
)
self.cross_attention_layer_norm = T5LayerNorm(layer_norm_epsilon)
self.cross_attention_dropout = keras.layers.Dropout(dropout)

if activation == "gelu":
activation = keras.activations.get("keras_nlp>gelu_approximate")
else:
activation = keras.activations.get(activation)

self.input_projector = keras.layers.Dense(
intermediate_dim,
use_bias=False,
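
The transformer layer also resolves the string "gelu" to KerasNLP's registered approximate GELU before building its feedforward projections, so presets that declare activation="gelu" (the Flan configs above) pick up the tanh-approximated variant. A small standalone sketch of the same resolution pattern, for illustration only:

from keras_nlp.backend import keras

def resolve_activation(activation):
    # "gelu" maps to the approximate (tanh-based) GELU registered by
    # KerasNLP; every other identifier goes through the normal Keras lookup.
    if activation == "gelu":
        return keras.activations.get("keras_nlp>gelu_approximate")
    return keras.activations.get(activation)

print(resolve_activation("gelu"))  # approximate GELU callable
print(resolve_activation("relu"))  # standard Keras relu
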