diff --git a/keras_nlp/models/t5/t5_backbone.py b/keras_nlp/models/t5/t5_backbone.py
index ace329c3f6..9d64edd3bc 100644
--- a/keras_nlp/models/t5/t5_backbone.py
+++ b/keras_nlp/models/t5/t5_backbone.py
@@ -58,8 +58,7 @@ class T5Backbone(Backbone):
         dropout: float. Dropout probability for the Transformer layers.
         activation: activation function (or activation string name). The
             activation to be used in the inner dense blocks of the
-            Transformer layers. The original T5 architecture used `"relu"`,
-            but more recent versions use `"gelu"`. Defaults to `"gelu"`.
+            Transformer layers. Defaults to `"relu"`.
         use_gated_activation: boolean. Whether to use activation gating in
             the inner dense blocks of the Transformer layers. The original
             T5 architecture didn't use gating, but more
@@ -80,7 +79,7 @@ def __init__(
         intermediate_dim,
         key_value_dim=None,
         dropout=0.1,
-        activation="gelu",
+        activation="relu",
         use_gated_activation=True,
         layer_norm_epsilon=1e-06,
         tie_embedding_weights=False,
diff --git a/keras_nlp/models/t5/t5_presets.py b/keras_nlp/models/t5/t5_presets.py
index 5bba204051..cbdde0391a 100644
--- a/keras_nlp/models/t5/t5_presets.py
+++ b/keras_nlp/models/t5/t5_presets.py
@@ -106,7 +106,7 @@
             "intermediate_dim": 1024,
             "key_value_dim": 64,
             "dropout": 0.1,
-            "activation": "gelu",
+            "activation": "keras_nlp>gelu_approximate",
             "use_gated_activation": True,
             "layer_norm_epsilon": 1e-06,
         },
@@ -130,7 +130,7 @@
             "hidden_dim": 768,
             "intermediate_dim": 2048,
             "dropout": 0.1,
-            "activation": "gelu",
+            "activation": "keras_nlp>gelu_approximate",
             "use_gated_activation": True,
             "layer_norm_epsilon": 1e-06,
         },
@@ -154,7 +154,7 @@
             "hidden_dim": 1024,
             "intermediate_dim": 2816,
             "dropout": 0.1,
-            "activation": "gelu",
+            "activation": "keras_nlp>gelu_approximate",
             "use_gated_activation": True,
             "layer_norm_epsilon": 1e-06,
         },
diff --git a/keras_nlp/models/t5/t5_transformer_layer.py b/keras_nlp/models/t5/t5_transformer_layer.py
index 655019b451..27b4c9892c 100644
--- a/keras_nlp/models/t5/t5_transformer_layer.py
+++ b/keras_nlp/models/t5/t5_transformer_layer.py
@@ -65,11 +65,6 @@ def __init__(
             self.cross_attention_layer_norm = T5LayerNorm(layer_norm_epsilon)
             self.cross_attention_dropout = keras.layers.Dropout(dropout)
 
-        if activation == "gelu":
-            activation = keras.activations.get("keras_nlp>gelu_approximate")
-        else:
-            activation = keras.activations.get(activation)
-
         self.input_projector = keras.layers.Dense(
             intermediate_dim,
             use_bias=False,
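
Below is a minimal usage sketch (not part of the diff) of what this change implies for callers: with the implicit `"gelu"` remapping removed from the transformer layer, the approximate GELU has to be requested explicitly via the registered `"keras_nlp>gelu_approximate"` string, which is exactly what the updated flan presets now declare. The hyperparameter values are hypothetical and chosen only for illustration.

```python
import keras_nlp

# Hypothetical (small) configuration, for illustration only.
backbone = keras_nlp.models.T5Backbone(
    vocabulary_size=32128,
    num_layers=2,
    num_heads=4,
    hidden_dim=256,
    intermediate_dim=512,
    # Plain "relu" is now the default; the approximate GELU must be
    # requested explicitly, as the flan_* presets do after this change.
    activation="keras_nlp>gelu_approximate",
)
```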