Merged
Changes from 1 commit
Commits
27 commits
62eaf03
Start landing code for Kaggle integration (#1320)
mattdangerw Nov 20, 2023
21fb04c
Switch byte pair tokenizer to save_assets/load_assets (#1322)
mattdangerw Nov 21, 2023
0e3c674
Convert SentencePieceTokenizer and associated models to new assets pa…
nkovela1 Nov 21, 2023
3619a6a
Add tests for Presets workflow, Add Metadata (#1326)
nkovela1 Nov 23, 2023
38806fd
Automatically add the keras framework to kaggle handles (#1331)
mattdangerw Nov 29, 2023
e0d34dc
Fix a failing byte pair tokenizer test (#1336)
mattdangerw Nov 30, 2023
0820d62
Use set comparison for assets (#1335)
mattdangerw Nov 30, 2023
c4b0c3c
Fix whisper tokenizer saving (#1334)
mattdangerw Nov 30, 2023
e3f8d06
Remove special case Bart from_preset (#1333)
mattdangerw Nov 30, 2023
dbb6487
Fix t5 tokenizer presets (#1339)
mattdangerw Nov 30, 2023
6130253
Script to convert presets (#1340)
mattdangerw Nov 30, 2023
814959b
Switch all presets to the new Kaggle format (#1338)
mattdangerw Dec 1, 2023
2aced24
Let kagglehub select latest version (#1342)
mattdangerw Dec 4, 2023
245b7e9
Use the proper title for example (#1346)
Philmod Dec 5, 2023
6ad8a30
Update conversion script (#1347)
mattdangerw Dec 6, 2023
7cc4323
Improve preset error messages (#1349)
mattdangerw Dec 7, 2023
9cc8110
Use subclass checking check_preset_class (#1344)
mattdangerw Dec 7, 2023
4606f32
Add a hacky fix for TF 2.13 and 2.14 weights.h5 loading (#1353)
mattdangerw Dec 7, 2023
9cb5838
Another fix for saving on Keras 2 (#1354)
mattdangerw Dec 7, 2023
039ff45
Switch our presets to their final Kaggle location (#1345)
mattdangerw Dec 7, 2023
9cc3f84
Fix rebase issue in bytepair tokenizer (#1366)
nkovela1 Dec 12, 2023
6f7f9a0
Change encoding to utf-8 to fix Kaggle branch test failure for PyTorc…
sampathweb Dec 13, 2023
ddfca77
Fix GPU test issue with Keras 2 (#1368)
nkovela1 Dec 14, 2023
0e43f09
Add in-place modification of file keys for backwards compatibility (#…
nkovela1 Dec 15, 2023
4d84eb1
Add file renaming logic for modification (#1370)
nkovela1 Dec 16, 2023
29a0ae5
Fix task pre-processor in tasks (#1373)
sampathweb Dec 20, 2023
401e569
Backwards compatible fix for functional model saving (#1378)
mattdangerw Jan 4, 2024
Remove special case Bart from_preset (#1333)
In doing this, we need to remove the error that was previously raised when a user tried to use a `sequence_length` longer than the supported max length of the backbone:

preprocessor = BertPreprocessor.from_preset(
    "bert_base_uncased",
    sequence_length=1500,
)

We used to do this by reaching into the backbone config to read out the max length.

Overall, I think we should avoid cross-cutting dependencies like this; a preprocessor should not reach into a backbone config. It is also valid to want to use a model's vocabulary to preprocess at a sequence length longer than the backbone allows (for example, when you are using a custom model).

Instead, we should probably raise a friendly error message from the backbone (or its position embedding) when a sequence length is too long.
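
For illustration, a minimal sketch of what such a check could look like in a position-embedding layer. The `BoundedPositionEmbedding` name is hypothetical, the sketch assumes the Keras 3 layer API, and it is not the actual keras_nlp implementation:

import keras


class BoundedPositionEmbedding(keras.layers.Layer):
    """Hypothetical position embedding that fails loudly on over-long inputs."""

    def __init__(self, sequence_length, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length

    def build(self, input_shape):
        # One learned vector per supported position.
        self.position_embeddings = self.add_weight(
            name="embeddings",
            shape=(self.sequence_length, input_shape[-1]),
            initializer="glorot_uniform",
        )

    def call(self, inputs):
        # `inputs` are token embeddings of shape (batch, length, dim). For
        # simplicity, this sketch only handles statically known lengths.
        input_length = inputs.shape[1]
        if input_length is not None and input_length > self.sequence_length:
            raise ValueError(
                "Input sequence is longer than this layer supports. Received "
                f"length {input_length}, but `sequence_length` is "
                f"{self.sequence_length}. Use a shorter sequence length in "
                "the preprocessor, or build a backbone with more positions."
            )
        return inputs + self.position_embeddings[:input_length, :]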
mattdangerw committed Jan 4, 2024
commit e3f8d062ebe72f2d9c95812a1a4b578ec863764f
60 changes: 0 additions & 60 deletions keras_nlp/models/bart/bart_preprocessor.py
@@ -233,63 +233,3 @@ def tokenizer_cls(cls):
    @classproperty
    def presets(cls):
        return copy.deepcopy(backbone_presets)

    @classmethod
    def from_preset(
        cls,
        preset,
        **kwargs,
    ):
        # Override base class's `from_preset` to handle `encoder_sequence_length`
        # and `decoder_sequence_length`.
        if not cls.presets:
            raise NotImplementedError(
                "No presets have been created for this class."
            )
        if preset not in cls.presets:
            raise ValueError(
                "`preset` must be one of "
                f"""{", ".join(cls.presets)}. Received: {preset}."""
            )

        tokenizer = cls.tokenizer_cls.from_preset(preset)

        metadata = cls.presets[preset]
        # For task model presets, the backbone config is nested.
        if "backbone" in metadata["config"]:
            backbone_config = metadata["config"]["backbone"]["config"]
        else:
            backbone_config = metadata["config"]

        # Use model's `max_sequence_length` if either `encoder_sequence_length`
        # or `decoder_sequence_length` are unspecified; otherwise check that
        # `encoder_sequence_length`/`decoder_sequence_length` are not too long.
        encoder_sequence_length = kwargs.pop("encoder_sequence_length", None)
        decoder_sequence_length = kwargs.pop("decoder_sequence_length", None)
        max_sequence_length = backbone_config["max_sequence_length"]

        def check_sequence_length(sequence_length, name):
            if sequence_length is not None:
                if sequence_length > max_sequence_length:
                    raise ValueError(
                        f"`{name}` cannot be longer than `{preset}` "
                        f"preset's `max_sequence_length` of {max_sequence_length}. "
                        f"Received: {sequence_length}."
                    )
                return sequence_length
            else:
                return max_sequence_length

        encoder_sequence_length = check_sequence_length(
            encoder_sequence_length, "encoder_sequence_length"
        )
        decoder_sequence_length = check_sequence_length(
            decoder_sequence_length, "decoder_sequence_length"
        )

        return cls(
            tokenizer=tokenizer,
            encoder_sequence_length=encoder_sequence_length,
            decoder_sequence_length=decoder_sequence_length,
            **kwargs,
        )
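
With the override removed, `BartPreprocessor.from_preset` falls back to the shared base-class behavior, and the encoder/decoder sequence lengths pass straight through to the constructor without being checked against the backbone's `max_sequence_length`. A rough usage sketch (the preset name and lengths are only illustrative):

from keras_nlp.models import BartPreprocessor

# No error is raised here even if 2048 exceeds the preset backbone's
# `max_sequence_length`; any failure now surfaces from the backbone itself
# when it sees over-long inputs.
preprocessor = BartPreprocessor.from_preset(
    "bart_base_en",
    encoder_sequence_length=2048,
    decoder_sequence_length=2048,
)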
23 changes: 0 additions & 23 deletions keras_nlp/models/preprocessor.py
@@ -71,31 +71,8 @@ def _legacy_from_preset(
        **kwargs,
    ):
        tokenizer = cls.tokenizer_cls.from_preset(preset)

        metadata = cls.presets[preset]
        # For task model presets, the backbone config is nested.
        if "backbone" in metadata["config"]:
            backbone_config = metadata["config"]["backbone"]["config"]
        else:
            backbone_config = metadata["config"]

        # Use model's `max_sequence_length` if `sequence_length` unspecified;
        # otherwise check that `sequence_length` not too long.
        sequence_length = kwargs.pop("sequence_length", None)
        max_sequence_length = backbone_config["max_sequence_length"]
        if sequence_length is not None:
            if sequence_length > max_sequence_length:
                raise ValueError(
                    f"`sequence_length` cannot be longer than `{preset}` "
                    f"preset's `max_sequence_length` of {max_sequence_length}. "
                    f"Received: {sequence_length}."
                )
        else:
            sequence_length = max_sequence_length

        return cls(
            tokenizer=tokenizer,
            sequence_length=sequence_length,
            **kwargs,
        )
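
After these deletions, the base `_legacy_from_preset` reduces to roughly the following (reconstructed from the surrounding diff context; a sketch, not the exact file contents):

    @classmethod
    def _legacy_from_preset(
        cls,
        preset,
        **kwargs,
    ):
        tokenizer = cls.tokenizer_cls.from_preset(preset)
        return cls(tokenizer=tokenizer, **kwargs)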
