1 change: 1 addition & 0 deletions keras_nlp/models/__init__.py
@@ -36,6 +36,7 @@
from keras_nlp.models.bert.bert_preprocessor import BertPreprocessor
from keras_nlp.models.bert.bert_tokenizer import BertTokenizer
from keras_nlp.models.bloom.bloom_backbone import BloomBackbone
from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer
from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
from keras_nlp.models.deberta_v3.deberta_v3_classifier import (
DebertaV3Classifier,
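With this export in place, the tokenizer is importable straight from the top-level models namespace, alongside the backbone. A quick sanity check (a sketch, assuming a keras-nlp build that includes this change):

```python
# Both BLOOM symbols now resolve from keras_nlp.models.
from keras_nlp.models import BloomBackbone, BloomTokenizer

print(BloomBackbone.__name__, BloomTokenizer.__name__)
```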
15 changes: 14 additions & 1 deletion keras_nlp/models/bloom/bloom_backbone.py
@@ -11,11 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.backend import keras
from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding
from keras_nlp.models.backbone import Backbone
from keras_nlp.models.bloom.bloom_decoder import BloomDecoder
from keras_nlp.models.bloom.bloom_presets import backbone_presets
from keras_nlp.utils.python_utils import classproperty


def _bloom_kernel_initializer(stddev=0.02):
@@ -35,7 +39,8 @@ class BloomBackbone(Backbone):
load preset architectures and weights, use the `from_preset()` constructor.

Disclaimer: Pre-trained models are provided on an "as is" basis, without
warranties or conditions of any kind.
warranties or conditions of any kind. The underlying model is provided by a
third party and subject to a separate license, available [here](https://huggingface.co/spaces/bigscience/license).

Args:
vocabulary_size: int. The size of the token vocabulary.
@@ -58,6 +63,10 @@ class BloomBackbone(Backbone):
"padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
}

# Pretrained BLOOM decoder.
model = keras_nlp.models.BloomBackbone.from_preset("bloom_560m_multi")
model(input_data)

# Randomly initialized BLOOM decoder with a custom config.
model = keras_nlp.models.BloomBackbone(
vocabulary_size=10,
@@ -151,3 +160,7 @@ def get_config(self):
}
)
return config

@classproperty
def presets(cls):
return copy.deepcopy(backbone_presets)
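The new `presets` classproperty is what lets `from_preset()` discover the BLOOM checkpoint registered in `bloom_presets.py`. A minimal usage sketch, assuming access to the Kaggle-hosted weights:

```python
import numpy as np

import keras_nlp

# List the registered BLOOM presets (currently just "bloom_560m_multi").
print(list(keras_nlp.models.BloomBackbone.presets.keys()))

# Load the pretrained decoder and run a forward pass, mirroring the
# docstring example above.
input_data = {
    "token_ids": np.ones((1, 12), dtype="int32"),
    "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
}
model = keras_nlp.models.BloomBackbone.from_preset("bloom_560m_multi")
outputs = model(input_data)  # sequence output, shape (1, 12, hidden_dim)
```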
27 changes: 26 additions & 1 deletion keras_nlp/models/bloom/bloom_backbone_test.py
@@ -19,7 +19,7 @@
from keras_nlp.tests.test_case import TestCase


class BloomTest(TestCase):
class BloomBackboneTest(TestCase):
def setUp(self):
self.init_kwargs = {
"vocabulary_size": 10,
@@ -49,3 +49,28 @@ def test_saved_model(self):
init_kwargs=self.init_kwargs,
input_data=self.input_data,
)

@pytest.mark.large
def test_smallest_preset(self):
self.run_preset_test(
cls=BloomBackbone,
preset="bloom_560m_multi",
input_data={
"token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"),
"padding_mask": ops.ones((1, 4), dtype="int32"),
},
expected_output_shape=(1, 4, 1024),
# The forward pass from a preset should be stable!
expected_partial_output=ops.array(
[2.4394186, 1.4131186, -2.7810357, -6.330823, -1.0599766]
),
)

@pytest.mark.extra_large
def test_all_presets(self):
for preset in BloomBackbone.presets:
self.run_preset_test(
cls=BloomBackbone,
preset=preset,
input_data=self.input_data,
)
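`test_smallest_preset` above pins the first few output values so that a regression in the weight conversion is caught. A hedged sketch of the same check outside the test harness (it assumes, as the test suggests, that the comparison is against the leading values of the flattened output):

```python
import numpy as np

import keras_nlp

model = keras_nlp.models.BloomBackbone.from_preset("bloom_560m_multi")
output = model({
    "token_ids": np.array([[101, 1996, 4248, 102]], dtype="int32"),
    "padding_mask": np.ones((1, 4), dtype="int32"),
})
assert output.shape == (1, 4, 1024)

# Pinned values from test_smallest_preset; converting to numpy may need a
# backend-specific call depending on the active Keras backend.
expected = np.array([2.4394186, 1.4131186, -2.7810357, -6.330823, -1.0599766])
np.testing.assert_allclose(np.asarray(output).ravel()[:5], expected, atol=1e-4)
```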
30 changes: 30 additions & 0 deletions keras_nlp/models/bloom/bloom_presets.py
@@ -0,0 +1,30 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BLOOM model preset configurations."""

backbone_presets = {
"bloom_560m_multi": {
"metadata": {
"description": (
"24-layer Bloom model. trained on 45 natural languages and "
"12 programming languages."
),
"params": 816115712,
"official_name": "BLOOM",
"path": "bloom",
"model_card": "https://huggingface.co/bigscience/bloom",
},
"kaggle_handle": "kaggle://keras/bloom/keras/bloom_560m_multi/1",
},
}
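Each preset entry pairs human-readable `metadata` with the `kaggle_handle` that the loader resolves. A small sketch of reading the registry directly:

```python
from keras_nlp.models.bloom.bloom_presets import backbone_presets

preset = backbone_presets["bloom_560m_multi"]
print(preset["metadata"]["description"])
print(f"{preset['metadata']['params']:,} parameters")  # 816,115,712
print(preset["kaggle_handle"])  # kaggle://keras/bloom/keras/bloom_560m_multi/1
```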
123 changes: 123 additions & 0 deletions keras_nlp/models/bloom/bloom_tokenizer.py
@@ -0,0 +1,123 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.models.bloom.bloom_presets import backbone_presets
from keras_nlp.tokenizers.byte_pair_tokenizer import BytePairTokenizer
from keras_nlp.utils.python_utils import classproperty


@keras_nlp_export("keras_nlp.models.BloomTokenizer")
class BloomTokenizer(BytePairTokenizer):
"""A BLOOM tokenizer using Byte-Pair Encoding subword segmentation.

This tokenizer class will tokenize raw strings into integer sequences and
is based on `keras_nlp.tokenizers.BytePairTokenizer`. Unlike the
underlying tokenizer, it checks for all special tokens needed by BLOOM
models and provides a `from_preset()` method to automatically download
a matching vocabulary for a BLOOM preset.

This tokenizer does not provide truncation or padding of inputs.

If input is a batch of strings (rank > 0), the layer will output a
`tf.RaggedTensor` where the last dimension of the output is ragged.

If input is a scalar string (rank == 0), the layer will output a dense
`tf.Tensor` with static shape `[None]`.

Args:
vocabulary: string or dict, maps tokens to integer ids. If it is a
string, it should be the file path to a JSON file.
merges: string or list, contains the merge rules. If it is a string,
it should be the file path to the merge rules. The merge rules file
should have one merge rule per line, and every merge rule contains
merge entities separated by a space.

Examples:

```python
# Unbatched input.
tokenizer = keras_nlp.models.BloomTokenizer.from_preset("bloom_560m_multi")
tokenizer("The quick brown fox jumped.")

# Batched input.
tokenizer(["The quick brown fox jumped.", "The fox slept."])

# Detokenization.
tokenizer.detokenize(tokenizer("The quick brown fox jumped."))

# Custom vocabulary.
vocab = {"<s>": 0, "</s>": 1, "<pad>": 2, "a": 3, "Ġquick": 4, "Ġfox": 5}
merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"]
merges += ["Ġ f", "o x", "Ġf ox"]
tokenizer = keras_nlp.models.BloomTokenizer(vocabulary=vocab, merges=merges)
tokenizer("a quick fox.")
```
"""

def __init__(
self,
vocabulary=None,
merges=None,
**kwargs,
):
self.bos_token = "<s>"
self.eos_token = "</s>"
self.pad_token = "<pad>"

super().__init__(
vocabulary=vocabulary,
merges=merges,
unsplittable_tokens=[
self.bos_token,
self.eos_token,
self.pad_token,
],
**kwargs,
)

def set_vocabulary_and_merges(self, vocabulary, merges):
super().set_vocabulary_and_merges(vocabulary, merges)

if vocabulary is not None:
# Check for necessary special tokens.
for token in [self.bos_token, self.eos_token, self.pad_token]:
if token not in self.get_vocabulary():
raise ValueError(
f"Cannot find token `'{token}'` in the provided "
f"`vocabulary`. Please provide `'{token}'` in "
"your `vocabulary` or use a pretrained `vocabulary` name."
)

self.bos_token_id = self.token_to_id(self.bos_token)
self.eos_token_id = self.token_to_id(self.eos_token)
self.pad_token_id = self.token_to_id(self.pad_token)
else:
self.bos_token_id = None
self.eos_token_id = None
self.pad_token_id = None

@classproperty
def presets(cls):
return copy.deepcopy(backbone_presets)

def get_config(self):
config = super().get_config()
# In the constructor, we pass the list of special tokens to the
# `unsplittable_tokens` arg of the superclass' constructor. Hence, we
# delete it from the config here.
del config["unsplittable_tokens"]
return config
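The constructor registers `<s>`, `</s>`, and `<pad>` as unsplittable tokens, and `set_vocabulary_and_merges()` both validates their presence and caches their ids. A short sketch of that behavior, reusing the custom vocabulary from the docstring example:

```python
import keras_nlp

vocab = {"<s>": 0, "</s>": 1, "<pad>": 2, "a": 3, "Ġquick": 4, "Ġfox": 5}
merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick", "Ġ f", "o x", "Ġf ox"]
tokenizer = keras_nlp.models.BloomTokenizer(vocabulary=vocab, merges=merges)

# Ids of the validated special tokens.
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)

# A vocabulary missing any special token raises a ValueError.
try:
    keras_nlp.models.BloomTokenizer(vocabulary={"a": 0}, merges=[])
except ValueError as e:
    print(e)
```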
63 changes: 63 additions & 0 deletions keras_nlp/models/bloom/bloom_tokenizer_test.py
@@ -0,0 +1,63 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer
from keras_nlp.tests.test_case import TestCase


class BloomTokenizerTest(TestCase):
def setUp(self):
self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
self.vocab += ["<s>", "</s>", "<pad>"]
self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
self.merges += ["Ġai r", "Ġa i", "pla ne"]
self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
self.input_data = [
"<s>airplane at airport<pad>",
"<s> airplane airport<pad>",
]

def test_tokenizer_basics(self):
self.run_preprocessing_layer_test(
cls=BloomTokenizer,
init_kwargs=self.init_kwargs,
input_data=self.input_data,
expected_output=[[6, 1, 3, 4, 2, 5, 8], [6, 2, 3, 2, 5, 8]],
)

def test_errors_missing_special_tokens(self):
with self.assertRaises(ValueError):
BloomTokenizer(vocabulary=["a", "b", "c"], merges=[])

@pytest.mark.large
def test_smallest_preset(self):
self.run_preset_test(
cls=BloomTokenizer,
preset="bloom_560m_multi",
input_data=["The quick brown fox."],
expected_output=[[2175, 23714, 73173, 144252, 17]],
)

@pytest.mark.extra_large
def test_all_presets(self):
for preset in BloomTokenizer.presets:
self.run_preset_test(
cls=BloomTokenizer,
preset=preset,
input_data=self.input_data,
)
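`test_smallest_preset` pins the token ids produced by the released vocabulary; the same check can be reproduced by hand (a sketch, assuming the preset vocabulary downloads successfully):

```python
import keras_nlp

tokenizer = keras_nlp.models.BloomTokenizer.from_preset("bloom_560m_multi")
ids = tokenizer(["The quick brown fox."])
print(ids)  # expected: [[2175, 23714, 73173, 144252, 17]]
print(tokenizer.detokenize(ids))  # should round-trip the input text
```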