1 change: 1 addition & 0 deletions keras_nlp/models/__init__.py
@@ -36,6 +36,7 @@
from keras_nlp.models.bert.bert_preprocessor import BertPreprocessor
from keras_nlp.models.bert.bert_tokenizer import BertTokenizer
from keras_nlp.models.bloom.bloom_backbone import BloomBackbone
from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer
from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
from keras_nlp.models.deberta_v3.deberta_v3_classifier import (
DebertaV3Classifier,
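With this export in place, the tokenizer is importable straight from the top-level models namespace, alongside the backbone. A quick sanity check (a sketch, assuming a keras-nlp build that includes this change):

```python
# Both BLOOM symbols now resolve from keras_nlp.models.
from keras_nlp.models import BloomBackbone, BloomTokenizer

print(BloomBackbone.__name__, BloomTokenizer.__name__)
```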
15 changes: 14 additions & 1 deletion keras_nlp/models/bloom/bloom_backbone.py
@@ -11,11 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.backend import keras
from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding
from keras_nlp.models.backbone import Backbone
from keras_nlp.models.bloom.bloom_decoder import BloomDecoder
from keras_nlp.models.bloom.bloom_presets import backbone_presets
from keras_nlp.utils.python_utils import classproperty


def _bloom_kernel_initializer(stddev=0.02):
@@ -35,7 +39,8 @@ class BloomBackbone(Backbone):
load preset architectures and weights, use the `from_preset()` constructor.

Disclaimer: Pre-trained models are provided on an "as is" basis, without
warranties or conditions of any kind.
warranties or conditions of any kind. The underlying model is provided by a
third party and subject to a separate license, available [here](https://huggingface.co/spaces/bigscience/license).

Args:
vocabulary_size: int. The size of the token vocabulary.
@@ -58,6 +63,10 @@ class BloomBackbone(Backbone):
"padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
}

# Pretrained BLOOM decoder.
model = keras_nlp.models.BloomBackbone.from_preset("bloom_560m_multi")
model(input_data)

# Randomly initialized BLOOM decoder with a custom config.
model = keras_nlp.models.BloomBackbone(
vocabulary_size=10,
@@ -151,3 +160,7 @@ def get_config(self):
}
)
return config

@classproperty
def presets(cls):
return copy.deepcopy(backbone_presets)
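The new `presets` classproperty is what lets `from_preset()` discover the BLOOM checkpoint registered in `bloom_presets.py`. A minimal usage sketch, assuming access to the Kaggle-hosted weights:

```python
import numpy as np

import keras_nlp

# List the registered BLOOM presets (currently just "bloom_560m_multi").
print(list(keras_nlp.models.BloomBackbone.presets.keys()))

# Load the pretrained decoder and run a forward pass, mirroring the
# docstring example above.
input_data = {
    "token_ids": np.ones((1, 12), dtype="int32"),
    "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
}
model = keras_nlp.models.BloomBackbone.from_preset("bloom_560m_multi")
outputs = model(input_data)  # sequence output, shape (1, 12, hidden_dim)
```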
27 changes: 26 additions & 1 deletion keras_nlp/models/bloom/bloom_backbone_test.py
@@ -19,7 +19,7 @@
from keras_nlp.tests.test_case import TestCase


class BloomTest(TestCase):
class BloomBackboneTest(TestCase):
def setUp(self):
self.init_kwargs = {
"vocabulary_size": 10,
@@ -49,3 +49,28 @@ def test_saved_model(self):
init_kwargs=self.init_kwargs,
input_data=self.input_data,
)

@pytest.mark.large
def test_smallest_preset(self):
self.run_preset_test(
cls=BloomBackbone,
preset="bloom_560m_multi",
input_data={
"token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"),
"padding_mask": ops.ones((1, 4), dtype="int32"),
},
expected_output_shape=(1, 4, 1024),
# The forward pass from a preset should be stable!
expected_partial_output=ops.array(
[2.4394186, 1.4131186, -2.7810357, -6.330823, -1.0599766]
),
)

@pytest.mark.extra_large
def test_all_presets(self):
for preset in BloomBackbone.presets:
self.run_preset_test(
cls=BloomBackbone,
preset=preset,
input_data=self.input_data,
)
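`test_smallest_preset` above pins the first few output values so that a regression in the weight conversion is caught. A hedged sketch of the same check outside the test harness (it assumes, as the test suggests, that the comparison is against the leading values of the flattened output):

```python
import numpy as np

import keras_nlp

model = keras_nlp.models.BloomBackbone.from_preset("bloom_560m_multi")
output = model({
    "token_ids": np.array([[101, 1996, 4248, 102]], dtype="int32"),
    "padding_mask": np.ones((1, 4), dtype="int32"),
})
assert output.shape == (1, 4, 1024)

# Pinned values from test_smallest_preset; converting to numpy may need a
# backend-specific call depending on the active Keras backend.
expected = np.array([2.4394186, 1.4131186, -2.7810357, -6.330823, -1.0599766])
np.testing.assert_allclose(np.asarray(output).ravel()[:5], expected, atol=1e-4)
```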
30 changes: 30 additions & 0 deletions keras_nlp/models/bloom/bloom_presets.py
@@ -0,0 +1,30 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BLOOM model preset configurations."""

backbone_presets = {
"bloom_560m_multi": {
"metadata": {
"description": (
"24-layer Bloom model. trained on 45 natural languages and "
"12 programming languages."
),
"params": 816115712,
"official_name": "BLOOM",
"path": "bloom",
"model_card": "https://huggingface.co/bigscience/bloom",
},
"kaggle_handle": "kaggle://keras/bloom/keras/bloom_560m_multi/1",
},
}
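Each preset entry pairs human-readable `metadata` with the `kaggle_handle` that the loader resolves. A small sketch of reading the registry directly:

```python
from keras_nlp.models.bloom.bloom_presets import backbone_presets

preset = backbone_presets["bloom_560m_multi"]
print(preset["metadata"]["description"])
print(f"{preset['metadata']['params']:,} parameters")  # 816,115,712
print(preset["kaggle_handle"])  # kaggle://keras/bloom/keras/bloom_560m_multi/1
```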
123 changes: 123 additions & 0 deletions keras_nlp/models/bloom/bloom_tokenizer.py
@@ -0,0 +1,123 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.models.bloom.bloom_presets import backbone_presets
from keras_nlp.tokenizers.byte_pair_tokenizer import BytePairTokenizer
from keras_nlp.utils.python_utils import classproperty


@keras_nlp_export("keras_nlp.models.BloomTokenizer")
class BloomTokenizer(BytePairTokenizer):
"""A BLOOM tokenizer using Byte-Pair Encoding subword segmentation.

This tokenizer class will tokenize raw strings into integer sequences and
is based on `keras_nlp.tokenizers.BytePairTokenizer`. Unlike the
underlying tokenizer, it checks for all special tokens needed by BLOOM
models and provides a `from_preset()` method to automatically download
a matching vocabulary for a BLOOM preset.

This tokenizer does not provide truncation or padding of inputs.

If input is a batch of strings (rank > 0), the layer will output a
`tf.RaggedTensor` where the last dimension of the output is ragged.

If input is a scalar string (rank == 0), the layer will output a dense
`tf.Tensor` with static shape `[None]`.

Args:
vocabulary: string or dict, maps tokens to integer ids. If it is a
string, it should be the file path to a JSON file.
merges: string or list, contains the merge rules. If it is a string,
it should be the file path to the merge rules. The merge rules file
should have one merge rule per line, and every merge rule contains
merge entities separated by a space.

Examples:

```python
# Unbatched input.
tokenizer = keras_nlp.models.BloomTokenizer.from_preset("bloom_560m_multi")
tokenizer("The quick brown fox jumped.")

# Batched input.
tokenizer(["The quick brown fox jumped.", "The fox slept."])

# Detokenization.
tokenizer.detokenize(tokenizer("The quick brown fox jumped."))

# Custom vocabulary.
vocab = {"<s>": 0, "</s>": 1, "<pad>": 2, "a": 3, "Ġquick": 4, "Ġfox": 5}
merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"]
merges += ["Ġ f", "o x", "Ġf ox"]
tokenizer = keras_nlp.models.BloomTokenizer(vocabulary=vocab, merges=merges)
tokenizer("a quick fox.")
```
"""

def __init__(
self,
vocabulary=None,
merges=None,
**kwargs,
):
self.bos_token = "<s>"
self.eos_token = "</s>"
self.pad_token = "<pad>"

super().__init__(
vocabulary=vocabulary,
merges=merges,
unsplittable_tokens=[
self.bos_token,
self.eos_token,
self.pad_token,
],
**kwargs,
)

def set_vocabulary_and_merges(self, vocabulary, merges):
super().set_vocabulary_and_merges(vocabulary, merges)

if vocabulary is not None:
# Check for necessary special tokens.
for token in [self.bos_token, self.eos_token, self.pad_token]:
if token not in self.get_vocabulary():
raise ValueError(
f"Cannot find token `'{token}'` in the provided "
f"`vocabulary`. Please provide `'{token}'` in "
"your `vocabulary` or use a pretrained `vocabulary` name."
)

self.bos_token_id = self.token_to_id(self.bos_token)
self.eos_token_id = self.token_to_id(self.eos_token)
self.pad_token_id = self.token_to_id(self.pad_token)
else:
self.bos_token_id = None
self.eos_token_id = None
self.pad_token_id = None

@classproperty
def presets(cls):
return copy.deepcopy(backbone_presets)

def get_config(self):
config = super().get_config()
# In the constructor, we pass the list of special tokens to the
# `unsplittable_tokens` arg of the superclass' constructor. Hence, we
# delete it from the config here.
del config["unsplittable_tokens"]
return config
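The constructor registers `<s>`, `</s>`, and `<pad>` as unsplittable tokens, and `set_vocabulary_and_merges()` both validates their presence and caches their ids. A short sketch of that behavior, reusing the custom vocabulary from the docstring example:

```python
import keras_nlp

vocab = {"<s>": 0, "</s>": 1, "<pad>": 2, "a": 3, "Ġquick": 4, "Ġfox": 5}
merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick", "Ġ f", "o x", "Ġf ox"]
tokenizer = keras_nlp.models.BloomTokenizer(vocabulary=vocab, merges=merges)

# Ids of the validated special tokens.
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)

# A vocabulary missing any special token raises a ValueError.
try:
    keras_nlp.models.BloomTokenizer(vocabulary={"a": 0}, merges=[])
except ValueError as e:
    print(e)
```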
63 changes: 63 additions & 0 deletions keras_nlp/models/bloom/bloom_tokenizer_test.py
@@ -0,0 +1,63 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer
from keras_nlp.tests.test_case import TestCase


class BloomTokenizerTest(TestCase):
def setUp(self):
self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
self.vocab += ["<s>", "</s>", "<pad>"]
self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
self.merges += ["Ġai r", "Ġa i", "pla ne"]
self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
self.input_data = [
"<s>airplane at airport<pad>",
"<s> airplane airport<pad>",
]

def test_tokenizer_basics(self):
self.run_preprocessing_layer_test(
cls=BloomTokenizer,
init_kwargs=self.init_kwargs,
input_data=self.input_data,
expected_output=[[6, 1, 3, 4, 2, 5, 8], [6, 2, 3, 2, 5, 8]],
)

def test_errors_missing_special_tokens(self):
with self.assertRaises(ValueError):
BloomTokenizer(vocabulary=["a", "b", "c"], merges=[])

@pytest.mark.large
def test_smallest_preset(self):
self.run_preset_test(
cls=BloomTokenizer,
preset="bloom_560m_multi",
input_data=["The quick brown fox."],
expected_output=[[2175, 23714, 73173, 144252, 17]],
)

@pytest.mark.extra_large
def test_all_presets(self):
for preset in BloomTokenizer.presets:
self.run_preset_test(
cls=BloomTokenizer,
preset=preset,
input_data=self.input_data,
)
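`test_smallest_preset` pins the token ids produced by the released vocabulary; the same check can be reproduced by hand (a sketch, assuming the preset vocabulary downloads successfully):

```python
import keras_nlp

tokenizer = keras_nlp.models.BloomTokenizer.from_preset("bloom_560m_multi")
ids = tokenizer(["The quick brown fox."])
print(ids)  # expected: [[2175, 23714, 73173, 144252, 17]]
print(tokenizer.detokenize(ids))  # should round-trip the input text
```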