From f35ff3c2480ebb282d25ad25a00a13ae15a261f2 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 01:33:55 +0200 Subject: [PATCH 01/22] Add Tokenizer and new conversion script --- keras_nlp/models/bloom/bloom_presets.py | 30 ++++ keras_nlp/models/bloom/bloom_tokenizer.py | 123 +++++++++++++++ .../convert_bloom_checkpoints.py | 143 +++++++++++++----- 3 files changed, 260 insertions(+), 36 deletions(-) create mode 100644 keras_nlp/models/bloom/bloom_presets.py create mode 100644 keras_nlp/models/bloom/bloom_tokenizer.py diff --git a/keras_nlp/models/bloom/bloom_presets.py b/keras_nlp/models/bloom/bloom_presets.py new file mode 100644 index 0000000000..31d0600516 --- /dev/null +++ b/keras_nlp/models/bloom/bloom_presets.py @@ -0,0 +1,30 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT model preset configurations.""" + +backbone_presets = { + "bloom_tiny_multi": { + "metadata": { + "description": ( + "2-layer BERT model where all input is lowercased. " + "Trained on English Wikipedia + BooksCorpus." + ), + "params": 4385920, + "official_name": "BERT", + "path": "bert", + "model_card": "https://github.com/google-research/bert/blob/master/README.md", + }, + "kaggle_handle": "kaggle://keras/bert/keras/bert_tiny_en_uncased/2", + }, +} diff --git a/keras_nlp/models/bloom/bloom_tokenizer.py b/keras_nlp/models/bloom/bloom_tokenizer.py new file mode 100644 index 0000000000..7a2d7a340a --- /dev/null +++ b/keras_nlp/models/bloom/bloom_tokenizer.py @@ -0,0 +1,123 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from keras_nlp.api_export import keras_nlp_export +from keras_nlp.models.gpt2.gpt2_presets import backbone_presets +from keras_nlp.tokenizers.byte_pair_tokenizer import BytePairTokenizer +from keras_nlp.utils.python_utils import classproperty + + +@keras_nlp_export("keras_nlp.models.BloomTokenizer") +class BloomTokenizer(BytePairTokenizer): + """A BLOOM tokenizer using Byte-Pair Encoding subword segmentation. + + This tokenizer class will tokenize raw strings into integer sequences and + is based on `keras_nlp.tokenizers.BytePairTokenizer`. Unlike the + underlying tokenizer, it will check for all special tokens needed by BLOOM + models and provides a `from_preset()` method to automatically download + a matching vocabulary for a BLOOM preset. + + This tokenizer does not provide truncation or padding of inputs. + + If input is a batch of strings (rank > 0), the layer will output a + `tf.RaggedTensor` where the last dimension of the output is ragged. + + If input is a scalar string (rank == 0), the layer will output a dense + `tf.Tensor` with static shape `[None]`. + + Args: + vocabulary: string or dict, maps token to integer ids. If it is a + string, it should be the file path to a json file. + merges: string or list, contains the merge rule. If it is a string, + it should be the file path to merge rules. The merge rule file + should have one merge rule per line. Every merge rule contains + merge entities separated by a space. + + Examples: + + ```python + # Unbatched input. + tokenizer = keras_nlp.models.BloomTokenizer.from_preset("gpt2_base_en") + tokenizer("The quick brown fox jumped.") + + # Batched input. + tokenizer(["The quick brown fox jumped.", "The fox slept."]) + + # Detokenization. + tokenizer.detokenize(tokenizer("The quick brown fox jumped.")) + + # Custom vocabulary. + vocab = {"<|endoftext|>": 0, "a": 4, "Ġquick": 5, "Ġfox": 6} + merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] + merges += ["Ġ f", "o x", "Ġf ox"] + tokenizer = keras_nlp.models.BloomTokenizer(vocabulary=vocab, merges=merges) + tokenizer("a quick fox.") + ``` + """ + + def __init__( + self, + vocabulary=None, + merges=None, + **kwargs, + ): + self.bos_token = "" + self.eos_token = "" + self.pad_token = "" + + super().__init__( + vocabulary=vocabulary, + merges=merges, + unsplittable_tokens=[ + self.bos_token, + self.eos_token, + self.pad_token, + ], + **kwargs, + ) + + def set_vocabulary_and_merges(self, vocabulary, merges): + super().set_vocabulary_and_merges(vocabulary, merges) + + if vocabulary is not None: + # Check for necessary special tokens. + for token in [self.bos_token, self.eos_token, self.pad_token]: + if token not in self.get_vocabulary(): + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in " + "your `vocabulary` or use a pretrained `vocabulary` name." + ) + + self.bos_token_id = self.token_to_id(self.bos_token) + self.eos_token_id = self.token_to_id(self.eos_token) + self.pad_token_id = self.token_to_id(self.pad_token) + else: + self.bos_token_id = None + self.eos_token_id = None + self.pad_token_id = None + + @classproperty + def presets(cls): + return copy.deepcopy(backbone_presets) + + def get_config(self): + config = super().get_config() + # In the constructor, we pass the list of special tokens to the + # `unsplittable_tokens` arg of the superclass' constructor. Hence, we + # delete it from the config here. + del config["unsplittable_tokens"] + return config diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index 1fc895c54c..89eb266ee0 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -12,14 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import torch -import transformers -from absl import app -from absl import flags -from checkpoint_conversion_utils import get_md5_checksum +import json +import os -from keras_nlp.models.bloom.bloom_backbone import BloomBackbone +os.environ["KERAS_BACKEND"] = "torch" +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + +import numpy as np # noqa: E402 +import torch # noqa: E402 +import transformers # noqa: E402 +from absl import app # noqa: E402 +from absl import flags # noqa: E402 +from checkpoint_conversion_utils import get_md5_checksum # noqa: E402 +from huggingface_hub import snapshot_download # noqa: E402 + +import keras_nlp # noqa: E402 +from keras_nlp.models.bloom.bloom_backbone import BloomBackbone # noqa: E402 +from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer # noqa: E402 FLAGS = flags.FLAGS @@ -32,31 +41,58 @@ "bloom_extra_large": "bigscience/bloom", } +EXTRACT_DIR = "./model" + + flags.DEFINE_string( "preset", None, f'Must be one of {",".join(PRESET_MAP.keys())}' ) flags.mark_flag_as_required("preset") -def convert_checkpoints(hf_model): +def download_hf_model(hf_model_name): + hf_model_dir = snapshot_download( + repo_id=hf_model_name, + allow_patterns=["*.json", "*.bin"], + ignore_patterns=["onnx/*"], + local_dir=EXTRACT_DIR, + ) + + return hf_model_dir + + +def convert_model(hf_model): # get huggingface model configuration. hf_config = hf_model.config.to_dict() - cfg = {} - cfg["vocabulary_size"] = hf_config["vocab_size"] - cfg["num_layers"] = hf_config["n_layer"] - cfg["num_heads"] = hf_config["n_head"] - cfg["hidden_dim"] = hf_config["hidden_size"] - cfg["intermediate_dim"] = hf_config["hidden_size"] * 4 - cfg["dropout"] = hf_config["hidden_dropout"] - cfg["layer_norm_epsilon"] = hf_config["layer_norm_epsilon"] - - hidden_dim = cfg["hidden_dim"] - num_heads = cfg["num_heads"] - head_dim = hidden_dim // num_heads + kwargs = {} + kwargs["vocabulary_size"] = hf_config["vocab_size"] + kwargs["num_layers"] = hf_config["n_layer"] + kwargs["num_heads"] = hf_config["n_head"] + kwargs["hidden_dim"] = hf_config["hidden_size"] + kwargs["intermediate_dim"] = hf_config["hidden_size"] * 4 + kwargs["dropout"] = hf_config["hidden_dropout"] + kwargs["layer_norm_epsilon"] = hf_config["layer_norm_epsilon"] + + return BloomBackbone(**kwargs) + + +def convert_tokenizer(hf_model_dir): + tokenizer_file_path = os.path.join(hf_model_dir, "tokenizer.json") + with open(tokenizer_file_path) as tokenizer_file: + hf_tokenizer = json.load(tokenizer_file) + + vocab = hf_tokenizer["model"]["vocab"] + merges = hf_tokenizer["model"]["merges"] - # Intialize Bloom model with the weights. - keras_model = BloomBackbone(**cfg) + return BloomTokenizer(vocabulary=vocab, merges=merges) + + +def convert_checkpoints(keras_model, hf_model): + hidden_dim = keras_model.hidden_dim + num_heads = keras_model.num_heads + head_dim = hidden_dim // num_heads + num_layers = keras_model.num_layers # get huggingface model weights. hf_wts = hf_model.state_dict() @@ -78,7 +114,7 @@ def convert_checkpoints(hf_model): keras_model.get_layer("final_layernorm").beta.assign(hf_wts["ln_f.bias"]) # Decoder layers. - for i in range(cfg["num_layers"]): + for i in range(num_layers): decoder_layer = keras_model.get_layer(f"transformer_layer_{i}") # LayrNorm. decoder_layer._pre_attention_layernorm.gamma.assign( @@ -148,7 +184,14 @@ def convert_checkpoints(hf_model): return keras_model -def check_output(keras_model, hf_model): +def validate_output( + hf_model, + keras_model, + hf_tokenizer, + keras_tokenizer, +): + print("⏺ checking Backbone output") + hf_model_input = { "input_ids": torch.tensor([[59414, 15, 2670, 35433, 632, 207595]]), "attention_mask": torch.tensor([[1, 1, 1, 1, 1, 1]]), @@ -166,29 +209,57 @@ def check_output(keras_model, hf_model): keras_model_outputs = keras_model.predict(keras_model_input) # Comparing the outputs. - print("KerasNLP output:", keras_model_outputs[0, 0, :10]) - print("HF output:", hf_model_outputs[0, 0, :10]) - print("Difference:", np.mean(keras_model_outputs - hf_model_outputs)) + print("🔶 KerasNLP output:", keras_model_outputs[0, 0, :10]) + print("🔶 HF output:", hf_model_outputs[0, 0, :10]) + print("🔶 Difference:", np.mean(keras_model_outputs - hf_model_outputs)) + + print("⏺ checking tokenizer output") + + input_str = ["the quick brown fox ran, galloped and jumped."] + token_ids_keras = keras_tokenizer(input_str) + token_ids_hf = hf_tokenizer(input_str) + + print("🔶 KerasNLP output:", token_ids_keras) + print("🔶 HF output:", token_ids_hf) # Show the MD5 checksum of the model weights. print("Model md5sum: ", get_md5_checksum(f"./{FLAGS.preset}.weights.h5")) def main(_): + preset = FLAGS.preset + assert ( - FLAGS.preset in PRESET_MAP.keys() - ), f'Invalid preset {FLAGS.preset}. Must be one of {",".join(PRESET_MAP.keys())}' + preset in PRESET_MAP.keys() + ), f'Invalid preset {preset}. Must be one of {",".join(PRESET_MAP.keys())}' + + print(f"✅ Coverting {preset}") - hf_model_name = PRESET_MAP[FLAGS.preset] + hf_model_name = PRESET_MAP[preset] + hf_model_dir = download_hf_model(hf_model_name) + print("✅ Huggingface model downloaded from hub") - print("\n-> Loading HF model.") - hf_model = transformers.AutoModel.from_pretrained(hf_model_name) + hf_model = transformers.BloomModel.from_pretrained(hf_model_dir) + hf_tokenizer = transformers.BloomTokenizerFast.from_pretrained(hf_model_dir) + print("✅ Huggingface model loaded") - print("\n-> Converting model checkpoint.") - keras_model = convert_checkpoints(hf_model) + keras_model = convert_model(hf_model) + keras_tokenizer = convert_tokenizer(hf_model_dir) + print("✅ Keras model loaded") - print("\n-> Checking keras model output.") - check_output(keras_model, hf_model) + validate_output( + hf_model, + keras_model, + hf_tokenizer, + keras_tokenizer, + ) + print("✅ Numerics validated") + + keras_nlp.src.utils.preset_utils.save_to_preset(keras_model, preset) + keras_nlp.src.utils.preset_utils.save_to_preset( + keras_tokenizer, preset, config_filename="tokenizer.json" + ) + print("✅ Preset saved") if __name__ == "__main__": From 435d8b2e5eb47781e2ac8763a0622f9d2533b9de Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 01:48:42 +0200 Subject: [PATCH 02/22] Add bloom tokenizer to init --- keras_nlp/models/__init__.py | 1 + tools/checkpoint_conversion/convert_bloom_checkpoints.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/keras_nlp/models/__init__.py b/keras_nlp/models/__init__.py index 30736594d0..7b898138fd 100644 --- a/keras_nlp/models/__init__.py +++ b/keras_nlp/models/__init__.py @@ -36,6 +36,7 @@ from keras_nlp.models.bert.bert_preprocessor import BertPreprocessor from keras_nlp.models.bert.bert_tokenizer import BertTokenizer from keras_nlp.models.bloom.bloom_backbone import BloomBackbone +from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone from keras_nlp.models.deberta_v3.deberta_v3_classifier import ( DebertaV3Classifier, diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index 89eb266ee0..7bb63b5fa5 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -27,8 +27,8 @@ from huggingface_hub import snapshot_download # noqa: E402 import keras_nlp # noqa: E402 -from keras_nlp.models.bloom.bloom_backbone import BloomBackbone # noqa: E402 -from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer # noqa: E402 +from keras_nlp.models import BloomBackbone # noqa: E402 +from keras_nlp.models import BloomTokenizer # noqa: E402 FLAGS = flags.FLAGS From a0000dc96c00eb01c0e169fc85d300b802c3a21e Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 01:59:20 +0200 Subject: [PATCH 03/22] Convert weights --- .../convert_bloom_checkpoints.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index 7bb63b5fa5..cb282ea1f2 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -88,7 +88,7 @@ def convert_tokenizer(hf_model_dir): return BloomTokenizer(vocabulary=vocab, merges=merges) -def convert_checkpoints(keras_model, hf_model): +def convert_weights(keras_model, hf_model): hidden_dim = keras_model.hidden_dim num_heads = keras_model.num_heads head_dim = hidden_dim // num_heads @@ -177,13 +177,6 @@ def convert_checkpoints(keras_model, hf_model): hf_wts[f"h.{i}.mlp.dense_4h_to_h.bias"] ) - # Save the model. - print(f"\n-> Saving KerasNLP model weights to `{FLAGS.preset}.weights.h5`.") - keras_model.save_weights(f"{FLAGS.preset}.weights.h5") - - return keras_model - - def validate_output( hf_model, keras_model, @@ -222,9 +215,6 @@ def validate_output( print("🔶 KerasNLP output:", token_ids_keras) print("🔶 HF output:", token_ids_hf) - # Show the MD5 checksum of the model weights. - print("Model md5sum: ", get_md5_checksum(f"./{FLAGS.preset}.weights.h5")) - def main(_): preset = FLAGS.preset @@ -247,6 +237,9 @@ def main(_): keras_tokenizer = convert_tokenizer(hf_model_dir) print("✅ Keras model loaded") + convert_weights(keras_model, hf_model) + print("✅ Weights converted") + validate_output( hf_model, keras_model, From 7007f4866d5851d9c5b658c4aba91cddd2700c74 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 02:11:14 +0200 Subject: [PATCH 04/22] check backbone output from bloom tokenizer --- .../convert_bloom_checkpoints.py | 36 +++++++------------ 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index cb282ea1f2..916bb82a30 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -183,39 +183,29 @@ def validate_output( hf_tokenizer, keras_tokenizer, ): - print("⏺ checking Backbone output") - - hf_model_input = { - "input_ids": torch.tensor([[59414, 15, 2670, 35433, 632, 207595]]), - "attention_mask": torch.tensor([[1, 1, 1, 1, 1, 1]]), - } - - hf_model_outputs = hf_model(**hf_model_input) - hf_model_outputs = hf_model_outputs.last_hidden_state - hf_model_outputs = hf_model_outputs.detach().numpy() + print("⏺ checking Tokenizer and Backbone outputs") + + input_str = ["the quick brown fox ran, galloped and jumped."] + # KerasNLP + token_ids = keras_tokenizer(input_str) + padding_mask = token_ids != 0 keras_model_input = { - "token_ids": torch.tensor([[59414, 15, 2670, 35433, 632, 207595]]), - "padding_mask": torch.tensor([[1, 1, 1, 1, 1, 1]]), + "token_ids": token_ids.to_tensor(), + "padding_mask": padding_mask.to_tensor(), } - keras_model_outputs = keras_model.predict(keras_model_input) + hf_model_input = hf_tokenizer(input_str, return_tensors="pt") + + hf_model_outputs = hf_model(**hf_model_input).last_hidden_state + hf_model_outputs = hf_model_outputs.detach().numpy() + # Comparing the outputs. print("🔶 KerasNLP output:", keras_model_outputs[0, 0, :10]) print("🔶 HF output:", hf_model_outputs[0, 0, :10]) print("🔶 Difference:", np.mean(keras_model_outputs - hf_model_outputs)) - print("⏺ checking tokenizer output") - - input_str = ["the quick brown fox ran, galloped and jumped."] - token_ids_keras = keras_tokenizer(input_str) - token_ids_hf = hf_tokenizer(input_str) - - print("🔶 KerasNLP output:", token_ids_keras) - print("🔶 HF output:", token_ids_hf) - - def main(_): preset = FLAGS.preset From 0e270bd6d7fda61b076300319b9e29658851f076 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 02:14:45 +0200 Subject: [PATCH 05/22] Convert inputs into tensors --- tools/checkpoint_conversion/convert_bloom_checkpoints.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index 916bb82a30..5be8ca4073 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -191,8 +191,8 @@ def validate_output( token_ids = keras_tokenizer(input_str) padding_mask = token_ids != 0 keras_model_input = { - "token_ids": token_ids.to_tensor(), - "padding_mask": padding_mask.to_tensor(), + "token_ids": torch.tensor(token_ids), + "padding_mask": torch.tensor(padding_mask), } keras_model_outputs = keras_model.predict(keras_model_input) From 03c15e0fd4b932df459155f2de51cdbb2713649d Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 02:20:19 +0200 Subject: [PATCH 06/22] Fix backbone input --- tools/checkpoint_conversion/convert_bloom_checkpoints.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index 5be8ca4073..f8db6077e6 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -191,8 +191,8 @@ def validate_output( token_ids = keras_tokenizer(input_str) padding_mask = token_ids != 0 keras_model_input = { - "token_ids": torch.tensor(token_ids), - "padding_mask": torch.tensor(padding_mask), + "token_ids": token_ids, + "padding_mask": padding_mask, } keras_model_outputs = keras_model.predict(keras_model_input) From 0f4609e97c4c9e652c22b4fcaca53e12e45dbbb6 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 02:26:08 +0200 Subject: [PATCH 07/22] convert backbone input to torch tensor --- tools/checkpoint_conversion/convert_bloom_checkpoints.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index f8db6077e6..a6e5dd55ac 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -188,8 +188,8 @@ def validate_output( input_str = ["the quick brown fox ran, galloped and jumped."] # KerasNLP - token_ids = keras_tokenizer(input_str) - padding_mask = token_ids != 0 + token_ids = torch.tensor(keras_tokenizer(input_str)) + padding_mask = token_ids != 3 keras_model_input = { "token_ids": token_ids, "padding_mask": padding_mask, From 1b86e3891074e519336b4afa2232aabafcb7363f Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 02:39:49 +0200 Subject: [PATCH 08/22] Change presets naming --- .../convert_bloom_checkpoints.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index a6e5dd55ac..75d3e0080a 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -33,12 +33,12 @@ FLAGS = flags.FLAGS PRESET_MAP = { - "bloom_tiny": "bigscience/bloom-560m", - "bloom_extra_small": "bigscience/bloom-1b1", - "bloom_small": "bigscience/bloom-1b7", - "bloom_meduim": "bigscience/bloom-3b", - "bloom_large": "bigscience/bloom-7b1", - "bloom_extra_large": "bigscience/bloom", + "bloom_560m_multi": "bigscience/bloom-560m", + "bloom_1.1b_multi": "bigscience/bloom-1b1", + "bloom_1.7b_multi": "bigscience/bloom-1b7", + "bloom_3b_multi": "bigscience/bloom-3b", + "bloom_7b_multi": "bigscience/bloom-7b1", + "bloom_multi": "bigscience/bloom", } EXTRACT_DIR = "./model" From bbced6be679d7d1fba3137701b33d61e254f57ff Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 02:47:48 +0200 Subject: [PATCH 09/22] Change presets naming --- tools/checkpoint_conversion/convert_bloom_checkpoints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index 75d3e0080a..cc3a5a62e0 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -38,7 +38,7 @@ "bloom_1.7b_multi": "bigscience/bloom-1b7", "bloom_3b_multi": "bigscience/bloom-3b", "bloom_7b_multi": "bigscience/bloom-7b1", - "bloom_multi": "bigscience/bloom", + "bloom_176b_multi": "bigscience/bloom", } EXTRACT_DIR = "./model" From 0770b8a2686a9c1a03c16c4fc3f0c20995a94653 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 04:11:10 +0200 Subject: [PATCH 10/22] Comment unused code for now --- keras_nlp/models/bloom/bloom_presets.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/keras_nlp/models/bloom/bloom_presets.py b/keras_nlp/models/bloom/bloom_presets.py index 31d0600516..227061bab1 100644 --- a/keras_nlp/models/bloom/bloom_presets.py +++ b/keras_nlp/models/bloom/bloom_presets.py @@ -11,20 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""BERT model preset configurations.""" +"""BLOOM model preset configurations.""" backbone_presets = { - "bloom_tiny_multi": { + "bloom_560m_multi": { "metadata": { - "description": ( - "2-layer BERT model where all input is lowercased. " - "Trained on English Wikipedia + BooksCorpus." - ), - "params": 4385920, - "official_name": "BERT", - "path": "bert", - "model_card": "https://github.com/google-research/bert/blob/master/README.md", + # "description": ( + # "2-layer BERT model where all input is lowercased. " + # "Trained on English Wikipedia + BooksCorpus." + # ), + "params": 816115712, + "official_name": "BLOOM", + "path": "bloom", + # "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "kaggle_handle": "kaggle://keras/bert/keras/bert_tiny_en_uncased/2", + # "kaggle_handle": "kaggle://keras/bert/keras/bert_tiny_en_uncased/2", }, } From 75163e7ab76f83b7e959d50fc42272326b17b2ed Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 04:12:10 +0200 Subject: [PATCH 11/22] Format the code --- tools/checkpoint_conversion/convert_bloom_checkpoints.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index cc3a5a62e0..5fa930bec4 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -23,7 +23,6 @@ import transformers # noqa: E402 from absl import app # noqa: E402 from absl import flags # noqa: E402 -from checkpoint_conversion_utils import get_md5_checksum # noqa: E402 from huggingface_hub import snapshot_download # noqa: E402 import keras_nlp # noqa: E402 @@ -177,6 +176,7 @@ def convert_weights(keras_model, hf_model): hf_wts[f"h.{i}.mlp.dense_4h_to_h.bias"] ) + def validate_output( hf_model, keras_model, @@ -184,7 +184,7 @@ def validate_output( keras_tokenizer, ): print("⏺ checking Tokenizer and Backbone outputs") - + input_str = ["the quick brown fox ran, galloped and jumped."] # KerasNLP @@ -206,6 +206,7 @@ def validate_output( print("🔶 HF output:", hf_model_outputs[0, 0, :10]) print("🔶 Difference:", np.mean(keras_model_outputs - hf_model_outputs)) + def main(_): preset = FLAGS.preset From 3b139fcaf53c4aa8a8145ba1456ba1afc32f5e13 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Wed, 17 Jan 2024 04:13:05 +0200 Subject: [PATCH 12/22] change tokenizer example --- keras_nlp/models/bloom/bloom_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_nlp/models/bloom/bloom_tokenizer.py b/keras_nlp/models/bloom/bloom_tokenizer.py index 7a2d7a340a..dbf29078bb 100644 --- a/keras_nlp/models/bloom/bloom_tokenizer.py +++ b/keras_nlp/models/bloom/bloom_tokenizer.py @@ -50,7 +50,7 @@ class BloomTokenizer(BytePairTokenizer): ```python # Unbatched input. - tokenizer = keras_nlp.models.BloomTokenizer.from_preset("gpt2_base_en") + tokenizer = keras_nlp.models.BloomTokenizer.from_preset("bloom_560m_multi") tokenizer("The quick brown fox jumped.") # Batched input. From c826837ca6bbef7cd926fd08ae7fd158fdb76aab Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Thu, 18 Jan 2024 03:54:00 +0200 Subject: [PATCH 13/22] Add a preset and initial for tests --- keras_nlp/models/bloom/bloom_backbone.py | 11 +++- keras_nlp/models/bloom/bloom_backbone_test.py | 24 +++++++ keras_nlp/models/bloom/bloom_presets.py | 12 ++-- .../models/bloom/bloom_tokenizer_test.py | 63 +++++++++++++++++++ 4 files changed, 103 insertions(+), 7 deletions(-) create mode 100644 keras_nlp/models/bloom/bloom_tokenizer_test.py diff --git a/keras_nlp/models/bloom/bloom_backbone.py b/keras_nlp/models/bloom/bloom_backbone.py index e3d66998bc..3bac95bb80 100644 --- a/keras_nlp/models/bloom/bloom_backbone.py +++ b/keras_nlp/models/bloom/bloom_backbone.py @@ -11,11 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy + from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding from keras_nlp.models.backbone import Backbone from keras_nlp.models.bloom.bloom_decoder import BloomDecoder +from keras_nlp.models.bloom.bloom_presets import backbone_presets +from keras_nlp.utils.python_utils import classproperty def _bloom_kernel_initializer(stddev=0.02): @@ -35,7 +39,8 @@ class BloomBackbone(Backbone): load preset architectures and weights, use the `from_preset()` constructor. Disclaimer: Pre-trained models are provided on an "as is" basis, without - warranties or conditions of any kind. + warranties or conditions of any kind. The underlying model is provided by a + third party and subject to a separate license, available [here](https://huggingface.co/spaces/bigscience/license). Args: vocabulary_size: int. The size of the token vocabulary. @@ -151,3 +156,7 @@ def get_config(self): } ) return config + + @classproperty + def presets(cls): + return copy.deepcopy(backbone_presets) diff --git a/keras_nlp/models/bloom/bloom_backbone_test.py b/keras_nlp/models/bloom/bloom_backbone_test.py index 99cb9d357e..bc950b2e72 100644 --- a/keras_nlp/models/bloom/bloom_backbone_test.py +++ b/keras_nlp/models/bloom/bloom_backbone_test.py @@ -49,3 +49,27 @@ def test_saved_model(self): init_kwargs=self.init_kwargs, input_data=self.input_data, ) + + # @pytest.mark.large + # def test_smallest_preset(self): + # self.run_preset_test( + # cls=BloomBackbone, + # preset="bloom_560m_multi", + # input_data={ + # "token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"), + # "padding_mask": ops.ones((1, 4), dtype="int32"), + # }, + # expected_output_shape=(1, 4, 128), + # # The forward pass from a preset should be stable! + # expected_partial_output=ops.array([-1.38173, 0.16598, -2.92788, -2.66958, -0.61556]) + + # ) + + # @pytest.mark.extra_large + # def test_all_presets(self): + # for preset in BloomBackbone.presets: + # self.run_preset_test( + # cls=BloomBackbone, + # preset=preset, + # input_data=self.input_data, + # ) \ No newline at end of file diff --git a/keras_nlp/models/bloom/bloom_presets.py b/keras_nlp/models/bloom/bloom_presets.py index 227061bab1..0ed400c36c 100644 --- a/keras_nlp/models/bloom/bloom_presets.py +++ b/keras_nlp/models/bloom/bloom_presets.py @@ -16,15 +16,15 @@ backbone_presets = { "bloom_560m_multi": { "metadata": { - # "description": ( - # "2-layer BERT model where all input is lowercased. " - # "Trained on English Wikipedia + BooksCorpus." - # ), + "description": ( + "24-layer Bloom model. trained on 45 natural languages and " + "12 programming languages." + ), "params": 816115712, "official_name": "BLOOM", "path": "bloom", - # "model_card": "https://github.com/google-research/bert/blob/master/README.md", + "model_card": "https://huggingface.co/bigscience/bloom", }, - # "kaggle_handle": "kaggle://keras/bert/keras/bert_tiny_en_uncased/2", + "kaggle_handle": "kaggle://mohamedabuelnasr/bloom/keras/bloom_560m_multi/1", }, } diff --git a/keras_nlp/models/bloom/bloom_tokenizer_test.py b/keras_nlp/models/bloom/bloom_tokenizer_test.py new file mode 100644 index 0000000000..2e705c714f --- /dev/null +++ b/keras_nlp/models/bloom/bloom_tokenizer_test.py @@ -0,0 +1,63 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer +from keras_nlp.tests.test_case import TestCase + + +# class GPT2TokenizerTest(TestCase): + # def setUp(self): + # self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + # self.vocab += ["", "", ""] + # self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + # self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + # self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + # self.merges += ["Ġai r", "Ġa i", "pla ne"] + # self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} + # self.input_data = [ + # " airplane at airport", + # " airplane airport", + # ] + + # def test_tokenizer_basics(self): + # self.run_preprocessing_layer_test( + # cls=BloomTokenizer, + # init_kwargs=self.init_kwargs, + # input_data=self.input_data, + # expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]], + # ) + + # def test_errors_missing_special_tokens(self): + # with self.assertRaises(ValueError): + # BloomTokenizer(vocabulary=["a", "b", "c"], merges=[]) + + # @pytest.mark.large + # def test_smallest_preset(self): + # self.run_preset_test( + # cls=BloomTokenizer, + # preset="bloom_560m_multi", + # input_data=["The quick brown fox."], + # expected_output=[[464, 2068, 7586, 21831, 13]], + # ) + + # @pytest.mark.extra_large + # def test_all_presets(self): + # for preset in BloomTokenizer.presets: + # self.run_preset_test( + # cls=BloomTokenizer, + # preset=preset, + # input_data=self.input_data, + # ) From 87024adbbf7aaf8eeb7593bae003099427b1ceb1 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Thu, 18 Jan 2024 04:44:04 +0200 Subject: [PATCH 14/22] Add backbone preset test --- keras_nlp/models/bloom/bloom_backbone_test.py | 45 ++++----- keras_nlp/models/bloom/bloom_tokenizer.py | 2 +- .../models/bloom/bloom_tokenizer_test.py | 91 +++++++++---------- 3 files changed, 69 insertions(+), 69 deletions(-) diff --git a/keras_nlp/models/bloom/bloom_backbone_test.py b/keras_nlp/models/bloom/bloom_backbone_test.py index bc950b2e72..c90ee01af2 100644 --- a/keras_nlp/models/bloom/bloom_backbone_test.py +++ b/keras_nlp/models/bloom/bloom_backbone_test.py @@ -50,26 +50,27 @@ def test_saved_model(self): input_data=self.input_data, ) - # @pytest.mark.large - # def test_smallest_preset(self): - # self.run_preset_test( - # cls=BloomBackbone, - # preset="bloom_560m_multi", - # input_data={ - # "token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"), - # "padding_mask": ops.ones((1, 4), dtype="int32"), - # }, - # expected_output_shape=(1, 4, 128), - # # The forward pass from a preset should be stable! - # expected_partial_output=ops.array([-1.38173, 0.16598, -2.92788, -2.66958, -0.61556]) - - # ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=BloomBackbone, + preset="bloom_560m_multi", + input_data={ + "token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"), + "padding_mask": ops.ones((1, 4), dtype="int32"), + }, + expected_output_shape=(1, 4, 1024), + # The forward pass from a preset should be stable! + expected_partial_output=ops.array( + [2.4394186, 1.4131186, -2.7810357, -6.330823, -1.0599766] + ), + ) - # @pytest.mark.extra_large - # def test_all_presets(self): - # for preset in BloomBackbone.presets: - # self.run_preset_test( - # cls=BloomBackbone, - # preset=preset, - # input_data=self.input_data, - # ) \ No newline at end of file + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BloomBackbone.presets: + self.run_preset_test( + cls=BloomBackbone, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/bloom/bloom_tokenizer.py b/keras_nlp/models/bloom/bloom_tokenizer.py index dbf29078bb..ced132b8ac 100644 --- a/keras_nlp/models/bloom/bloom_tokenizer.py +++ b/keras_nlp/models/bloom/bloom_tokenizer.py @@ -15,7 +15,7 @@ import copy from keras_nlp.api_export import keras_nlp_export -from keras_nlp.models.gpt2.gpt2_presets import backbone_presets +from keras_nlp.models.bloom.bloom_presets import backbone_presets from keras_nlp.tokenizers.byte_pair_tokenizer import BytePairTokenizer from keras_nlp.utils.python_utils import classproperty diff --git a/keras_nlp/models/bloom/bloom_tokenizer_test.py b/keras_nlp/models/bloom/bloom_tokenizer_test.py index 2e705c714f..055fd19ca2 100644 --- a/keras_nlp/models/bloom/bloom_tokenizer_test.py +++ b/keras_nlp/models/bloom/bloom_tokenizer_test.py @@ -12,52 +12,51 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest - -from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer -from keras_nlp.tests.test_case import TestCase +# import pytest +# from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer +# from keras_nlp.tests.test_case import TestCase # class GPT2TokenizerTest(TestCase): - # def setUp(self): - # self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] - # self.vocab += ["", "", ""] - # self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - # self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - # self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - # self.merges += ["Ġai r", "Ġa i", "pla ne"] - # self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} - # self.input_data = [ - # " airplane at airport", - # " airplane airport", - # ] - - # def test_tokenizer_basics(self): - # self.run_preprocessing_layer_test( - # cls=BloomTokenizer, - # init_kwargs=self.init_kwargs, - # input_data=self.input_data, - # expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]], - # ) - - # def test_errors_missing_special_tokens(self): - # with self.assertRaises(ValueError): - # BloomTokenizer(vocabulary=["a", "b", "c"], merges=[]) - - # @pytest.mark.large - # def test_smallest_preset(self): - # self.run_preset_test( - # cls=BloomTokenizer, - # preset="bloom_560m_multi", - # input_data=["The quick brown fox."], - # expected_output=[[464, 2068, 7586, 21831, 13]], - # ) - - # @pytest.mark.extra_large - # def test_all_presets(self): - # for preset in BloomTokenizer.presets: - # self.run_preset_test( - # cls=BloomTokenizer, - # preset=preset, - # input_data=self.input_data, - # ) +# def setUp(self): +# self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] +# self.vocab += ["", "", ""] +# self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) +# self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] +# self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] +# self.merges += ["Ġai r", "Ġa i", "pla ne"] +# self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} +# self.input_data = [ +# " airplane at airport", +# " airplane airport", +# ] + +# def test_tokenizer_basics(self): +# self.run_preprocessing_layer_test( +# cls=BloomTokenizer, +# init_kwargs=self.init_kwargs, +# input_data=self.input_data, +# expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]], +# ) + +# def test_errors_missing_special_tokens(self): +# with self.assertRaises(ValueError): +# BloomTokenizer(vocabulary=["a", "b", "c"], merges=[]) + +# @pytest.mark.large +# def test_smallest_preset(self): +# self.run_preset_test( +# cls=BloomTokenizer, +# preset="bloom_560m_multi", +# input_data=["The quick brown fox."], +# expected_output=[[464, 2068, 7586, 21831, 13]], +# ) + +# @pytest.mark.extra_large +# def test_all_presets(self): +# for preset in BloomTokenizer.presets: +# self.run_preset_test( +# cls=BloomTokenizer, +# preset=preset, +# input_data=self.input_data, +# ) From 087e31b1aeb3dde8544cf773dd26efe621fc3860 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Thu, 18 Jan 2024 05:10:33 +0200 Subject: [PATCH 15/22] Add tokenizer test --- .../models/bloom/bloom_tokenizer_test.py | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/keras_nlp/models/bloom/bloom_tokenizer_test.py b/keras_nlp/models/bloom/bloom_tokenizer_test.py index 055fd19ca2..cbf49a8327 100644 --- a/keras_nlp/models/bloom/bloom_tokenizer_test.py +++ b/keras_nlp/models/bloom/bloom_tokenizer_test.py @@ -12,51 +12,51 @@ # See the License for the specific language governing permissions and # limitations under the License. -# import pytest +import pytest -# from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer -# from keras_nlp.tests.test_case import TestCase +from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer +from keras_nlp.tests.test_case import TestCase -# class GPT2TokenizerTest(TestCase): -# def setUp(self): -# self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] -# self.vocab += ["", "", ""] -# self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) -# self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] -# self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] -# self.merges += ["Ġai r", "Ġa i", "pla ne"] -# self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} -# self.input_data = [ -# " airplane at airport", -# " airplane airport", -# ] +class GPT2TokenizerTest(TestCase): + def setUp(self): + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["", "", ""] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} + self.input_data = [ + " airplane at airport", + " airplane airport", + ] -# def test_tokenizer_basics(self): -# self.run_preprocessing_layer_test( -# cls=BloomTokenizer, -# init_kwargs=self.init_kwargs, -# input_data=self.input_data, -# expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]], -# ) + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=BloomTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[6, 1, 3, 4, 2, 5, 8], [6, 1, 3, 2, 5, 8]], + ) -# def test_errors_missing_special_tokens(self): -# with self.assertRaises(ValueError): -# BloomTokenizer(vocabulary=["a", "b", "c"], merges=[]) + def test_errors_missing_special_tokens(self): + with self.assertRaises(ValueError): + BloomTokenizer(vocabulary=["a", "b", "c"], merges=[]) -# @pytest.mark.large -# def test_smallest_preset(self): -# self.run_preset_test( -# cls=BloomTokenizer, -# preset="bloom_560m_multi", -# input_data=["The quick brown fox."], -# expected_output=[[464, 2068, 7586, 21831, 13]], -# ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=BloomTokenizer, + preset="bloom_560m_multi", + input_data=["The quick brown fox."], + expected_output=[[2175, 23714, 73173, 144252, 17]], + ) -# @pytest.mark.extra_large -# def test_all_presets(self): -# for preset in BloomTokenizer.presets: -# self.run_preset_test( -# cls=BloomTokenizer, -# preset=preset, -# input_data=self.input_data, -# ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BloomTokenizer.presets: + self.run_preset_test( + cls=BloomTokenizer, + preset=preset, + input_data=self.input_data, + ) From 03a1bfc89a9954f2caf2baf8c0d7960a8dc7ef66 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Thu, 18 Jan 2024 05:11:11 +0200 Subject: [PATCH 16/22] Format the code --- keras_nlp/models/bloom/bloom_tokenizer_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_nlp/models/bloom/bloom_tokenizer_test.py b/keras_nlp/models/bloom/bloom_tokenizer_test.py index cbf49a8327..77839a732c 100644 --- a/keras_nlp/models/bloom/bloom_tokenizer_test.py +++ b/keras_nlp/models/bloom/bloom_tokenizer_test.py @@ -17,6 +17,7 @@ from keras_nlp.models.bloom.bloom_tokenizer import BloomTokenizer from keras_nlp.tests.test_case import TestCase + class GPT2TokenizerTest(TestCase): def setUp(self): self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] From 350c1dc954c59a6f1ca72585b09e640c04be0b17 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Thu, 18 Jan 2024 05:25:10 +0200 Subject: [PATCH 17/22] Fix tokenizer test --- keras_nlp/models/bloom/bloom_tokenizer_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/keras_nlp/models/bloom/bloom_tokenizer_test.py b/keras_nlp/models/bloom/bloom_tokenizer_test.py index 77839a732c..9ae9c0cc00 100644 --- a/keras_nlp/models/bloom/bloom_tokenizer_test.py +++ b/keras_nlp/models/bloom/bloom_tokenizer_test.py @@ -18,7 +18,7 @@ from keras_nlp.tests.test_case import TestCase -class GPT2TokenizerTest(TestCase): +class BloomTokenizerTest(TestCase): def setUp(self): self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] self.vocab += ["", "", ""] @@ -28,7 +28,7 @@ def setUp(self): self.merges += ["Ġai r", "Ġa i", "pla ne"] self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} self.input_data = [ - " airplane at airport", + "airplane at airport", " airplane airport", ] @@ -37,7 +37,7 @@ def test_tokenizer_basics(self): cls=BloomTokenizer, init_kwargs=self.init_kwargs, input_data=self.input_data, - expected_output=[[6, 1, 3, 4, 2, 5, 8], [6, 1, 3, 2, 5, 8]], + expected_output=[[6, 1, 3, 4, 2, 5, 8], [6, 2, 3, 2, 5, 8]], ) def test_errors_missing_special_tokens(self): From 01264ef80a47f6f5b086d688b7775fe183b7ec14 Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Thu, 18 Jan 2024 05:48:24 +0200 Subject: [PATCH 18/22] Rename BloomBackboneTest --- keras_nlp/models/bloom/bloom_backbone_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_nlp/models/bloom/bloom_backbone_test.py b/keras_nlp/models/bloom/bloom_backbone_test.py index c90ee01af2..83732e4945 100644 --- a/keras_nlp/models/bloom/bloom_backbone_test.py +++ b/keras_nlp/models/bloom/bloom_backbone_test.py @@ -19,7 +19,7 @@ from keras_nlp.tests.test_case import TestCase -class BloomTest(TestCase): +class BloomBackboneTest(TestCase): def setUp(self): self.init_kwargs = { "vocabulary_size": 10, From 29be0f0fcf859d51de89e5448c4036e17949842d Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Sun, 21 Jan 2024 18:36:48 +0200 Subject: [PATCH 19/22] Small Fixes --- tools/checkpoint_conversion/convert_bloom_checkpoints.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bloom_checkpoints.py b/tools/checkpoint_conversion/convert_bloom_checkpoints.py index 5fa930bec4..38acd099cf 100644 --- a/tools/checkpoint_conversion/convert_bloom_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bloom_checkpoints.py @@ -18,12 +18,12 @@ os.environ["KERAS_BACKEND"] = "torch" os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +import huggingface_hub # noqa: E402 import numpy as np # noqa: E402 import torch # noqa: E402 import transformers # noqa: E402 from absl import app # noqa: E402 from absl import flags # noqa: E402 -from huggingface_hub import snapshot_download # noqa: E402 import keras_nlp # noqa: E402 from keras_nlp.models import BloomBackbone # noqa: E402 @@ -50,7 +50,7 @@ def download_hf_model(hf_model_name): - hf_model_dir = snapshot_download( + hf_model_dir = huggingface_hub.snapshot_download( repo_id=hf_model_name, allow_patterns=["*.json", "*.bin"], ignore_patterns=["onnx/*"], @@ -183,8 +183,6 @@ def validate_output( hf_tokenizer, keras_tokenizer, ): - print("⏺ checking Tokenizer and Backbone outputs") - input_str = ["the quick brown fox ran, galloped and jumped."] # KerasNLP From a84bc10653513f4be21901d3ef3ce7218ff8f45b Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Sun, 21 Jan 2024 19:37:15 +0200 Subject: [PATCH 20/22] Fix tokenizer example --- keras_nlp/models/bloom/bloom_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_nlp/models/bloom/bloom_tokenizer.py b/keras_nlp/models/bloom/bloom_tokenizer.py index ced132b8ac..cc3fcc2fc3 100644 --- a/keras_nlp/models/bloom/bloom_tokenizer.py +++ b/keras_nlp/models/bloom/bloom_tokenizer.py @@ -60,7 +60,7 @@ class BloomTokenizer(BytePairTokenizer): tokenizer.detokenize(tokenizer("The quick brown fox jumped.")) # Custom vocabulary. - vocab = {"<|endoftext|>": 0, "a": 4, "Ġquick": 5, "Ġfox": 6} + vocab = {"": 0, "": 1, "": 2, "a": 3, "Ġquick": 4, "Ġfox": 5} merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] merges += ["Ġ f", "o x", "Ġf ox"] tokenizer = keras_nlp.models.BloomTokenizer(vocabulary=vocab, merges=merges) From 2c521a0d8af8639eca29092655ceedd8738575ae Mon Sep 17 00:00:00 2001 From: abuelnasr0 Date: Sun, 21 Jan 2024 19:37:33 +0200 Subject: [PATCH 21/22] Add example for backbone --- keras_nlp/models/bloom/bloom_backbone.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/keras_nlp/models/bloom/bloom_backbone.py b/keras_nlp/models/bloom/bloom_backbone.py index 3bac95bb80..2fd18ed760 100644 --- a/keras_nlp/models/bloom/bloom_backbone.py +++ b/keras_nlp/models/bloom/bloom_backbone.py @@ -63,6 +63,10 @@ class BloomBackbone(Backbone): "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), } + # Pretrained BLOOM decoder. + model = keras_nlp.models.BloomBackbone.from_preset("bloom_560m_multi") + model(input_data) + # Randomly initialized BLOOM decoder with a custom config. model = keras_nlp.models.BloomBackbone( vocabulary_size=10, From 62a2db5ef074aa48a74339a2c4cff169f317bee7 Mon Sep 17 00:00:00 2001 From: Matt Watson Date: Wed, 24 Jan 2024 22:37:17 -0800 Subject: [PATCH 22/22] Update preset to Keras --- keras_nlp/models/bloom/bloom_presets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_nlp/models/bloom/bloom_presets.py b/keras_nlp/models/bloom/bloom_presets.py index 0ed400c36c..d3e9c780c0 100644 --- a/keras_nlp/models/bloom/bloom_presets.py +++ b/keras_nlp/models/bloom/bloom_presets.py @@ -25,6 +25,6 @@ "path": "bloom", "model_card": "https://huggingface.co/bigscience/bloom", }, - "kaggle_handle": "kaggle://mohamedabuelnasr/bloom/keras/bloom_560m_multi/1", + "kaggle_handle": "kaggle://keras/bloom/keras/bloom_560m_multi/1", }, }