🚨🚨🚨 [SPM] Finish fix spm models 🚨🚨🚨 (#25224)
Changes from 11 commits.
Changes to the slow (SentencePiece-based) `T5Tokenizer`:

```diff
@@ -22,7 +22,9 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

 import sentencepiece as spm
+from sentencepiece import SentencePieceProcessor

+from ...convert_slow_tokenizer import import_protobuf
 from ...tokenization_utils import PreTrainedTokenizer
```
````diff
@@ -106,9 +108,10 @@ class T5Tokenizer(PreTrainedTokenizer):
             - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
-        legacy (`bool`, *optional*, defaults to `True`):
+        legacy (`bool`, *optional*):
             Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
-            which includes fixes to properly handle tokens that appear after special tokens. A simple example:
+            and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
+            example:

             - `legacy=True`:
             ```python
````
````diff
@@ -126,7 +129,7 @@ class T5Tokenizer(PreTrainedTokenizer):
             >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
             [8774, 32099, 5, 1]
             ```
-            Checkout the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
+            Checkout the pull request and the issue [here](https://github.com/huggingface/transformers/pull/25224) for
             more details.

    Attributes:
````
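As a quick illustration of the documented `legacy` switch (this snippet is not part of the diff; `"t5-base"` is just an example checkpoint and the snippet assumes `sentencepiece` is installed):

```python
# Sketch only: comparing the legacy and fixed behaviour described in the docstring above.
from transformers import T5Tokenizer

legacy_tok = T5Tokenizer.from_pretrained("t5-base", legacy=True)
fixed_tok = T5Tokenizer.from_pretrained("t5-base", legacy=False)

# With legacy=False no spurious extra-space token is inserted after the special token,
# matching the `[8774, 32099, 5, 1]` example quoted in the docstring.
print(legacy_tok.encode("Hello <extra_id_0>."))
print(fixed_tok.encode("Hello <extra_id_0>."))
```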
```diff
@@ -187,8 +190,23 @@ def __init__(
         self.vocab_file = vocab_file
         self._extra_ids = extra_ids

-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
+        self.sp_model = self.get_spm_processor()
+
+        self.unk_token_length = len(self.sp_model.encode(str(self.unk_token)))
+
+    def get_spm_processor(self):
+        tokenizer = SentencePieceProcessor(**self.sp_model_kwargs)
+        with open(self.vocab_file, "rb") as f:
+            sp_model = f.read()
+            model_pb2 = import_protobuf()
+            model = model_pb2.ModelProto.FromString(sp_model)
+            if not self.legacy:
+                normalizer_spec = model_pb2.NormalizerSpec()
+                normalizer_spec.add_dummy_prefix = False
+                model.normalizer_spec.MergeFrom(normalizer_spec)
+            sp_model = model.SerializeToString()
+            tokenizer.LoadFromSerializedProto(sp_model)
+        return tokenizer

     @staticmethod
     def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
```
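The key idea in `get_spm_processor` is to patch the serialized SentencePiece model so that its normalizer stops adding the dummy prefix space when `legacy=False`. Below is a minimal standalone sketch of that trick, not part of the PR; the `spiece.model` path is a placeholder and the snippet assumes `sentencepiece` and `protobuf` are installed:

```python
# Standalone sketch of the add_dummy_prefix patch (assumes a local SentencePiece file
# at "spiece.model"; the path is a placeholder).
from sentencepiece import SentencePieceProcessor
from transformers.convert_slow_tokenizer import import_protobuf

model_pb2 = import_protobuf()
with open("spiece.model", "rb") as f:
    model = model_pb2.ModelProto.FromString(f.read())

# Default: SentencePiece prepends a dummy prefix space, so "Hello" encodes like " Hello".
sp_default = SentencePieceProcessor()
sp_default.LoadFromSerializedProto(model.SerializeToString())

# Patched: turn the dummy prefix off in the normalizer spec and reload from the proto.
model.normalizer_spec.add_dummy_prefix = False
sp_patched = SentencePieceProcessor()
sp_patched.LoadFromSerializedProto(model.SerializeToString())

print(sp_default.encode("Hello", out_type=str))  # first piece starts with "▁"
print(sp_patched.encode("Hello", out_type=str))  # no "▁" is prepended
```

The diff above reaches the same result through `NormalizerSpec.MergeFrom` on the parsed proto before reloading it.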
```diff
@@ -335,6 +353,7 @@ def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
         # Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
         # the beginning of the text
         if not self.legacy:
+            # replacing " " by SPIECE_UNDERLINE prevents any form of stripping...
             text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
         return super().tokenize(text, **kwargs)
```
```diff
@@ -349,15 +368,10 @@ def _tokenize(self, text, **kwargs):
         the extra `SPIECE_UNDERLINE` prepended.
         """
         if not self.legacy:
-            is_first = text.startswith(SPIECE_UNDERLINE)
-            if is_first:
-                text = text[1:]
-
-        tokens = self.sp_model.encode(text, out_type=str)
-
-        if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
-            tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
-        return tokens
+            text = self.unk_token + text
+            tokens = self.sp_model.encode(text, out_type=str)
+            return tokens[self.unk_token_length :]
+
+        return self.sp_model.encode(text, out_type=str)

     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
```
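The rewritten `_tokenize` relies on a small trick: it prepends the unknown token before encoding and then slices off the unk pieces, so the remaining pieces correspond to how the text would be tokenized mid-sentence rather than at the start of one. A hedged illustration (assuming the `"t5-base"` checkpoint; any slow SentencePiece-based tokenizer should behave similarly):

```python
# Sketch of the unk-token prefix trick used by the new `_tokenize` (assumption: "t5-base").
from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-base", legacy=False)

unk = str(tok.unk_token)                 # "<unk>" for T5
unk_len = len(tok.sp_model.encode(unk))  # number of pieces the unk token encodes to

# Encoding "<unk>Hello" and dropping the unk pieces yields the pieces for "Hello" as it
# would appear in the middle of a sentence, not as a sentence start.
pieces = tok.sp_model.encode(unk + "Hello", out_type=str)
print(pieces[unk_len:])
```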
```diff
@@ -378,6 +392,8 @@ def _convert_id_to_token(self, index):
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
+        # since we manually add the prefix space, we have to remove it
+        tokens[0] = tokens[0].strip(SPIECE_UNDERLINE)
         out_string = ""
         prev_is_special = False
         for token in tokens:
```
Changes to the tokenizer tests:

```diff
@@ -499,6 +499,27 @@ def test_integration_test_xnli(self):

         self.assertEqual(decoded1, decoded2)

+    def test_special_token_special_word(self):
+        # the word inform should be split as ['in', 'form']
+        tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
+        tokenizer.add_tokens(["<REPR_END>"], special_tokens=True)
+        out1 = tokenizer.decode(
+            tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
+        )
+        self.assertEquals(out1, "<REPR_END>inform")
+        out2 = tokenizer.decode(
+            tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=True
+        )
+        self.assertEquals(out2, " <REPR_END> inform")
+        input_ids = tokenizer.encode("<REPR_END>inform", add_special_tokens=False)
+        self.assertEquals(input_ids, [29871, 32000, 262, 689])  # 29871 is the spiece underline, '▁'
+
+        out2 = tokenizer.decode(
+            tokenizer.encode(" <REPR_END> inform", add_special_tokens=False), spaces_between_special_tokens=False
+        )
+        # TODO ArthurZ currently we strip left and right, so this will not keep the spaces
+        self.assertEquals(out2, "<REPR_END>inform")
+

 @require_sentencepiece
 @require_tokenizers
```
```diff
@@ -523,7 +544,7 @@ def test_add_dummy_prefix(self):
         input_ids = self.tokenizer.encode(". Hello")
         self.assertEqual(input_ids, [7, 4, 156, 86, 20])
         sp_encode = self.tokenizer.sp_model.encode(". Hello")
-        self.assertEqual(input_ids, sp_encode)
+        self.assertEqual(input_ids, [7] + sp_encode)
         tokens = self.tokenizer.tokenize(". Hello")
         self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"])
```
```diff
@@ -534,15 +555,19 @@ def test_remove_extra_whitespaces(self):
         input_ids = self.tokenizer.encode(" . Hello")
         self.assertEqual(input_ids, [7, 4, 156, 86, 20])
         sp_encode = self.tokenizer.sp_model.encode(" . Hello")
-        self.assertEqual(input_ids, sp_encode)
+        self.assertEqual(input_ids, [7] + sp_encode)
```
> **Inline review comment (author):** Manually add the
```diff
         tokens = self.tokenizer.tokenize(" . Hello")
         self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"])

         # `'▁'` is also a whitespace
         input_ids = self.tokenizer.encode("▁He is not")
         self.assertEqual(input_ids, [156, 46, 44])
         tokens = self.tokenizer.tokenize("▁He is not")
-        sp_encode = self.tokenizer.sp_model.encode("▁He is not")
+        sp_encode = [
+            self.tokenizer.sp_model.piece_to_id("▁He"),
+            self.tokenizer.sp_model.piece_to_id("▁is"),
+            self.tokenizer.sp_model.piece_to_id("▁not"),
+        ]
         self.assertEqual(input_ids, sp_encode)
         self.assertEqual(tokens, ["▁He", "▁is", "▁not"])  # no extra space added
```
> **Inline review comment (author):** The previous test values were not really good; with this update it makes more sense.