Merged
Commits
114 commits
303a82c
fix
ArthurZucker Oct 3, 2023
cbf179a
Merge branch 'main' of github.com:huggingface/transformers into fix-main
ArthurZucker Oct 3, 2023
01e18db
last attempt
ArthurZucker Oct 3, 2023
08a560a
current work
ArthurZucker Oct 4, 2023
23c9513
fix forward compatibility
ArthurZucker Oct 4, 2023
0ae13ed
save all special tokens
ArthurZucker Oct 5, 2023
d887f68
Merge branch 'fix-main' of github.com:ArthurZucker/transformers into …
ArthurZucker Oct 5, 2023
72ff80e
current state
ArthurZucker Oct 5, 2023
b7b7d13
revert additional changes
ArthurZucker Oct 5, 2023
36d5303
updates
ArthurZucker Oct 5, 2023
ae93856
remove tokenizer.model
ArthurZucker Oct 5, 2023
88ea352
add a test and the fix
ArthurZucker Oct 5, 2023
ca98fbd
nit
ArthurZucker Oct 5, 2023
3c22fbb
revert one more break
ArthurZucker Oct 5, 2023
dc93d5e
fix typefield issue
ArthurZucker Oct 5, 2023
00997e9
quality
ArthurZucker Oct 5, 2023
6143634
more tests
ArthurZucker Oct 5, 2023
907591f
fix fields for FC
ArthurZucker Oct 5, 2023
5df5a83
Merge branch 'fix-main' of github.com:ArthurZucker/transformers into …
ArthurZucker Oct 5, 2023
66ecb9e
Merge branch 'fix-main' of github.com:ArthurZucker/transformers into …
ArthurZucker Oct 5, 2023
0e7bd61
more nits?
ArthurZucker Oct 5, 2023
381a0ec
Merge branch 'fix-main' of github.com:ArthurZucker/transformers into …
ArthurZucker Oct 6, 2023
bf75334
new additional changes
ArthurZucker Oct 6, 2023
fafbbed
how
ArthurZucker Oct 6, 2023
c6de7b2
some updates
ArthurZucker Oct 6, 2023
9a6e750
simplify all
ArthurZucker Oct 7, 2023
8c4ec2c
more nits
ArthurZucker Oct 7, 2023
621ebae
revert some things to original
ArthurZucker Oct 7, 2023
6a6095e
nice
ArthurZucker Oct 7, 2023
e0e5dea
nits
ArthurZucker Oct 7, 2023
92c7754
a small hack
ArthurZucker Oct 7, 2023
9fbbafe
more nits
ArthurZucker Oct 7, 2023
25e2df9
ahhaha
ArthurZucker Oct 7, 2023
2b18cc2
Merge branch 'main' of github.com:huggingface/transformers into fix-main
ArthurZucker Oct 7, 2023
078c94e
fixup
ArthurZucker Oct 7, 2023
ef1e598
update
ArthurZucker Oct 9, 2023
9bf12a8
make test run on ci
ArthurZucker Oct 11, 2023
e6d0381
use subtesting
ArthurZucker Oct 11, 2023
112e4b1
update
ArthurZucker Oct 11, 2023
f794a91
Update .circleci/create_circleci_config.py
ArthurZucker Oct 11, 2023
65aa232
updates
ArthurZucker Oct 11, 2023
8ea095b
Merge branch 'fix-main' of github.com:ArthurZucker/transformers into …
ArthurZucker Oct 11, 2023
efc5e7b
fixup
ArthurZucker Oct 11, 2023
aa569b7
nits
ArthurZucker Oct 11, 2023
5ad55f3
replace typo
ArthurZucker Oct 11, 2023
1c22269
fix the test
ArthurZucker Oct 11, 2023
3b93653
nits
ArthurZucker Oct 11, 2023
a2e977a
Merge branch 'main' of github.com:huggingface/transformers into fix-main
ArthurZucker Oct 11, 2023
1acf2dd
update
ArthurZucker Oct 11, 2023
2dde542
None max dif pls
ArthurZucker Oct 11, 2023
9ebf76e
a partial fix
ArthurZucker Oct 11, 2023
6d2c00e
had to revert one thing
ArthurZucker Oct 11, 2023
e4bcb5e
test the fast
ArthurZucker Oct 11, 2023
3d4bffd
updates
ArthurZucker Oct 11, 2023
8bcb345
fixup
ArthurZucker Oct 11, 2023
d9e5fad
and more nits
ArthurZucker Oct 11, 2023
fc34148
more fixes
ArthurZucker Oct 12, 2023
8389094
update
ArthurZucker Oct 12, 2023
78f1ac4
Oupsy :eye:
ArthurZucker Oct 12, 2023
62eb816
Merge branch 'main' of github.com:huggingface/transformers into fix-main
ArthurZucker Oct 12, 2023
5c1ae9c
nits
ArthurZucker Oct 12, 2023
df8ab6f
fix marian
ArthurZucker Oct 12, 2023
677fcb2
on our way to heaven
ArthurZucker Oct 12, 2023
5a3407e
Update src/transformers/models/t5/tokenization_t5.py
ArthurZucker Oct 12, 2023
856a43d
fixup
ArthurZucker Oct 12, 2023
a3cb498
Update src/transformers/tokenization_utils_fast.py
ArthurZucker Oct 12, 2023
62cf2d0
Update src/transformers/tokenization_utils_base.py
ArthurZucker Oct 12, 2023
fe8bba0
fix phobert
ArthurZucker Oct 13, 2023
be68fc2
skip some things, test more
ArthurZucker Oct 13, 2023
814d978
nits
ArthurZucker Oct 13, 2023
f969713
fixup
ArthurZucker Oct 13, 2023
56b0619
fix deberta
ArthurZucker Oct 13, 2023
f2a5447
update
ArthurZucker Oct 13, 2023
5d7bdab
update
ArthurZucker Oct 13, 2023
49dd8b2
more updates
ArthurZucker Oct 13, 2023
3a03c77
skip one test
ArthurZucker Oct 13, 2023
707a688
more updates
ArthurZucker Oct 13, 2023
bbfc382
fix camembert
ArthurZucker Oct 13, 2023
b6b8aed
can't test this one
ArthurZucker Oct 13, 2023
dac7b89
more good fixes
ArthurZucker Oct 14, 2023
b4ca44e
kind of a major update
ArthurZucker Oct 14, 2023
5245825
fixup
ArthurZucker Oct 14, 2023
0724ebf
more fixups
ArthurZucker Oct 14, 2023
066854a
fix pegasus and mpnet
ArthurZucker Oct 15, 2023
f646ab8
remove skipped tests
ArthurZucker Oct 15, 2023
53e2390
fix phoneme tokenizer if self.verbose
ArthurZucker Oct 15, 2023
e0a967f
fix individual models
ArthurZucker Oct 15, 2023
a353871
update common tests
ArthurZucker Oct 15, 2023
fbc4c4f
update testing files
ArthurZucker Oct 15, 2023
64a6bc4
all over again
ArthurZucker Oct 15, 2023
4219b32
nits
ArthurZucker Oct 15, 2023
48b937a
skip test for markup lm
ArthurZucker Oct 15, 2023
d1a4537
fixups
ArthurZucker Oct 15, 2023
60173aa
fix order of addition in fast by sorting the added tokens decoder
ArthurZucker Oct 16, 2023
8402602
proper defaults for deberta
ArthurZucker Oct 16, 2023
d782bbd
correct default for fnet
ArthurZucker Oct 16, 2023
05ab2c2
nits on add tokens, string initialized to special if special
ArthurZucker Oct 16, 2023
bd6c5a5
skip irrelevant herbert tests
ArthurZucker Oct 16, 2023
8a267d3
main fixes
ArthurZucker Oct 16, 2023
7bda15e
update test added_tokens_serialization
ArthurZucker Oct 16, 2023
ac75cd3
the fix for bart like models and class instanciating
ArthurZucker Oct 16, 2023
640885e
update bart
ArthurZucker Oct 16, 2023
45801c0
nit!
ArthurZucker Oct 16, 2023
14c576f
update idefix test
ArthurZucker Oct 16, 2023
2a78cf9
fix whisper!
ArthurZucker Oct 16, 2023
6f28584
some fixup
ArthurZucker Oct 16, 2023
c12656b
fixups
ArthurZucker Oct 16, 2023
8f8c3f1
revert some of the wrong chanegs
ArthurZucker Oct 16, 2023
de51ef7
fixup
ArthurZucker Oct 16, 2023
0f0a3fe
fixup
ArthurZucker Oct 16, 2023
4b693b9
Merge branch 'main' of github.com:huggingface/transformers into fix-main
ArthurZucker Oct 18, 2023
4b82043
skip marian
ArthurZucker Oct 18, 2023
340df3d
skip the correct tests
ArthurZucker Oct 18, 2023
f9fb43d
skip for tf and flax as well
ArthurZucker Oct 18, 2023
kind of a major update
- separate what is only done in fast into the fast init, and refactor
- add_token(AddedToken(..., special=True)): the special flag is currently ignored in fast
- better loading
ArthurZucker committed Oct 14, 2023
commit b4ca44e783bd9ec473c5bd3b017fe0d5d579a9dd
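A rough sketch of the behaviour these notes describe, using a fast tokenizer (the checkpoint and token string are illustrative only, not taken from the diff):

```python
from transformers import AutoTokenizer, AddedToken

# Any fast tokenizer will do; "t5-small" is just an example checkpoint.
tok = AutoTokenizer.from_pretrained("t5-small")

# After this change the `special=True` flag on an AddedToken is meant to be
# taken into account by the fast tokenizer rather than silently dropped.
tok.add_tokens([AddedToken("<my_marker>", normalized=False, special=True)], special_tokens=True)

# The token should now be registered as special and survive a
# save_pretrained() / from_pretrained() round trip.
print(tok.added_tokens_decoder)
```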
6 changes: 3 additions & 3 deletions src/transformers/convert_slow_tokenizer.py
@@ -1168,9 +1168,9 @@ def tokenizer(self, proto):
)
tokenizer.add_special_tokens(
[
AddedToken("<unk>"),
AddedToken("<s>"),
AddedToken("</s>"),
AddedToken("<unk>", normalized=False, special=True),
AddedToken("<s>", normalized=False, special=True),
AddedToken("</s>", normalized=False, special=True),
]
)
else:
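For context, a minimal sketch of what these two flags mean on the backend `tokenizers` objects (the backend model and the availability of the `special` keyword are assumptions tied to the tokenizers version this diff targets):

```python
from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

backend = Tokenizer(BPE())
# normalized=False: match the token against the raw, un-normalized input text.
# special=True: register the token as special, so it is never split and can be
# skipped with decode(..., skip_special_tokens=True).
backend.add_special_tokens([
    AddedToken("<unk>", normalized=False, special=True),
    AddedToken("<s>", normalized=False, special=True),
    AddedToken("</s>", normalized=False, special=True),
])
```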
@@ -146,7 +146,7 @@ def __init__(
self._added_tokens_decoder = {
0: AddedToken("<s>NOTUSED", special = True),
1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
2: AddedToken("</s>NOTUSED"),
2: AddedToken("</s>NOTUSED", special=True),
3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
4: AddedToken("<unk>NOTUSED", special = True),
}
@@ -119,7 +119,7 @@ def __init__(
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED","<unk>NOTUSED"],
**kwargs,
):
# Mask token behave like a normal word, i.e. include the space before it
2 changes: 1 addition & 1 deletion src/transformers/models/mbart50/tokenization_mbart50.py
@@ -132,7 +132,7 @@ def __init__(

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
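The added `or []` guards the case where a caller explicitly passes `additional_special_tokens=None`: `dict.get()` only falls back to its default when the key is missing, not when it is present with the value `None`. A minimal illustration:

```python
kwargs = {"additional_special_tokens": None}

tokens = kwargs.get("additional_special_tokens", [])        # -> None
# tokens += ["ar_AR"]                                        # would raise TypeError

tokens = kwargs.get("additional_special_tokens", []) or []   # -> []
tokens += ["ar_AR"]                                           # works as expected
```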
@@ -127,7 +127,7 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
2 changes: 1 addition & 1 deletion src/transformers/models/xglm/tokenization_xglm.py
@@ -127,7 +127,7 @@ def __init__(
self.num_madeup_words = 7
madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]

kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
word for word in madeup_words if word not in kwargs["additional_special_tokens"]
]
2 changes: 1 addition & 1 deletion src/transformers/models/xglm/tokenization_xglm_fast.py
@@ -116,7 +116,7 @@ def __init__(
self.num_madeup_words = 7
madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]

kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
word for word in madeup_words if word not in kwargs["additional_special_tokens"]
]
2 changes: 1 addition & 1 deletion src/transformers/tokenization_utils.py
@@ -364,7 +364,7 @@ def __init__(self, **kwargs):

# 4. If some of the special tokens are not part of the vocab, we add them, at the end.
# the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following `tokenizers`
self._add_tokens([token for token in self.all_special_tokens_extended if str(token) not in self._added_tokens_encoder], special_tokens=True)
self._add_tokens([token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder], special_tokens=True)

self._decode_use_source_tokenizer = False

104 changes: 23 additions & 81 deletions src/transformers/tokenization_utils_base.py
@@ -831,7 +831,7 @@ class SpecialTokensMixin:
"additional_special_tokens",
]

def __init__(self, verbose=True, **kwargs):
def __init__(self, verbose=False, **kwargs):
self._bos_token = None
self._eos_token = None
self._unk_token = None
@@ -1159,12 +1159,7 @@ def mask_token(self, value):

@additional_special_tokens.setter
def additional_special_tokens(self, value):
# just sets the easy to acces list of strings. Should only be used at init time. Expected to not update the
# added_tokens_decoder.
self._additional_special_tokens = [] if value is not None else None
if value is not None:
for token in value:
self._additional_special_tokens.append(token)
self._additional_special_tokens = value if value is not None else None

@property
def bos_token_id(self) -> Optional[int]:
@@ -2170,14 +2165,11 @@ def _from_pretrained(
init_kwargs["__slow_tokenizer"] = slow_tokenizer
init_kwargs["name_or_path"] = pretrained_model_name_or_path

# Handle tokenizer serialization of added and special tokens
additional_special_tokens = init_kwargs.pop("additional_special_tokens", None) or []
#### Handle tokenizer serialization of added and special tokens
added_tokens_decoder: Dict[int, AddedToken] = {}
added_tokens_map: Dict[str, AddedToken] = {}

# if a fast did not save the tokenizer_config.json
legacy_saved = "added_tokens_decoder" not in init_kwargs
if not legacy_saved:
# if we have info on the slow added tokens
if "added_tokens_decoder" in init_kwargs:
for idx, token in init_kwargs["added_tokens_decoder"].items():
if isinstance(token, dict):
token = AddedToken(**token)
@@ -2186,23 +2178,8 @@
added_tokens_map[str(token)] = token
else:
raise ValueError(
f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or and AddedToken instance"
)
# Either this or we save the special tokens as added tokens to directly load them instead of having defaults
# Make sure the default string init_kwargs are overwritten! Simulates passing kwargs :) Everything is added_token. But not additional special tokens because class does no overwrite?
# If we save the eos etc as added tokens we don't have this, but this is fool proof
for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
if added_tokens_map != {}:
if key == "additional_special_tokens":
init_kwargs[key] = [
added_tokens_map[token] if token in added_tokens_map.keys() else token
for token in init_kwargs[key]
]
if isinstance(init_kwargs[key], dict):
init_kwargs[key] = cls.convert_added_tokens(init_kwargs[key], save=False)
elif init_kwargs[key] in added_tokens_map.keys():
init_kwargs[key] = added_tokens_map[init_kwargs.get(key)]

else:
# begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
if special_tokens_map_file is not None:
@@ -2214,15 +2191,15 @@
# We keep this new value and ignore the one stored in the special_tokens_map_file
continue
if isinstance(value, dict):
value = AddedToken(**value)
init_kwargs[key] = value
value = AddedToken(**value,special=True)
elif key == "additional_special_tokens" and isinstance(value, list):
additional_special_tokens = init_kwargs.pop("additional_special_tokens", []) or []
for token in value:
token = AddedToken(**token, special=True) if isinstance(token, dict) else token
if token not in additional_special_tokens:
additional_special_tokens.append(token)
else:
init_kwargs[key] = value
value = additional_special_tokens
init_kwargs[key] = value

# allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
# if `added_tokens_decoder` not in `tokenizer_config.json` and `added_tokens.json` is `None`
@@ -2233,28 +2210,27 @@
added_tokens = tokenizer_file_handle.pop("added_tokens")
for serialized_tokens in added_tokens:
idx = serialized_tokens.pop("id")
# for legacy purpose, we ignore whether or not these tokens are special. TODO not sure anymore
# serialized_tokens.pop("special")
added_tokens_decoder[idx] = AddedToken(**serialized_tokens)
added_tokens_map[str(token)] = token

# slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`.
# this is for legacy purpose
if added_tokens_file is not None:
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
added_tok_encoder = json.load(added_tokens_handle)
for str_token, index in added_tok_encoder.items():
if index not in added_tokens_decoder:
if index not in added_tokens_decoder and str_token not in added_tokens_map:
added_tokens_decoder[index] = AddedToken(str_token, rstrip = True, lstrip = True, normalized = True)
added_tokens_map[str(token)] = token
# end legacy

# slow -> fast, non-legacy: we need to make sure the `added_tokens_decoder` is used to add tokens if the `fast` was not properly saved!
# thus we delay adding special tokens in the init using `slow_to_fast` flag.
if added_tokens_decoder is not {} and "Fast" in cls.__name__:
init_kwargs["slow_to_fast"] = True
if len(additional_special_tokens) > 0:
init_kwargs["additional_special_tokens"] = additional_special_tokens
init_kwargs["added_tokens_decoder"] = added_tokens_decoder
# Passing AddedTokens only to the class to prevent it from changing the string tokens
for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
if added_tokens_map != {} and init_kwargs[key] is not None:
if key != "additional_special_tokens":
init_kwargs[key] = added_tokens_map.get(init_kwargs[key], init_kwargs[key])

init_kwargs["added_tokens_decoder"] = added_tokens_decoder
# convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
# Instantiate the tokenizer.
@@ -2266,41 +2242,7 @@
"Please check that the provided vocabulary is accessible and not corrupted."
)

# This is slow... if the added tokens decoder was not used (we are fast) use it
if init_kwargs.get("slow_to_fast", False):
# for efficiency should we only add the ones that are not present (str) ?
tokens_to_add = list(added_tokens_decoder.values())
if len(tokens_to_add) > 1:
# super hack: if a token.special is set, tokenizer ignores it for now so FIXME:
if "Fast" in cls.__name__:
# Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
# individual tokens would repeatedly rebuild a trie, which can be slow.
is_last_special = None
tokens = []
for token in tokens_to_add:
is_special = (
token.special
if isinstance(token, AddedToken)
else token in tokenizer.additional_special_tokens
)
if is_last_special is None or is_last_special == is_special:
tokens.append(token)
else:
tokenizer.add_tokens(tokens, special_tokens=is_last_special)
tokens = [token]
is_last_special = is_special

if tokens:
tokenizer.add_tokens(tokens, special_tokens=is_last_special)

# allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
# uses the information stored in `added_tokens_decoder`.
if init_kwargs.get("slow_to_fast", False):
# this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
encoder = tokenizer.added_tokens_encoder
tokenizer.add_tokens([ token for token in tokenizer.all_special_tokens_extended if token not in encoder], special_tokens=True)

if len(added_tokens_decoder) > 0:
if added_tokens_decoder!={} and max(list(added_tokens_decoder.keys())[-1],0) > tokenizer.vocab_size:
logger.warning_advice(
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
" fine-tuned or trained."
@@ -2410,9 +2352,6 @@ def save_pretrained(

tokenizer_config = copy.deepcopy(self.init_kwargs)

# Let's make sure we properly save the special AddedToken.
tokenizer_config.update(self.special_tokens_map_extended)

# Let's save the init kwargs
target_keys = set(self.init_kwargs.keys())
# Let's save the special tokens map (only the strings)
@@ -2422,6 +2361,9 @@
if hasattr(self, k):
tokenizer_config[k] = getattr(self, k)

# Let's make sure we properly save the special tokens.
tokenizer_config.update(self.special_tokens_map)

if self.chat_template is not None:
tokenizer_config["chat_template"] = self.chat_template

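The new loading path above is driven by an `added_tokens_decoder` entry in `tokenizer_config.json`. As a rough sketch (the index and token content below are made up for illustration), each entry is a fully serialized `AddedToken`, keyed by its vocabulary id:

```python
# Approximate shape of the new field in tokenizer_config.json; _from_pretrained()
# rebuilds an AddedToken from each dict so special tokens keep their exact flags.
tokenizer_config_excerpt = {
    "added_tokens_decoder": {
        "32000": {
            "content": "<extra_token>",
            "lstrip": False,
            "rstrip": False,
            "single_word": False,
            "normalized": False,
            "special": True,
        }
    }
}
```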
27 changes: 24 additions & 3 deletions src/transformers/tokenization_utils_fast.py
@@ -96,7 +96,7 @@ def __init__(self, *args, **kwargs):
slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
from_slow = kwargs.pop("from_slow", False)
slow_to_fast = kwargs.pop("slow_to_fast", False)
added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})

if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
raise ValueError(
@@ -156,8 +156,29 @@ def __init__(self, *args, **kwargs):
super().__init__(**kwargs)

# We add the additional tokens that are not part of the vocab
if not slow_to_fast:
self._add_tokens([token for token in self.all_special_tokens_extended if str(token) not in self.added_tokens_encoder], special_tokens=True)
# allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
# uses the information stored in `added_tokens_decoder`.
# this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
tokens_to_add = [token for token in added_tokens_decoder.values() if token not in self.added_tokens_decoder]
encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
tokens_to_add += [token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add]
if len(tokens_to_add) > 0:
# super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
# Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
# individual tokens would repeatedly rebuild a trie, which can be slow.
is_last_special = None
tokens = []
special_tokens = self.all_special_tokens
for token in tokens_to_add:
is_special = (token.special or str(token) in special_tokens) if isinstance(token, AddedToken) else str(token) in special_tokens
if is_last_special is None or is_last_special == is_special:
tokens.append(token)
else:
self._add_tokens(tokens, special_tokens=is_last_special)
tokens = [token]
is_last_special = is_special
if tokens:
self._add_tokens(tokens, special_tokens=is_last_special)

@property
def is_fast(self) -> bool:
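The loop added above batches consecutive tokens that share the same special/non-special status, so `_add_tokens` (and the underlying trie rebuild in `tokenizers`) runs once per run instead of once per token. A standalone sketch of the same grouping idea, not the actual implementation:

```python
from itertools import groupby

def add_in_batches(tokenizer, tokens_to_add, special_tokens):
    """Add tokens in runs grouped by their special status, mirroring the
    batching done in PreTrainedTokenizerFast.__init__ above."""
    def is_special(token):
        # AddedToken instances carry a `special` flag; plain strings are
        # compared against the tokenizer's known special tokens.
        return getattr(token, "special", False) or str(token) in special_tokens

    for special, run in groupby(tokens_to_add, key=is_special):
        tokenizer.add_tokens(list(run), special_tokens=special)
```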
19 changes: 19 additions & 0 deletions tests/models/pegasus/test_tokenization_pegasus.py
@@ -138,6 +138,16 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
pass


@unittest.skip("We have to use from_slow")
def test_added_tokens_serialization(self):
pass

@unittest.skip("We have to use from_slow")
def test_added_tokens_serialization(self):
pass



@require_sentencepiece
@require_tokenizers
class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -219,3 +229,12 @@ def test_equivalence_to_orig_tokenizer(self):
token_ids,
[182, 117, 142, 587, 4211, 120, 117, 263, 112, 804, 109, 856, 25016, 3137, 464, 109, 26955, 3137, 1],
)


@unittest.skip("We have to use from_slow")
def test_added_tokens_serialization(self):
pass

@unittest.skip("We have to use from_slow")
def test_added_tokens_serialization(self):
pass
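For reference, the `from_slow` escape hatch mentioned in the skip reason forces the fast tokenizer to be rebuilt from the slow sentencepiece one instead of the saved `tokenizer.json` (the checkpoint name below is illustrative):

```python
from transformers import AutoTokenizer

# Rebuild the fast Pegasus tokenizer by converting the slow one on the fly.
tok = AutoTokenizer.from_pretrained("google/pegasus-xsum", from_slow=True)
```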
4 changes: 2 additions & 2 deletions tests/models/t5/test_tokenization_t5.py
@@ -145,10 +145,10 @@ def t5_base_tokenizer_fast(self):
return T5TokenizerFast.from_pretrained("t5-base")

def get_tokenizer(self, **kwargs) -> T5Tokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer: