Merged

36 commits (changes from 1 commit shown below)
c99c130
fix EVERYTHING
ArthurZucker Aug 1, 2023
acf31e2
more fixes
ArthurZucker Aug 1, 2023
7305aff
⚗️⚗️ Tokenizer magic ⚗️⚗️
ArthurZucker Aug 1, 2023
01b8347
wrong value but test passes for the TODO
ArthurZucker Aug 1, 2023
b9ddbbb
update
ArthurZucker Aug 1, 2023
83af718
updat
ArthurZucker Aug 1, 2023
0babe38
safe protobuf import?
ArthurZucker Aug 1, 2023
0fdf51e
style
ArthurZucker Aug 1, 2023
2d197a1
non gated repo
ArthurZucker Aug 1, 2023
e9c7a72
update
ArthurZucker Aug 1, 2023
94964cd
fixup
ArthurZucker Aug 1, 2023
cc9ddcf
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker Aug 1, 2023
45cae43
Update src/transformers/models/llama/tokenization_llama.py
ArthurZucker Aug 2, 2023
53557a9
Update src/transformers/models/llama/tokenization_llama.py
ArthurZucker Aug 2, 2023
e049d11
Update tests/models/t5/test_tokenization_t5.py
ArthurZucker Aug 2, 2023
b64b2d2
nits
ArthurZucker Aug 2, 2023
cb95361
fix t5 too
ArthurZucker Aug 2, 2023
a86bf78
use assert equal
ArthurZucker Aug 2, 2023
913cd1d
fix llama decoding
ArthurZucker Aug 2, 2023
ef28574
nits on t5
ArthurZucker Aug 2, 2023
4f65261
fixup
ArthurZucker Aug 2, 2023
ad7f8c6
only remove the prefix space, not other spaces
ArthurZucker Aug 2, 2023
76d00cc
more deconding tests and more todos
ArthurZucker Aug 2, 2023
9cb92b6
fix CI as well
ArthurZucker Aug 2, 2023
204153f
fixup
ArthurZucker Aug 2, 2023
9f37103
skip failing test on CI (its tf its ok)
ArthurZucker Aug 2, 2023
700ee64
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker Aug 3, 2023
4b5315b
skip test_subword_regularization_tokenizer that is also crashing on t…
ArthurZucker Aug 3, 2023
a4ed16f
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker Aug 16, 2023
e7906c2
update llama
ArthurZucker Aug 17, 2023
ad33c97
revert good fixes
ArthurZucker Aug 17, 2023
f890882
fixup
ArthurZucker Aug 17, 2023
b7f98bc
empty
ArthurZucker Aug 17, 2023
bb79083
explain why we need to encode with an additional token
ArthurZucker Aug 17, 2023
3f8ac96
better warning?
ArthurZucker Aug 17, 2023
4249986
nits
ArthurZucker Aug 17, 2023

Viewing commit: updat
ArthurZucker committed Aug 1, 2023
commit 83af7184e3b4b234650a2c0e5a5b28b4475a4dd9
1 change: 1 addition & 0 deletions src/transformers/models/llama/tokenization_llama.py
@@ -189,6 +189,7 @@ def tokenize(self, text, **kwargs) -> List[str]:
         # Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
         # the beginning of the text
         if not self.legacy:
+            # replacing " " by SPIECE_UNDERLINE prevents any form of stripping...
             text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
         return super().tokenize(text, **kwargs)

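For readers skimming the diff, a minimal standalone sketch of what this non-legacy path does (SPIECE_UNDERLINE is "▁" in tokenization_llama.py; the preprocess helper below is hypothetical, written only to illustrate the replace-then-prepend step, not the real method):

SPIECE_UNDERLINE = "▁"  # SentencePiece word-boundary marker

def preprocess(text: str, legacy: bool = False) -> str:
    # Mirrors the diff above: turn any literal "▁" in the input back into a
    # space, then prepend a single "▁" so SentencePiece cannot strip the
    # leading space of the first word.
    if not legacy:
        text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
    return text

print(preprocess("Hello world"))  # ▁Hello world
print(preprocess("▁He is"))       # ▁ He is  (the literal "▁" became a space)
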
8 changes: 5 additions & 3 deletions tests/models/t5/test_tokenization_t5.py
ArthurZucker (Collaborator, Author) commented:
The previous test values were not very good; with this update they make more sense.

@@ -448,9 +448,11 @@ def test_remove_extra_whitespaces(self):
         input_ids = self.tokenizer.encode("▁He is not<extra_id_0> ▁He")
         # TODO another example of lstrip
         self.assertEqual(input_ids, [156, 46, 44, 1000, 262, 15, 2])
 
         tokens = self.tokenizer.tokenize("▁He is not<extra_id_0> ▁He")
-        self.assertEqual(tokens, ['▁He', '▁is', '▁not', '<extra_id_0>', 'H', 'e'])  # spaces are eaten by spm + our strip
+        self.assertEqual(
+            tokens, ["▁He", "▁is", "▁not", "<extra_id_0>", "H", "e"]
+        )  # spaces are eaten by spm + our strip
         # make sure that the output after the extra id is the same as if
         # extra_id was not there
         input_ids = self.tokenizer.encode("▁He is not ▁He")
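
To make the comment's invariant concrete, a hedged sketch (assuming tok is the same slow T5Tokenizer the test builds, where id 1000 is <extra_id_0>; this illustration is not part of the PR and the ids are vocabulary-dependent):

# The ids for the first call are the ones asserted in the test above.
with_extra = tok.encode("▁He is not<extra_id_0> ▁He")  # [156, 46, 44, 1000, 262, 15, 2]
without = tok.encode("▁He is not ▁He")
# Presumably, dropping <extra_id_0> from the encoded sequence should leave
# exactly the ids produced when the extra id was never in the text.
assert [i for i in with_extra if i != 1000] == without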
@@ -483,7 +485,7 @@ def test_special_tokens_strip(self):
         self.assertEqual(input_ids, [284, 1000, 262, 15, 2])
         # spaces are eaten by rstrip / lstrip, so this is expected. Don't strip otherwise you break
         tokens = self.tokenizer.tokenize("No <extra_id_0> He")
-        self.assertEqual(tokens, ['▁No', '<extra_id_0>', 'H', 'e'])
+        self.assertEqual(tokens, ["▁No", "<extra_id_0>", "H", "e"])
 
         # Make sure this does not happen if we don't strip
         tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=0)
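
Finally, a runnable sketch of the strip behaviour this second hunk pins down (assumptions: sample.model is a stand-in path for the test's SAMPLE_VOCAB sentencepiece file, and the exact splits depend on that vocab):

from transformers import T5Tokenizer

# With extra_ids=1 the tokenizer registers <extra_id_0>; its lstrip/rstrip
# eats the surrounding spaces, so "He" loses its "▁" word boundary and
# tokenizes as ["H", "e"] instead of ["▁He"].
tok = T5Tokenizer("sample.model", extra_ids=1)
print(tok.tokenize("No <extra_id_0> He"))  # ['▁No', '<extra_id_0>', 'H', 'e']

# With extra_ids=0 no <extra_id_0> special token exists, so nothing is
# stripped and the string is tokenized as plain text.
tok_plain = T5Tokenizer("sample.model", extra_ids=0)
print(tok_plain.tokenize("No <extra_id_0> He"))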