shorten param name, add st verification by type
staviq committed Oct 10, 2023
commit fc634d87a8904b0844fbb71eb045d0f26d8bfd94
10 changes: 5 additions & 5 deletions common/common.cpp
@@ -863,22 +863,22 @@ std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_bos,
bool allow_special_tokens) {
return llama_tokenize(llama_get_model(ctx), text, add_bos, allow_special_tokens);
bool special) {
return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
}

std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos,
bool allow_special_tokens) {
bool special) {
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, allow_special_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, allow_special_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
4 changes: 2 additions & 2 deletions common/common.h
@@ -152,13 +152,13 @@ std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_bos,
bool allow_special_tokens = false);
bool special = false);

std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos,
bool allow_special_tokens = false);
bool special = false);

// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`
67 changes: 54 additions & 13 deletions llama.cpp
@@ -2066,7 +2066,7 @@ static void llm_load_hparams(
}

// TODO: This should probably be in llama.h
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool allow_special_tokens = false);
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

static void llm_load_vocab(
@@ -2192,15 +2192,30 @@ static void llm_load_vocab(
// are special tokens.
// From testing, this appears to corelate 1:1 with special tokens.
//

// Counting special tokens and verifying in only one direction
// is sufficient to detect difference in those two sets.
//
uint32_t special_tokens_count_by_type = 0;
uint32_t special_tokens_count_from_verification = 0;
bool special_tokens_definition_mismatch = false;

for (const auto & t: vocab.token_to_id)
Member

Feels like we have a redundancy here. We have token_data.type, which is supposed to tell us if a token is special or not. In which cases would this not work?

I guess we can have this piece of code here as a sanity check that the special tokens that we have read from the vocabulary are indeed special, but ideally AFAIU we shouldn't need this code in order to function correctly, correct?

Contributor Author

Yes. My approach is meant to be temporary (or a fallback solution), until I know for sure special tokens will always be marked as such in the vocab.

@goerch wasn't certain about token types in the BPE PR, so I opted for finding special tokens manually for now.

Contributor

@goerch goerch Oct 9, 2023

Shouldn't we get rid of #3502 first? This looks pretty basic (and terrible :) to me.

Contributor Author

Shouldn't we get rid of #3502 first? This looks pretty basic (and terrible :) to me.

I'm open to critique, but you have to clarify what you mean by "this" and "terrible" :), otherwise I'm not sure how to proceed here.

Member

Yes. My approach is meant to be temporary (or a fallback solution), until I know for sure special tokens will always be marked as such in the vocab.

Ok I see. We can probably combine both the tokens marked as special and those that are unmatchable by other tokens. And probably print a warning if there is a mismatch between the 2 sets.

Let's see if @goerch has anything else to add, and then I would like to make a pass over the implementation to update the coding style to match better. After that we can merge.

Contributor

@goerch goerch Oct 10, 2023

Just to be sure about our nomenclature, do you mean special tokens == added tokens? I'm mostly leaning towards the nomenclature used in HF's tokenizer.json which contains added_tokens with the is_special attribute. From @jploski I learned that these added tokens are probably very similar to sentence_piece USER_DEFINED tokens.

And one more question: do we agree that vocab.json, merges.txt, added_tokens.json and special_tokens_map.json are probably older HF (or GPT2) serialization formats and we should find all this information in tokenizer.json for newer models (paging @apage43 too because he knows a lot about this stuff)? W.r.t. these serialization formats we also seem to have some reports indicating that we don't support both of them equally well.

Edit: just adding a hunch: is it possible that tokenizer.json is the serialization format invented for the fast HF tokenizers?

Contributor Author

@staviq staviq Oct 10, 2023

do you mean special tokens == added tokens?

Oh my, you better grab a chair because you are in for an adventure with this question :)

A really condensed version of what I learned so far:

I couldn't find any clear definition or explanation, but from what I found, there seems to be only one way all the pieces of this puzzle fall together to form a clear picture.

A vocab naturally forms a tree-like structure (several trees actually, there is no common root).

The tokenizer does its thing by grabbing small portions of the input text and trying to steal neighboring portions to form a valid token, rinse and repeat, until nothing can be borrowed from either the left or the right to merge with and form a valid token.

So basically, the tokenizer climbs that token tree from single bytes up to full tokens, stepping only on valid branches (tokens).

Applying this idea backwards, by checking the entire vocab token by token, trying to split its text representation in two and checking if it still forms two valid tokens, you can clearly see that some of the tokens in the vocab are not part of that tree family and will never be matched by the tokenizer.

This happens to match perfectly with normal tokens being marked as LLAMA_TOKEN_TYPE_NORMAL in the vocab, and tokens that are not part of the token tree family being marked as some type other than LLAMA_TOKEN_TYPE_NORMAL.

Therefore, I'm using the term special token to refer to tokens which are "hidden" from the user and unmatchable by the tokenizer.

From the practical point of view, that subset of "special" tokens contains exactly what one would expect: <s>, </s>, <unk>, and in the case of Mistral OpenOrca <|im_start|>, <|im_end|>, AKA tokens which are used to control the flow of inference. (EDIT: plus byte tokens <0xXX>)

And so from the point of view of actual inference, the tokenizer has to be extended the same way, independently of the actual token type, because only the distinction between normal and non-normal tokens matters at that point.

Additional distinction between USER_DEFINED, CONTROL, etc. can and has to be done the same way, with a tokenizer "pre-matcher", at which point per-token-type decisions or actions are trivial to implement if needed.
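
To make the splitting check concrete, here is a minimal sketch of the idea (illustrative only, not the PR's exact code: it assumes the vocab is just a std::map<std::string, int> from token text to id and skips the multi-byte UTF-8 handling that the real implementation below does take into account):

    #include <map>
    #include <string>
    #include <vector>

    // Collect ids of tokens that the tokenizer can never produce by merging:
    // if no split point yields two valid halves, the token is unreachable ("special").
    std::vector<int> find_special_candidates(const std::map<std::string, int> & vocab) {
        std::vector<int> special;
        for (const auto & entry : vocab) {
            const std::string & text = entry.first;
            if (text.size() < 2) {
                continue; // single characters are always reachable
            }
            bool tokenizable = false;
            for (size_t i = 1; i < text.size(); ++i) {
                if (vocab.count(text.substr(0, i)) && vocab.count(text.substr(i))) {
                    tokenizable = true; // both halves are valid tokens
                    break;
                }
            }
            if (!tokenizable) {
                special.push_back(entry.second);
            }
        }
        return special;
    }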

Contributor

@goerch goerch Oct 10, 2023

Thanks for taking the time! My mental image is slightly different: as I understand it, the initial training of the tokenizer leads to an initial vocabulary, and later model refinement tries to extend this basic vocabulary with added tokens without retraining the tokenizer on the original dataset. These added tokens should be intercepted before the original tokenizer interpretation. In my mental image, special tokens somehow describe the effect these added tokens have on detokenization, i.e. whether they should be displayed or not, for example. But I might be completely off here.

Edit:

<s>,</s>,<unk> and in case of mistral openorca <|im_start|>,<|im_end|>, AKA tokens which are used to control the flow of inference.

Here I think of <s>, </s>, <unk> as original CONTROL tokens and <|im_start|>, <|im_end|> as USER_DEFINED tokens which shouldn't be visible after detokenization.
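
For illustration, a minimal sketch of that "intercept added/special tokens before the core tokenizer" idea (the names special_tokens, core_tokenize and tokenize_with_specials are made up for this example and are not the PR's API; the PR's tokenizer_st_partition() works on a fragment buffer instead):

    #include <functional>
    #include <map>
    #include <string>
    #include <vector>

    // Scan for special-token occurrences first; only the plain-text pieces in
    // between are handed to the core tokenizer.
    std::vector<int> tokenize_with_specials(
            const std::string & text,
            const std::map<std::string, int> & special_tokens,
            const std::function<std::vector<int>(const std::string &)> & core_tokenize) {
        std::vector<int> out;
        size_t pos = 0;
        while (pos < text.size()) {
            // find the earliest occurrence of any special token in the remaining text
            size_t best_pos = std::string::npos;
            const std::pair<const std::string, int> * best = nullptr;
            for (const auto & st : special_tokens) {
                const size_t p = text.find(st.first, pos);
                if (p < best_pos) {
                    best_pos = p;
                    best     = &st;
                }
            }
            if (best == nullptr) {
                break; // no more special tokens
            }
            if (best_pos > pos) {
                const auto part = core_tokenize(text.substr(pos, best_pos - pos));
                out.insert(out.end(), part.begin(), part.end());
            }
            out.push_back(best->second);          // emit the special token id directly
            pos = best_pos + best->first.size();  // continue after it
        }
        if (pos < text.size()) {
            const auto tail = core_tokenize(text.substr(pos));
            out.insert(out.end(), tail.begin(), tail.end());
        }
        return out;
    }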

Contributor Author

I simply do not know if there is any significance to added "normal" tokens (added in fine-tuning, etc.).

From what I understand, in the tokenizer specifically (ignoring the detokenizer), normal tokens added the way you say would simply be tokenized one step further into a longer token instead of being represented by a sequence of smaller tokens.

If such added tokens are not integrated with the tree structure of the vocab, the user would not be able to use them.
If, however, those added tokens are properly coherent with the vocab, the tokenizer would already match them as it is.

Which, to me, sort of seems to defeat the purpose of adding "normal" tokens this way.

I might be wrong about this part, but that still means a "tokenizer preprocessor" of some sort is needed, and this PR fits that role.

Contributor

I simply do not know if there is any significance to added "normal" tokens (added in fine-tuning, etc.).

I believe they have to be processed before the core tokenizer routine in something like prefix_match (the only exception being the already-defined core tokenizer CONTROL tokens). MPT's added tokens, for example (from tokenizer.json):

    {
      "id": 0,
      "content": "<|endoftext|>", <-- core, i.e CONTROL
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "<|padding|>", <-- unsure about this because it is a valid GPT2 token AFAIU
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 50254,
      "content": "                        ",  <-- extension, i.e USER_DEFINED
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": true,
      "special": false
    },
    [...]

{
const auto & token = t.first;
const auto & id = t.second;

// Count all non-normal tokens in the vocab while iterating
if( vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL )
special_tokens_count_by_type++;

// Skip single character tokens
if( token.length() > 1 )
{
bool is_tokenizable = false;

// Split token string representation in two, in all possible ways
// and check if both halves can be matched to a valid token
for (unsigned i = 1; i < token.length();)
{
const auto left = token.substr(0, i);
@@ -2211,8 +2226,6 @@ static void llm_load_vocab(

if( utf == 1 )
{
//fprintf(stderr, "BSTC . '%s' '%s' '%s'\n", token.c_str(), left.c_str(), right.c_str());

if (vocab.token_to_id.find( left ) != vocab.token_to_id.end() &&
vocab.token_to_id.find( right ) != vocab.token_to_id.end() )
{
@@ -2224,31 +2237,57 @@
}
else
{
// fprintf(stderr, "BSTC SKIP '%s' '%s' '%s'\n", token.c_str(), left.c_str(), right.c_str());
// skip over the rest of multibyte utf sequence
i += utf - 1;
}
}

if (!is_tokenizable)
{
// it's faster to re-filter them here, since there is way less candidates now
// Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
// it's faster to re-filter them here, since there are way less candidates now

// Calculate a total "utf" length of a token string representation
size_t utf8_str_len = 0;
for (unsigned i = 0; i < token.length();)
{
utf8_str_len++;
i += utf8_len( token.at(i) );
}

// And skip the ones which are one character
if (utf8_str_len > 1)
{
//fprintf(stderr, "BSTC SPECIAL '%s' '%d' ('%ld')\n", token.c_str(), id, utf8_str_len);
// At this point what we have left are special tokens only

vocab.special_tokens_cache[token] = id;

// Count manually found special tokens
special_tokens_count_from_verification ++;

// If this manually found special token is not marked as such, flag a mismatch
if( vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL )
special_tokens_definition_mismatch = true;
}
}
}
}

if( special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type )
{
fprintf(stderr, "%s: WARNING: Mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
__func__,
special_tokens_count_from_verification, vocab.id_to_token.size(),
special_tokens_count_by_type, vocab.id_to_token.size()
);
}
else
{
fprintf(stderr, "%s: Special tokens definition check successful ( %u/%zu ).\n",
__func__,
special_tokens_count_from_verification, vocab.id_to_token.size()
);
}
}
}

@@ -5777,7 +5816,7 @@ struct fragment_buffer_variant{
std::string raw_text;
};

void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
{
// for each special token
for( const auto & st: vocab.special_tokens_cache )
@@ -5834,7 +5873,8 @@ void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragmen
}
else
{
auto prev = std::prev( buffer.begin(), -(source-1) );
//auto prev = std::prev( buffer.begin(), -(source-1) );
auto prev = std::next( buffer.begin(), (source-1) );
buffer.erase_after(prev);
}
//it = std::prev( it, 1 );
@@ -5850,7 +5890,8 @@ void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragmen
}
else
{
auto prev = std::prev( buffer.begin(), -(source) );
//auto prev = std::prev( buffer.begin(), -(source) );
auto prev = std::next( buffer.begin(), (source) );
buffer.erase_after(prev);
}
//it = std::prev( it, 1 );
@@ -5865,7 +5906,7 @@ void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragmen
}
}

static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool allow_special_tokens) {
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
std::vector<llama_vocab::id> output;

// OG tokenizer behavior:
@@ -5885,7 +5926,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

fragment_buffer.emplace_front( raw_text );

if (allow_special_tokens) {
if (special) {
tokenizer_st_partition( vocab, fragment_buffer );
}

@@ -8843,8 +8884,8 @@ int llama_tokenize(
llama_token * tokens,
int n_max_tokens,
bool add_bos,
bool allow_special_tokens) {
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, allow_special_tokens);
bool special) {
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);

if (n_max_tokens < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
11 changes: 6 additions & 5 deletions llama.h
@@ -511,18 +511,19 @@ extern "C" {
// Tokenization
//

// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
/// @details Convert the provided text into tokens.
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
/// @return Returns the number of tokens on success, no more than n_max_tokens
/// @return Returns a negative number on failure - the number of tokens that would have been returned
/// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
LLAMA_API int llama_tokenize(
const struct llama_model * model,
const char * text,
int text_len,
llama_token * tokens,
int n_max_tokens,
bool add_bos,
bool allow_special_tokens);
bool special);

// Token Id -> Piece.
// Uses the vocabulary in the provided context.
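
For reference, a minimal usage sketch of the updated API (it mirrors the llama_tokenize() helper from common/common.cpp shown above; model loading and error handling are omitted):

    #include <string>
    #include <vector>
    #include "llama.h"

    // Tokenize `text`, allowing special/control tokens to be matched (special = true).
    static std::vector<llama_token> tokenize(const llama_model * model, const std::string & text, bool add_bos) {
        // upper bound on the number of tokens, as in common/common.cpp
        std::vector<llama_token> tokens(text.length() + add_bos);
        int n = llama_tokenize(model, text.data(), text.length(),
                               tokens.data(), tokens.size(), add_bos, /*special =*/ true);
        if (n < 0) {
            // a negative return value means the buffer was too small; -n is the required size
            tokens.resize(-n);
            n = llama_tokenize(model, text.data(), text.length(),
                               tokens.data(), tokens.size(), add_bos, /*special =*/ true);
        }
        tokens.resize(n);
        return tokens;
    }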