convert.py: 114 changes (85 additions, 29 deletions)
@@ -142,6 +142,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
 @dataclass
 class Params:
     n_vocab: int
+    n_vocab_base: int
     n_embd: int
     n_mult: int
     n_head: int
@@ -169,6 +170,7 @@ def guessed(model: 'LazyModel') -> 'Params':

         return Params(
             n_vocab = n_vocab,
+            n_vocab_base=n_vocab,
             n_embd = n_embd,
             n_mult = 256,
             n_head = n_head,
@@ -191,6 +193,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':

         return Params(
             n_vocab = n_vocab,
+            n_vocab_base=n_vocab,
             n_embd = n_embd,
             n_mult = n_mult,
             n_head = n_head,
@@ -215,6 +218,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':

         return Params(
             n_vocab = n_vocab,
+            n_vocab_base=n_vocab,
             n_embd = n_embd,
             n_mult = n_mult,
             n_head = n_head,
@@ -239,7 +243,7 @@ def load(model_plus: 'ModelPlus') -> 'Params':


 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fname_special_tokens: Optional[Path], fname_tokenizer_config: Optional[Path], vocabtype: Optional[str]) -> None:
         self.vocabtype = vocabtype
         if self.vocabtype == "bpe":
           self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
@@ -264,35 +268,72 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vo
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
+        self.special_tokens_map: Dict[int, str] = {}
+
+        TOKEN_NAME_TO_ID: Dict[str, int] = {
+            "unk_token": self.sentencepiece_tokenizer.unk_id(),
+            "bos_token": self.sentencepiece_tokenizer.bos_id(),
+            "eos_token": self.sentencepiece_tokenizer.eos_id(),
+            "pad_token": self.sentencepiece_tokenizer.pad_id()
+        }
+
+        tokenizer_config: Dict[str, Any]
+        if fname_tokenizer_config is not None:
+            tokenizer_config = json.load(open(fname_tokenizer_config))
+        else:
+            tokenizer_config = {}
+        for key, value in tokenizer_config.items():
+            if not isinstance(value, dict) and not isinstance(value, str):
+                continue
+            token_id = TOKEN_NAME_TO_ID.get(key, -1)
+            if token_id == -1:
+                continue
+            self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value
+
+        special_tokens: Dict[str, Any]
+        if fname_special_tokens is not None:
+            special_tokens = json.load(open(fname_special_tokens))
+        else:
+            special_tokens = {}
+        for key, value in special_tokens.items():
+            if not isinstance(value, dict) and not isinstance(value, str):
+                continue
+            token_id = TOKEN_NAME_TO_ID.get(key, -1)
+            if token_id == -1 or token_id in self.special_tokens_map:
+                continue
+            self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value

     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
         if self.vocabtype == "bpe":
-          from transformers.models.gpt2 import tokenization_gpt2
-          byte_encoder = tokenization_gpt2.bytes_to_unicode()
-          byte_decoder = {v: k for k, v in byte_encoder.items()}
-          for i, item in enumerate(tokenizer):
-            text: bytes
-            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-            score: float = -i
-            yield text, score
+            from transformers.models.gpt2 import tokenization_gpt2
+            byte_encoder = tokenization_gpt2.bytes_to_unicode()
+            byte_decoder = {v: k for k, v in byte_encoder.items()}
+            for i, item in enumerate(tokenizer):
+                text: bytes
+                text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+                score: float = -i
+                yield text, score
         else:
-            for i in range(tokenizer.vocab_size()):
-                text: bytes
-                if tokenizer.is_unknown(i):
-                    text = " \u2047 ".encode("utf-8")
-                elif tokenizer.is_control(i):
-                    text = b""
-                elif tokenizer.is_byte(i):
-                    piece = tokenizer.id_to_piece(i)
-                    if len(piece) != 6:
-                        raise Exception(f"Invalid token: {piece}")
-                    byte_value = int(piece[3:-1], 16)
-                    text = struct.pack("B", byte_value)
-                else:
-                    text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-                score: float = tokenizer.get_score(i)
-                yield text, score
+            special_tokens = [tokenizer.bos_id(), tokenizer.eos_id(), tokenizer.pad_id()]
+            for i in range(tokenizer.vocab_size()):
+                text: bytes
+                if tokenizer.is_unknown(i):
+                    text = self.special_tokens_map.get(i, " \u2047 ").encode("utf-8")
+                elif i in special_tokens:
+                    text = self.special_tokens_map.get(i, "").encode("utf-8")
+                elif tokenizer.is_control(i):
+                    text = b""
+                elif tokenizer.is_byte(i):
+                    piece = tokenizer.id_to_piece(i)
+                    if len(piece) != 6:
+                        raise Exception(f"Invalid token: {piece}")
+                    byte_value = int(piece[3:-1], 16)
+                    text = struct.pack("B", byte_value)
+                else:
+                    text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+                score: float = tokenizer.get_score(i)
+                yield text, score

     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
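The two JSON sources are merged with a fixed precedence: `tokenizer_config.json` is scanned first, and `special_tokens_map.json` may only fill token ids that are still unclaimed. A minimal sketch of that merge, with hypothetical file contents (the ids assume the usual sentencepiece layout: unk=0, bos=1, eos=2, pad=-1):

```python
# Minimal sketch of the merge above, with hypothetical file contents.
token_name_to_id = {"unk_token": 0, "bos_token": 1, "eos_token": 2, "pad_token": -1}

tokenizer_config = {"bos_token": {"content": "<s>"}}            # dict form
special_tokens   = {"eos_token": "</s>", "cls_token": "<cls>"}  # plain-string form

special_tokens_map: dict = {}
for source in (tokenizer_config, special_tokens):               # config is scanned first
    for key, value in source.items():
        token_id = token_name_to_id.get(key, -1)
        if token_id == -1 or token_id in special_tokens_map:
            continue  # unknown name (or pad_id() == -1), or already claimed
        special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value

assert special_tokens_map == {1: "<s>", 2: "</s>"}  # "cls_token" has no known id
```

Note that an unset pad token drops out automatically: sentencepiece reports `pad_id() == -1`, which the `token_id == -1` guard filters.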
@@ -303,18 +344,29 @@ def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.sentencepiece_tokens()
         yield from self.added_tokens()

+    def all_special_tokens(self) -> Iterable[int]:
+        for token_id in self.special_tokens_map.keys():
+            yield token_id
+        for i in range(len(self.added_tokens_list)):
+            yield self.vocab_size_base + i
+
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


 class GGMLVocab:
     def __init__(self, tokens: List[Tuple[bytes, float]]):
         self.tokens = tokens
+        self.special_tokens = []
         self.vocab_size = len(tokens)
+        self.vocab_size_base = 0

     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         return self.tokens

+    def all_special_tokens(self) -> Iterable[int]:
+        return self.special_tokens
+
     def __repr__(self) -> str:
         return f"<GGMLVocab with {self.vocab_size} tokens>"

@@ -1072,10 +1124,10 @@ def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
             params.n_mult,
             params.n_head,
             params.n_layer,
-            params.n_embd // params.n_head, # rot (obsolete)
+            params.n_vocab_base | 0xF0000000, # reuse obsolete rot value to store vocab_base
             file_type.value,
         ]
-        self.fout.write(struct.pack("i" * len(values), *values))
+        self.fout.write(struct.pack("I" * len(values), *values))

    def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
        sname = name.encode('utf-8')
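The GGML header has no spare field, so `n_vocab_base` reuses the slot that used to hold the obsolete rot value, tagged with `0xF0000000` so loaders can tell new files from old ones. This is also why `struct.pack` switches from signed `"i"` to unsigned `"I"`: the tagged value no longer fits in a signed int32. A round-trip sketch with hypothetical sizes (the decode half mirrors `read_hparams` in the llama.cpp hunk further down):

```python
# Round-trip sketch of the header trick (hypothetical sizes). Old files keep a
# raw rot value in this slot, which never has the top bits set, so the decoder
# can distinguish the two formats.
def pack_vocab_base(n_vocab_base: int) -> int:
    return n_vocab_base | 0xF0000000

def unpack_vocab_base(raw: int, n_vocab: int) -> int:
    # untagged -> legacy file: assume every token is a base token
    return n_vocab if (raw & 0xF0000000) == 0 else raw & ~0xF0000000

raw = pack_vocab_base(32000)
assert raw == 0xF0007D00                                # exceeds INT32_MAX
assert unpack_vocab_base(raw, n_vocab=32004) == 32000
assert unpack_vocab_base(128, n_vocab=32000) == 32000   # legacy file: raw rot value
```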
@@ -1093,7 +1145,8 @@ def write_vocab(self, vocab: Vocab) -> None:
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
+        params = Params(n_vocab=vocab.vocab_size, n_vocab_base=vocab.vocab_size_base, n_embd=0, n_mult=0,
+                        n_head=1, n_layer=0)
         of = OutputFile(fname_out)
         of.write_file_header(params, file_type=GGMLFileType.AllF32)
         of.write_vocab(vocab)
@@ -1249,8 +1302,10 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
             f"Could not find tokenizer.model in {path} or its parent; "
             "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
+    special_tokens_path = path.parent / "special_tokens_map.json"
+    tokenizer_config_path = path.parent / "tokenizer_config.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None, special_tokens_path if special_tokens_path.exists() else None, tokenizer_config_path if tokenizer_config_path.exists() else None,
                               vocabtype)


@@ -1313,6 +1368,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
     params = Params.load(model_plus)
+    params.n_vocab_base = vocab.vocab_size_base
     model = model_plus.model
     model = do_necessary_conversions(model, params)
     output_type = pick_output_type(model, args.outtype)
llama.cpp: 85 changes (74 additions, 11 deletions)
@@ -181,13 +181,13 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
+    uint32_t n_vocab_base = 32000;
     uint32_t n_ctx = 512; // this is provided as user input?
     uint32_t n_embd = 4096;
     uint32_t n_mult = 256;
     uint32_t n_head = 32;
     uint32_t n_head_kv = 32;
     uint32_t n_layer = 32;
-    uint32_t n_rot = 64;

     // LLaMAv2
     // TODO: load from model data hparams
@@ -277,6 +277,12 @@ struct llama_vocab {

     std::unordered_map<token, id> token_to_id;
     std::vector<token_score> id_to_token;
+
+    std::unordered_map<token, id> special_token_to_id;
+
+    void add_special_token(const token & word, id token_id) {
+        special_token_to_id[word] = token_id;
+    }
 };

 struct llama_model {
@@ -509,6 +515,7 @@ struct llama_file_loader {
         read_hparams();
         read_vocab();
         read_tensor_metadata(tensors_map);
+        set_vocab_sp();
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -543,7 +550,8 @@ struct llama_file_loader {
         hparams.n_mult = file.read_u32();
         hparams.n_head = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot = file.read_u32();
+        hparams.n_vocab_base = file.read_u32();
+        hparams.n_vocab_base = (hparams.n_vocab_base & 0xF0000000) == 0 ? hparams.n_vocab : (hparams.n_vocab_base & ~0xF0000000); // this bitwise operation is necessary for compatibility with older models
         hparams.ftype = (enum llama_ftype) file.read_u32();

         // LLaMAv2
@@ -612,6 +620,17 @@ struct llama_file_loader {
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
+    void set_vocab_sp() {
+        uint32_t vocab_sp = 3 + hparams.n_vocab - hparams.n_vocab_base;
+        vocab.special_token_to_id.reserve(vocab_sp);
+        for (uint32_t i = 0; i < vocab_sp; i++) {
+            llama_vocab::id token_id = i > 2 ? hparams.n_vocab_base + i - 3 : i; // added tokens start right after n_vocab_base
+            const auto & word = vocab.id_to_token[token_id].tok;
+            if (!word.empty()) {
+                vocab.add_special_token(word, token_id);
+            }
+        }
+    }
 };
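`set_vocab_sp` registers two groups of special-token candidates: the three fixed sentencepiece ids (0 through 2) and every added token appended after the base vocabulary. (The mapping above is written as `n_vocab_base + i - 3`; the original diff read `n_vocab_base + i`, which indexes past `id_to_token` whenever added tokens exist.) A worked example with hypothetical sizes:

```python
# Worked example (hypothetical sizes) of the ids set_vocab_sp visits.
# n_vocab = 32004, n_vocab_base = 32000 -> vocab_sp = 3 + 32004 - 32000 = 7
#   i        : 0  1  2      3      4      5      6
#   token_id : 0  1  2  32000  32001  32002  32003
# i.e. the three fixed sentencepiece ids plus the four added tokens; ids whose
# text is empty (for example an unset pad token) are skipped by the emptiness check.
n_vocab, n_vocab_base = 32004, 32000
ids = [i if i <= 2 else n_vocab_base + i - 3 for i in range(3 + n_vocab - n_vocab_base)]
assert ids == [0, 1, 2, 32000, 32001, 32002, 32003]
```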

 struct llama_file_saver {
@@ -635,7 +654,7 @@ struct llama_file_saver {
         file.write_u32(hparams.n_mult);
         file.write_u32(hparams.n_head);
         file.write_u32(hparams.n_layer);
-        file.write_u32(hparams.n_rot);
+        file.write_u32(hparams.n_vocab_base | 0xF0000000); // this bitwise operation is necessary for compatibility with older models
         file.write_u32(new_ftype);
     }
     void write_vocab() {
@@ -1100,7 +1119,7 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
         fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
         fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_embd/hparams.n_head); // a.k.a. n_embd_head, n_head_dim
         fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
         fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
@@ -1418,7 +1437,7 @@ static struct ggml_cgraph * llama_build_graph(
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa = hparams.n_embd_gqa();

-    LLAMA_ASSERT(n_embd_head == hparams.n_rot);
+    LLAMA_ASSERT(n_embd_head == hparams.n_embd/hparams.n_head);

     const float freq_base = hparams.rope_freq_base;
     const float freq_scale = hparams.rope_freq_scale;
@@ -1960,18 +1979,20 @@ struct llama_sp_bigram {
 struct llama_tokenizer {
     llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}

-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const char * text, size_t len, std::vector<llama_vocab::id> & output) {
+        symbols_.clear();
+
         // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
-        while (offs < text.size()) {
+        while (offs < len) {
             llama_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
-            sym.text = text.c_str() + offs;
+            size_t char_len = std::min(len - offs, utf8_len(text[offs]));
+            sym.text = text + offs;
             sym.n = char_len;
             offs += char_len;
             sym.prev = index - 1;
-            sym.next = offs == text.size() ? -1 : index + 1;
+            sym.next = offs == len ? -1 : index + 1;
             index++;
             symbols_.emplace_back(sym);
         }
@@ -2074,7 +2095,45 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
         output.push_back(llama_token_bos());
     }

-    tokenizer.tokenize(text, output);
+    if (vocab.special_token_to_id.empty()) {
+        tokenizer.tokenize(text.c_str(), text.size(), output);
+        return output;
+    }
+
+    size_t delim_start = 0;
+    size_t last_delim_end = 0;
+
+    while (delim_start < text.size()) {
+        size_t delim_end = 0;
+        llama_vocab::id token_id = -1;
+
+        for (const auto & mit : vocab.special_token_to_id) {
+            const std::string & delimiter = mit.first;
+            size_t end = delim_start + delimiter.size();
+            if (end <= text.size() && text.compare(delim_start, delimiter.size(), delimiter) == 0) {
+                if (token_id == -1 || end > delim_end) {
+                    token_id = mit.second;
+                    delim_end = end;
+                }
+            }
+        }
+
+        if (token_id != -1) {
+            if (last_delim_end < delim_start) {
+                tokenizer.tokenize(text.c_str() + last_delim_end, delim_start - last_delim_end, output);
+            }
+            output.push_back(token_id);
+            delim_start = delim_end;
+            last_delim_end = delim_end;
+        } else {
+            delim_start++;
+        }
+    }
+
+    if (last_delim_end < text.size()) {
+        tokenizer.tokenize(text.c_str() + last_delim_end, text.size() - last_delim_end, output);
+    }
+
     return output;
 }

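The new tokenizer path is a greedy scan: at each position it looks for the longest registered special-token string, tokenizes the plain text accumulated since the last match, and emits the special token's id directly. A Python sketch of the same splitting (hypothetical vocabulary; the real code feeds the plain spans through the sentencepiece tokenizer rather than returning strings):

```python
# Python sketch of the splitting loop above (hypothetical vocabulary). Plain
# spans are returned as strings here so the scan itself is easy to see.
def split_on_special(text: str, special: dict) -> list:
    out, pos, last_end = [], 0, 0
    while pos < len(text):
        # longest registered special token starting at pos, if any
        match = max((t for t in special if text.startswith(t, pos)), key=len, default=None)
        if match is None:
            pos += 1
            continue
        if last_end < pos:
            out.append(text[last_end:pos])   # plain span -> normal tokenizer
        out.append(special[match])           # special token -> its id, directly
        pos = last_end = pos + len(match)
    if last_end < len(text):
        out.append(text[last_end:])
    return out

assert split_on_special("a<sys>b", {"<sys>": 32000, "<s>": 1}) == ["a", 32000, "b"]
```

Preferring the longest match (`end > delim_end` in the C++ loop) keeps overlapping registrations such as `<s>` and `<sys>` unambiguous.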
@@ -4212,6 +4271,10 @@ llama_token llama_token_nl() {
     return 13;
 }

+void llama_add_special_token(struct llama_model * model, const char * token, llama_token token_id) {
+    model->vocab.add_special_token(token, token_id);
+}
+
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
     struct llama_timings result = {
         /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
llama.h: 5 changes (5 additions, 0 deletions)
@@ -373,6 +373,11 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(); // end-of-sentence
     LLAMA_API llama_token llama_token_nl();  // next-line

+    LLAMA_API void llama_add_special_token(
+        struct llama_model * model,
+        const char * token,
+        llama_token token_id);
+
     // Grammar
     //
     LLAMA_API struct llama_grammar * llama_grammar_init(
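The exported symbol also makes the feature reachable from bindings. A hypothetical ctypes sketch against the declaration above (the shared-library name and the origin of the `llama_model *` handle are assumptions, not part of this diff; only the signature comes from the header):

```python
# Hypothetical ctypes sketch of calling the new entry point from Python.
import ctypes

lib = ctypes.CDLL("./libllama.so")  # assumes a library built from this branch
lib.llama_add_special_token.argtypes = (ctypes.c_void_p, ctypes.c_char_p, ctypes.c_int)
lib.llama_add_special_token.restype = None

def add_special_token(model: ctypes.c_void_p, token: str, token_id: int) -> None:
    """Register `token` so llama_tokenize maps it straight to `token_id`."""
    lib.llama_add_special_token(model, token.encode("utf-8"), token_id)
```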