From bb95f56538559ebde1484e32c80a2aaf00b8940c Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 20:49:42 -0600 Subject: [PATCH 01/44] first go embedded version --- .gitignore | 3 + Makefile | 13 ++-- go.mod | 3 + main.cpp | 177 ++++++++++++++++++++++++++++++++------------------- main.go | 61 ++++++++++++++++++ main.h | 19 ++++++ quantize.cpp | 2 + 7 files changed, 208 insertions(+), 70 deletions(-) create mode 100644 go.mod create mode 100644 main.go create mode 100644 main.h diff --git a/.gitignore b/.gitignore index 5eb1ff1b873f1..4414b8428d40c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ *.o +*.so *.a .cache/ +.idea .vs/ .vscode/ .DS_Store @@ -18,6 +20,7 @@ models/* /main /quantize +/llama-go arm_neon.h compile_commands.json diff --git a/Makefile b/Makefile index 8388c290d75ce..95a2e3b97cf4e 100644 --- a/Makefile +++ b/Makefile @@ -172,7 +172,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize +default: main.o quantize libllama.a # # Build library @@ -185,11 +185,14 @@ utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o clean: - rm -f *.o main quantize + rm -f *.o *.a quantize -main: main.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS) - ./main -h +main.o: ggml.o utils.o + $(CXX) $(CXXFLAGS) -c main.cpp -o main.o $(LDFLAGS) + #./main -h + +libllama.a: main.o ggml.o utils.o + ar src libllama.a main.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) diff --git a/go.mod b/go.mod new file mode 100644 index 0000000000000..b5878754e2ace --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/cornelk/llama-go + +go 1.19 diff --git a/main.cpp b/main.cpp index 2f47480698f1e..a7d312b1bc4d4 100644 --- a/main.cpp +++ b/main.cpp @@ -1,5 +1,5 @@ #include "ggml.h" - +#include "main.h" #include "utils.h" #include @@ -69,9 +69,19 @@ struct llama_model { std::map tensors; }; +struct llama_state { + gpt_vocab vocab; + llama_model model; + struct { + int64_t t_load_us = -1; + int64_t t_sample_us = -1; + int64_t t_predict_us = -1; + } timing; +}; + // load the model's weights from a file bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { - printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); +// printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { @@ -110,16 +120,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; n_parts = LLAMA_N_PARTS.at(hparams.n_embd); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_mult = %d\n", __func__, hparams.n_mult); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: n_rot = %d\n", __func__, hparams.n_rot); - printf("%s: f16 = %d\n", __func__, hparams.f16); - printf("%s: n_ff = %d\n", __func__, n_ff); - printf("%s: n_parts = %d\n", __func__, n_parts); +// printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); +// printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); +// printf("%s: n_embd = %d\n", __func__, hparams.n_embd); +// printf("%s: n_mult = %d\n", __func__, 
hparams.n_mult); +// printf("%s: n_head = %d\n", __func__, hparams.n_head); +// printf("%s: n_layer = %d\n", __func__, hparams.n_layer); +// printf("%s: n_rot = %d\n", __func__, hparams.n_rot); +// printf("%s: f16 = %d\n", __func__, hparams.f16); +// printf("%s: n_ff = %d\n", __func__, n_ff); +// printf("%s: n_parts = %d\n", __func__, n_parts); } // load vocab @@ -203,7 +213,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab ctx_size += (5 + 10*n_layer)*256; // object overhead - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); +// printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } // create the ggml context @@ -290,7 +300,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); +// printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); } const size_t file_offset = fin.tellg(); @@ -308,7 +318,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab fname_part += "." + std::to_string(i); } - printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str()); +// printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str()); fin = std::ifstream(fname_part, std::ios::binary); fin.seekg(file_offset); @@ -318,7 +328,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab int n_tensors = 0; size_t total_size = 0; - printf("%s: ", __func__); +// printf("%s: ", __func__); while (true) { int32_t n_dims; @@ -482,15 +492,15 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab } //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); - if (++n_tensors % 8 == 0) { - printf("."); - fflush(stdout); - } +// if (++n_tensors % 8 == 0) { +// printf("."); +// fflush(stdout); +// } } - printf(" done\n"); +// printf(" done\n"); - printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); +// printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); } fin.close(); @@ -732,6 +742,8 @@ bool llama_eval( return true; } +/* + int main(int argc, char ** argv) { const int64_t t_main_start_us = ggml_time_us(); @@ -761,57 +773,89 @@ int main(int argc, char ** argv) { gpt_vocab vocab; llama_model model; + */ + +void* llama_allocate_state() { + return new llama_state; +} + +void* llama_allocate_params(const char *input, int threads, int tokens) { + gpt_params* params = new gpt_params; + params->prompt = input; + params->n_threads = threads; + params->n_predict = tokens; + return params; +} + +void llama_free_params(void* params_ptr) { + gpt_params* params = (gpt_params*) params_ptr; + delete params; +} + +bool llama_bootstrap(const char *model_path, void* state_pr) // load the model { + llama_state* state = (llama_state*) state_pr; const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ?? 
- fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; + if (!llama_model_load(model_path, state->model, state->vocab, 512)) { // TODO: set context from user input ?? + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); + return false; } - t_load_us = ggml_time_us() - t_start_us; + state->timing.t_load_us = ggml_time_us() - t_start_us; + return true; } +int llama_predict(void* params_ptr, void* state_pr) { + gpt_params* params = (gpt_params*) params_ptr; + llama_state* state = (llama_state*) state_pr; + + const int64_t t_main_start_us = ggml_time_us(); int n_past = 0; - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; + state->timing.t_sample_us = 0; + state->timing.t_predict_us = 0; std::vector logits; // tokenize the prompt - std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); + std::vector embd_inp = ::llama_tokenize(state->vocab, params->prompt, true); - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + params->n_predict = std::min(params->n_predict, state->model.hparams.n_ctx - (int) embd_inp.size()); printf("\n"); - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); +// printf("%s: prompt: '%s'\n", __func__, params->prompt.c_str()); +// printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); +// for (int i = 0; i < (int) embd_inp.size(); i++) { +// printf("%6d -> '%s'\n", embd_inp[i], state->vocab.id_to_token.at(embd_inp[i]).c_str()); +// } +// printf("\n"); +// printf("sampling parameters: temp = %f, top_k = %d, top_p = %f\n", params->temp, params->top_k, params->top_p); +// printf("\n\n"); + + std::vector embd; + + if (params->seed < 0) { + params->seed = time(NULL); } - printf("\n"); - printf("sampling parameters: temp = %f, top_k = %d, top_p = %f\n", params.temp, params.top_k, params.top_p); - printf("\n\n"); - - std::vector embd; + std::mt19937 rng(params->seed); // determine the required inference memory per token: size_t mem_per_token = 0; - llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + llama_eval(state->model, params->n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + for (int i = embd.size(); i < embd_inp.size() + params->n_predict; i++) { // predict if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); - if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + if (!llama_eval(state->model, params->n_threads, n_past, embd, logits, mem_per_token)) { printf("Failed to predict\n"); return 1; } - t_predict_us += ggml_time_us() - t_start_us; + state->timing.t_predict_us += ggml_time_us() - t_start_us; } n_past += embd.size(); @@ -819,19 +863,19 @@ int main(int argc, char ** argv) { if (i >= embd_inp.size()) { // sample next token - const float top_p = params.top_p; - const float temp = params.temp; + const float top_p = params->top_p; + const float temp = params->temp; - const int n_vocab = model.hparams.n_vocab; + const int n_vocab = state->model.hparams.n_vocab; gpt_vocab::id id = 0; { const int64_t t_start_sample_us = ggml_time_us(); - id = llama_sample_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_p, temp, rng); + id = 
llama_sample_top_p(state->vocab, logits.data() + (logits.size() - n_vocab), top_p, temp, rng); - t_sample_us += ggml_time_us() - t_start_sample_us; + state->timing.t_sample_us += ggml_time_us() - t_start_sample_us; } // add it to the context @@ -840,7 +884,7 @@ int main(int argc, char ** argv) { // if here, it means we are still processing the input prompt for (int k = i; k < embd_inp.size(); k++) { embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { + if (embd.size() > params->n_batch) { break; } } @@ -848,31 +892,34 @@ int main(int argc, char ** argv) { } // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); + for (auto id: embd) { + printf("%s", state->vocab.id_to_token[id].c_str()); } fflush(stdout); // end of text token if (embd.back() == 2) { - printf(" [end of text]\n"); - break; +// printf(" [end of text]\n"); + return 2; } } // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } +// { +// const int64_t t_main_end_us = ggml_time_us(); +// +// printf("\n\n"); +// printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); +// printf("%s: load time = %8.2f ms\n", __func__, state->timing.t_load_us / 1000.0f); +// printf("%s: sample time = %8.2f ms\n", __func__, state->timing.t_sample_us / 1000.0f); +// printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, state->timing.t_predict_us / 1000.0f, state->timing.t_predict_us / 1000.0f / n_past); +// printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); +// } + return 0; +} - ggml_free(model.ctx); +void llama_finalize(llama_state &state) { + ggml_free(state.model.ctx); - return 0; +// return 0; } diff --git a/main.go b/main.go new file mode 100644 index 0000000000000..0dbaf2e8b2089 --- /dev/null +++ b/main.go @@ -0,0 +1,61 @@ +package main + +// #cgo CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mavx -mavx2 -mfma -mf16c -msse3 +// #cgo CXXFLAGS: -O3 -DNDEBUG -std=c++11 -fPIC -pthread -I. 
+// #include "main.h" +import "C" +import ( + "bufio" + "flag" + "fmt" + "os" +) + +func main() { + var model string + var threads, tokens int + + flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) + flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") + flags.IntVar(&threads, "t", 4, "number of threads to use during computation") + flags.IntVar(&tokens, "n", 128, "number of tokens to predict") + + err := flags.Parse(os.Args[1:]) + if err != nil { + fmt.Printf("Parsing program arguments failed: %s", err) + os.Exit(1) + } + + state := C.llama_allocate_state() + + fmt.Printf("Loading model %s...", model) + modelPath := C.CString(model) + success := C.llama_bootstrap(modelPath, state) + if !success { + fmt.Println("Loading the model failed") + os.Exit(1) + } + fmt.Printf("Model loaded successfully.\n\n") + + reader := bufio.NewReader(os.Stdin) + for { + fmt.Print("Enter prompt: ") + text, err := reader.ReadString('\n') + if err != nil { + fmt.Printf("Reading the prompt failed: %s", err) + os.Exit(1) + } + + input := C.CString(text) + params := C.llama_allocate_params(input, C.int(threads), C.int(tokens)) + result := C.llama_predict(params, state) + if result == 2 { + fmt.Println("Predicting failed") + os.Exit(1) + } + + C.llama_free_params(params) + + fmt.Printf("\n\n") + } +} diff --git a/main.h b/main.h new file mode 100644 index 0000000000000..a30304372125f --- /dev/null +++ b/main.h @@ -0,0 +1,19 @@ +// num.h +#ifdef __cplusplus +extern "C" { +#endif + +#include + +void *llama_allocate_state(); + +bool llama_bootstrap(const char *model_path, void *state_pr); + +void* llama_allocate_params(const char *input, int threads, int tokens); +void llama_free_params(void* params_ptr); + +int llama_predict(void* params_ptr, void* state_pr); + +#ifdef __cplusplus +} +#endif diff --git a/quantize.cpp b/quantize.cpp index 0ae537339ecf3..a2b2f574369e4 100644 --- a/quantize.cpp +++ b/quantize.cpp @@ -288,6 +288,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna // usage: // ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // +/* int main(int argc, char ** argv) { if (argc != 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); @@ -335,3 +336,4 @@ int main(int argc, char ** argv) { return 0; } +*/ From d0cc36c131f61e8d539156c2130959df631f553f Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 21:29:55 -0600 Subject: [PATCH 02/44] improve prompt --- main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index 0dbaf2e8b2089..718610f62b224 100644 --- a/main.go +++ b/main.go @@ -28,7 +28,7 @@ func main() { state := C.llama_allocate_state() - fmt.Printf("Loading model %s...", model) + fmt.Printf("Loading model %s...\n", model) modelPath := C.CString(model) success := C.llama_bootstrap(modelPath, state) if !success { @@ -39,7 +39,7 @@ func main() { reader := bufio.NewReader(os.Stdin) for { - fmt.Print("Enter prompt: ") + fmt.Print(">>> ") text, err := reader.ReadString('\n') if err != nil { fmt.Printf("Reading the prompt failed: %s", err) From 97a9a9aaa64c96e7819ea2bcf12dee456f4d96d6 Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 21:30:10 -0600 Subject: [PATCH 03/44] fix compilation of quantize --- Makefile | 3 +-- quantize.cpp | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 95a2e3b97cf4e..4dc4fa4d7e1d3 100644 --- a/Makefile +++ b/Makefile @@ 
-189,13 +189,12 @@ clean: main.o: ggml.o utils.o $(CXX) $(CXXFLAGS) -c main.cpp -o main.o $(LDFLAGS) - #./main -h libllama.a: main.o ggml.o utils.o ar src libllama.a main.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) + $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) # # Tests diff --git a/quantize.cpp b/quantize.cpp index a2b2f574369e4..f8b1c4440b3bd 100644 --- a/quantize.cpp +++ b/quantize.cpp @@ -288,7 +288,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna // usage: // ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // -/* +#ifdef QUANTIZE int main(int argc, char ** argv) { if (argc != 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); @@ -336,4 +336,4 @@ int main(int argc, char ** argv) { return 0; } -*/ +#endif From f090fdfb02f38ed7526a6d8d23f67fa53fb7458e Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 21:30:37 -0600 Subject: [PATCH 04/44] update readme --- README.md | 170 +++++++++++++----------------------------------------- 1 file changed, 39 insertions(+), 131 deletions(-) diff --git a/README.md b/README.md index 5194f6efc7b9d..d45b3e78f77c0 100644 --- a/README.md +++ b/README.md @@ -1,128 +1,39 @@ -# llama.cpp +# llama-go -Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in pure C/C++ +Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in Golang with embedded C/C++. -**Hot topics** +## Description -- Running on Windows: https://github.com/ggerganov/llama.cpp/issues/22 +This project embeds the work of [llama.cpp](https://github.com/ggerganov/llama.cpp) in a Golang binary. +The main goal is to run the model using 4-bit quantization using CPU on Consumer-Grade hardware. -## Description +At startup, the model is loaded and a prompt is offered to enter a prompt, +after the results have been printed another prompt can be entered. +The program can be quit using ctrl+c. -The main goal is to run the model using 4-bit quantization on a MacBook. - -- Plain C/C++ implementation without dependencies -- Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework -- AVX2 support for x86 architectures -- Mixed F16 / F32 precision -- 4-bit quantization support -- Runs on the CPU - -This was hacked in an evening - I have no idea if it works correctly. -Please do not make conclusions about the models based on the results from this implementation. -For all I know, it can be completely wrong. This project is for educational purposes and is not going to be maintained properly. -New features will probably be added mostly through community contributions, if any. - ---- - -Here is a typical run using LLaMA-7B: - -```java -make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 -I llama.cpp build info: -I UNAME_S: Darwin -I UNAME_P: arm -I UNAME_M: arm64 -I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -DGGML_USE_ACCELERATE -I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -I LDFLAGS: -framework Accelerate -I CC: Apple clang version 14.0.0 (clang-1400.0.29.202) -I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202) - -make: Nothing to be done for `default'. 
-main: seed = 1678486056 -llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ... -llama_model_load: n_vocab = 32000 -llama_model_load: n_ctx = 512 -llama_model_load: n_embd = 4096 -llama_model_load: n_mult = 256 -llama_model_load: n_head = 32 -llama_model_load: n_layer = 32 -llama_model_load: n_rot = 128 -llama_model_load: f16 = 2 -llama_model_load: n_ff = 11008 -llama_model_load: ggml ctx size = 4529.34 MB -llama_model_load: memory_size = 512.00 MB, n_mem = 16384 -llama_model_load: .................................... done -llama_model_load: model size = 4017.27 MB / num tensors = 291 - -main: prompt: 'Building a website can be done in 10 simple steps:' -main: number of tokens in prompt = 15 - 1 -> '' - 8893 -> 'Build' - 292 -> 'ing' - 263 -> ' a' - 4700 -> ' website' - 508 -> ' can' - 367 -> ' be' - 2309 -> ' done' - 297 -> ' in' - 29871 -> ' ' - 29896 -> '1' - 29900 -> '0' - 2560 -> ' simple' - 6576 -> ' steps' - 29901 -> ':' - -sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000 - - -Building a website can be done in 10 simple steps: -1) Select a domain name and web hosting plan -2) Complete a sitemap -3) List your products -4) Write product descriptions -5) Create a user account -6) Build the template -7) Start building the website -8) Advertise the website -9) Provide email support -10) Submit the website to search engines -A website is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves. -The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user's browser. -The web pages are stored in a web server. The web server is also called a host. When the website is accessed, it is retrieved from the server and displayed on the user's computer. -A website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server. -A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user's screen. -A website can also be viewed on different devices such as desktops, tablets and smartphones. -Hence, to have a website displayed on a browser, the website must be hosted. -A domain name is an address of a website. It is the name of the website. -The website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server. -A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user’s screen. -A website can also be viewed on different devices such as desktops, tablets and smartphones. Hence, to have a website displayed on a browser, the website must be hosted. -A domain name is an address of a website. It is the name of the website. -A website is an address of a website. It is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves. -The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user’s browser. -A website is known as a website when it is hosted - -main: mem per token = 14434244 bytes -main: load time = 1332.48 ms -main: sample time = 1081.40 ms -main: predict time = 31378.77 ms / 61.41 ms per token -main: total time = 34036.74 ms -``` +This project was tested on Linux but should be able to get to work on macOS as well. 
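The flow described above maps onto the small C API that this patch series exports from main.cpp via main.h. Condensed to its essentials, and using the patch-01 signatures (later commits extend them), the Go side looks roughly like the sketch below; `modelPath`, `threads`, `tokens`, and `reader` stand in for the values set up by flag parsing in main.go, and error handling is trimmed:

```go
state := C.llama_allocate_state() // opaque llama_state owned by the C++ side
if !C.llama_bootstrap(C.CString(modelPath), state) {
	fmt.Println("Loading the model failed") // weights are read once, at startup
	os.Exit(1)
}

for { // interactive loop; Ctrl+C (or EOF on stdin) ends the process
	prompt, _ := reader.ReadString('\n')
	params := C.llama_allocate_params(C.CString(prompt), C.int(threads), C.int(tokens))
	C.llama_predict(params, state) // tokens are printed from the C++ side as they are sampled
	C.llama_free_params(params)
}
```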
-And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: +## Requirements -https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4 +The memory requirements for the models are approximately: -## Usage +``` +7B -> 4 GB +13B -> 8 GB +30B -> 16 GB +65B -> 32 GB +``` + +## Installation Here are the step for the LLaMA-7B model: ```bash # build this repo -git clone https://github.com/ggerganov/llama.cpp -cd llama.cpp +git clone https://github.com/cornelk/llama-go +cd llama-go make +CGO_CFLAGS_ALLOW='-mf.*' go build . # obtain the original LLaMA model weights and place them in ./models ls ./models @@ -136,9 +47,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1 # quantize the model to 4-bits ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2 - -# run the inference -./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128 ``` For the bigger models, there are a few extra quantization steps. For example, for LLaMA-13B, converting to FP16 format @@ -156,12 +64,6 @@ You need to quantize each of them separately like this: ./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2 ``` -Everything else is the same. Simply run: - -```bash -./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128 -``` - The number of files generated for each model is as follows: ``` @@ -173,16 +75,22 @@ The number of files generated for each model is as follows: When running the larger models, make sure you have enough disk space to store all the intermediate files. -## Limitations +## Usage + +```bash +./llama-go -m ./models/13B/ggml-model-q4_0.bin -t 4 -n 128 -- Not sure if my tokenizer is correct. There are a few places where we might have a mistake: - - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87 - - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69 - In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that -- I don't know yet how much the quantization affects the quality of the generated text -- Probably the token sampling can be improved -- The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder, - there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simlpy don't - know how to utilize it properly. But in any case, you can even disable it with `LLAMA_NO_ACCELERATE=1 make` and the - performance will be the same, since no BLAS calls are invoked by the current implementation +Loading model ./models/13B/ggml-model-q4_0.bin... +Model loaded successfully. +>>> Some good pun names for a pet groomer: + +Some good pun names for a pet groomer: +Rub-a-Dub, Scooby Doo +Hair Force One +Duck and Cover, Two Fleas, One Duck +... 
+ +>>> + +``` From f86c433d466ea276497e612f44482be55db741ba Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 22:30:41 -0600 Subject: [PATCH 05/44] update readme --- README.md | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d45b3e78f77c0..3bfab0f0de264 100644 --- a/README.md +++ b/README.md @@ -18,16 +18,14 @@ This project was tested on Linux but should be able to get to work on macOS as w The memory requirements for the models are approximately: ``` -7B -> 4 GB -13B -> 8 GB -30B -> 16 GB -65B -> 32 GB +7B -> 4 GB (1 file) +13B -> 8 GB (2 files) +30B -> 16 GB (4 files) +65B -> 32 GB (8 files) ``` ## Installation -Here are the step for the LLaMA-7B model: - ```bash # build this repo git clone https://github.com/cornelk/llama-go @@ -35,12 +33,18 @@ cd llama-go make CGO_CFLAGS_ALLOW='-mf.*' go build . -# obtain the original LLaMA model weights and place them in ./models -ls ./models -65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model - # install Python dependencies python3 -m pip install torch numpy sentencepiece +``` + +Obtain the original LLaMA model weights and place them in ./models - +for example by using the https://github.com/shawwn/llama-dl script to download them. + +Use the following steps to convert the LLaMA-7B model to a format that is compatible: + +```bash +ls ./models +65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model # convert the 7B model to ggml FP16 format python3 convert-pth-to-ggml.py models/7B/ 1 @@ -64,15 +68,6 @@ You need to quantize each of them separately like this: ./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2 ``` -The number of files generated for each model is as follows: - -``` -7B -> 1 file -13B -> 2 files -30B -> 4 files -65B -> 8 files -``` - When running the larger models, make sure you have enough disk space to store all the intermediate files. ## Usage From deb304d6dbb3775740e4447951293fafa28ff331 Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 09:00:18 -0600 Subject: [PATCH 06/44] add llama-go compilation to makefile --- Makefile | 6 ++++-- README.md | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 4dc4fa4d7e1d3..e8ea69e737f7c 100644 --- a/Makefile +++ b/Makefile @@ -172,7 +172,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main.o quantize libllama.a +default: main.o quantize libllama.a llama-go # # Build library @@ -185,7 +185,7 @@ utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o clean: - rm -f *.o *.a quantize + rm -f *.o *.a quantize llama-go main.o: ggml.o utils.o $(CXX) $(CXXFLAGS) -c main.cpp -o main.o $(LDFLAGS) @@ -196,6 +196,8 @@ libllama.a: main.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) +llama-go: + CGO_CFLAGS_ALLOW='-mf.*' go build . # # Tests # diff --git a/README.md b/README.md index 3bfab0f0de264..6b1614ae872fb 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,6 @@ The memory requirements for the models are approximately: git clone https://github.com/cornelk/llama-go cd llama-go make -CGO_CFLAGS_ALLOW='-mf.*' go build . 
# install Python dependencies python3 -m pip install torch numpy sentencepiece From ea0ff166a0930c8c84c10070cf9f1d12ff9dd82f Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 13:08:37 -0600 Subject: [PATCH 07/44] reduce code changes --- main.cpp | 106 ++++++++++++++++++++++++++----------------------------- 1 file changed, 50 insertions(+), 56 deletions(-) diff --git a/main.cpp b/main.cpp index d1ec26e7cbc30..ea17e5f1dd3a2 100644 --- a/main.cpp +++ b/main.cpp @@ -136,18 +136,18 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; n_parts = LLAMA_N_PARTS.at(hparams.n_embd); - -// fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); -// fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); -// fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); -// fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); -// fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); -// fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); -// fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); -// fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); -// fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); -// fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); - } +/* + fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); + fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); + fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); +*/ } // load vocab { @@ -515,11 +515,11 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab fflush(stderr); }*/ } +/* + fprintf(stderr, " done\n"); -// fprintf(stderr, " done\n"); - -// fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); - } + fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); +*/ } fin.close(); } @@ -867,27 +867,27 @@ bool llama_bootstrap(const char *model_path, void* state_pr) } */ int llama_predict(void* params_ptr, void* state_pr) { - gpt_params* params = (gpt_params*) params_ptr; - llama_state* state = (llama_state*) state_pr; - if (params->seed < 0) { - params->seed = time(NULL); + gpt_params params = *(gpt_params*) params_ptr; + llama_state state = *(llama_state*) state_pr; + if (params.seed < 0) { + params.seed = time(NULL); } - std::mt19937 rng(params->seed); + std::mt19937 rng(params.seed); int n_past = 0; - state->timing.t_sample_us = 0; - state->timing.t_predict_us = 0; + state.timing.t_sample_us = 0; + state.timing.t_predict_us = 0; std::vector logits; // tokenize the prompt - std::vector embd_inp = ::llama_tokenize(state->vocab, params->prompt, true); + std::vector embd_inp = ::llama_tokenize(state.vocab, params.prompt, true); - params->n_predict = std::min(params->n_predict, state->model.hparams.n_ctx - (int) embd_inp.size()); + params.n_predict = std::min(params.n_predict, state.model.hparams.n_ctx - (int) embd_inp.size()); // 
tokenize the reverse prompt - std::vector antiprompt_inp = ::llama_tokenize(state->vocab, params->antiprompt, false); + std::vector antiprompt_inp = ::llama_tokenize(state.vocab, params.antiprompt, false); fprintf(stderr, "\n"); /*fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); @@ -896,7 +896,7 @@ int llama_predict(void* params_ptr, void* state_pr) { fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); } fprintf(stderr, "\n"); - if (params->interactive) { + if (params.interactive) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = sigint_handler; @@ -923,16 +923,16 @@ int llama_predict(void* params_ptr, void* state_pr) { // determine the required inference memory per token: size_t mem_per_token = 0; - llama_eval(state->model, params->n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); + llama_eval(state.model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - llama_eval(state->model, params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + llama_eval(state.model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - int last_n_size = params->repeat_last_n; + int last_n_size = params.repeat_last_n; std::vector last_n_tokens(last_n_size); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - if (params->interactive) { + if (params.interactive) { fprintf(stderr, "== Running in interactive mode. ==\n" #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) " - Press Ctrl+C to interject at any time.\n" @@ -941,17 +941,17 @@ int llama_predict(void* params_ptr, void* state_pr) { " - If you want to submit another line, end your input in '\\'.\n"); } - int remaining_tokens = params->n_predict; + int remaining_tokens = params.n_predict; int input_consumed = 0; bool input_noecho = false; // prompt user immediately after the starting prompt has been loaded - if (params->interactive_start) { + if (params.interactive_start) { is_interacting = true; } // set the color for the prompt which will be output initially - if (params->use_color) { + if (params.use_color) { printf(ANSI_COLOR_YELLOW); } @@ -960,12 +960,12 @@ int llama_predict(void* params_ptr, void* state_pr) { if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); - if (!llama_eval(state->model, params->n_threads, n_past, embd, logits, mem_per_token)) { + if (!llama_eval(state.model, params.n_threads, n_past, embd, logits, mem_per_token)) { fprintf(stderr, "Failed to predict\n"); return 1; } - state->timing.t_predict_us += ggml_time_us() - t_start_us; + state.timing.t_predict_us += ggml_time_us() - t_start_us; } n_past += embd.size(); @@ -973,24 +973,24 @@ int llama_predict(void* params_ptr, void* state_pr) { if (embd_inp.size() <= input_consumed) { // out of user input, sample next token - const float top_k = params->top_k; - const float top_p = params->top_p; - const float temp = params->temp; - const float repeat_penalty = params->repeat_penalty; + const float top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + const float repeat_penalty = params.repeat_penalty; - const int n_vocab = state->model.hparams.n_vocab; + const int n_vocab = state.model.hparams.n_vocab; gpt_vocab::id id = 0; { const int64_t t_start_sample_us = ggml_time_us(); - id = llama_sample_top_p_top_k(state->vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); + id = 
llama_sample_top_p_top_k(state.vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); - state->timing.t_sample_us += ggml_time_us() - t_start_sample_us; + state.timing.t_sample_us += ggml_time_us() - t_start_sample_us; } // add it to the context @@ -1008,13 +1008,13 @@ int llama_predict(void* params_ptr, void* state_pr) { last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(embd_inp[input_consumed]); ++input_consumed; - if (embd.size() > params->n_batch) { + if (embd.size() > params.n_batch) { break; } } // reset color to default if we there is no pending user input - if (!input_noecho && params->use_color && embd_inp.size() == input_consumed) { + if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) { printf(ANSI_COLOR_RESET); } } @@ -1022,14 +1022,14 @@ int llama_predict(void* params_ptr, void* state_pr) { // display text if (!input_noecho) { for (auto id : embd) { - printf("%s", state->vocab.id_to_token[id].c_str()); + printf("%s", state.vocab.id_to_token[id].c_str()); } fflush(stdout); } // in interactive mode, and not currently processing queued inputs; // check if we should prompt the user for more - if (params->interactive && embd_inp.size() <= input_consumed) { + if (params.interactive && embd_inp.size() <= input_consumed) { // check for reverse prompt if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) { // reverse prompt found @@ -1042,13 +1042,13 @@ int llama_predict(void* params_ptr, void* state_pr) { fflush(stdout); char buf[256] = {0}; int n_read; - if(params->use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); + if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { // presumable empty line, consume the newline scanf("%*c"); n_read=0; } - if(params->use_color) printf(ANSI_COLOR_RESET); + if(params.use_color) printf(ANSI_COLOR_RESET); if (n_read > 0 && buf[n_read-1]=='\\') { another_line = true; @@ -1060,7 +1060,7 @@ int llama_predict(void* params_ptr, void* state_pr) { buf[n_read+1] = 0; } - std::vector line_inp = ::llama_tokenize(state->vocab, buf, false); + std::vector line_inp = ::llama_tokenize(state.vocab, buf, false); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); remaining_tokens -= line_inp.size(); @@ -1096,9 +1096,3 @@ int llama_predict(void* params_ptr, void* state_pr) { */ return 0; } - -void llama_finalize(llama_state &state) { - ggml_free(state.model.ctx); - -// return 0; -} From f8f93b8d77d07c0ed15356838787f13eee9f532b Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 13:16:46 -0600 Subject: [PATCH 08/44] reduce code changes --- main.cpp | 27 ++++++++++++++++----------- main.h | 1 - 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/main.cpp b/main.cpp index ea17e5f1dd3a2..56c319d72b9ec 100644 --- a/main.cpp +++ b/main.cpp @@ -509,7 +509,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab total_size += ggml_nbytes(tensor)/n_parts; } /* - fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); if (++n_tensors % 8 == 0) { fprintf(stderr, "."); fflush(stderr); @@ -762,6 +762,7 @@ bool llama_eval( } static bool is_interacting = false; + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) void sigint_handler(int signo) { if (signo == SIGINT) { @@ -869,6 +870,10 @@ bool llama_bootstrap(const char *model_path, void* state_pr) int llama_predict(void* params_ptr, void* state_pr) { gpt_params params = *(gpt_params*) params_ptr; llama_state state = *(llama_state*) state_pr; + gpt_vocab vocab = state.vocab; + llama_model model = state.model; + + if (params.seed < 0) { params.seed = time(NULL); } @@ -882,12 +887,12 @@ int llama_predict(void* params_ptr, void* state_pr) { std::vector logits; // tokenize the prompt - std::vector embd_inp = ::llama_tokenize(state.vocab, params.prompt, true); + std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); - params.n_predict = std::min(params.n_predict, state.model.hparams.n_ctx - (int) embd_inp.size()); + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); // tokenize the reverse prompt - std::vector antiprompt_inp = ::llama_tokenize(state.vocab, params.antiprompt, false); + std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); fprintf(stderr, "\n"); /*fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); @@ -923,9 +928,9 @@ int llama_predict(void* params_ptr, void* state_pr) { // determine the required inference memory per token: size_t mem_per_token = 0; - llama_eval(state.model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); + llama_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - llama_eval(state.model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); int last_n_size = params.repeat_last_n; std::vector last_n_tokens(last_n_size); @@ -960,7 +965,7 @@ int llama_predict(void* params_ptr, void* state_pr) { if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); - if (!llama_eval(state.model, params.n_threads, n_past, embd, logits, mem_per_token)) { + if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { fprintf(stderr, "Failed to predict\n"); return 1; } @@ -978,14 +983,14 @@ int llama_predict(void* params_ptr, void* state_pr) { const float temp = params.temp; const float repeat_penalty = params.repeat_penalty; - const int n_vocab = state.model.hparams.n_vocab; + const int n_vocab = model.hparams.n_vocab; gpt_vocab::id id = 0; { const int64_t t_start_sample_us = ggml_time_us(); - id = llama_sample_top_p_top_k(state.vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); + id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); @@ -1022,7 +1027,7 @@ int llama_predict(void* params_ptr, void* state_pr) { // display text if (!input_noecho) { for (auto id : embd) { - printf("%s", state.vocab.id_to_token[id].c_str()); + printf("%s", vocab.id_to_token[id].c_str()); } fflush(stdout); } @@ -1060,7 +1065,7 @@ int llama_predict(void* params_ptr, void* state_pr) { buf[n_read+1] = 0; } - std::vector line_inp = ::llama_tokenize(state.vocab, buf, false); + std::vector line_inp = ::llama_tokenize(vocab, buf, false); 
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); remaining_tokens -= line_inp.size(); diff --git a/main.h b/main.h index a30304372125f..536ebd209018b 100644 --- a/main.h +++ b/main.h @@ -1,4 +1,3 @@ -// num.h #ifdef __cplusplus extern "C" { #endif From bf383913ab43b5845dce9da8982ade91a334af6f Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 13:21:31 -0600 Subject: [PATCH 09/44] simplify quantize instructions --- README.md | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/README.md b/README.md index 6b1614ae872fb..6ba7de8db8b6e 100644 --- a/README.md +++ b/README.md @@ -49,22 +49,7 @@ ls ./models python3 convert-pth-to-ggml.py models/7B/ 1 # quantize the model to 4-bits -./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2 -``` - -For the bigger models, there are a few extra quantization steps. For example, for LLaMA-13B, converting to FP16 format -will create 2 ggml files, instead of one: - -```bash -ggml-model-f16.bin -ggml-model-f16.bin.1 -``` - -You need to quantize each of them separately like this: - -```bash -./quantize ./models/13B/ggml-model-f16.bin ./models/13B/ggml-model-q4_0.bin 2 -./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2 +./quantize.sh 7B ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. From 9f9d3838913528bb10c17286fb24a4c467eac318 Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 13:42:25 -0600 Subject: [PATCH 10/44] do not print error on ctrl-c --- main.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/main.go b/main.go index 718610f62b224..cde0d43b45680 100644 --- a/main.go +++ b/main.go @@ -8,6 +8,7 @@ import ( "bufio" "flag" "fmt" + "io" "os" ) @@ -42,6 +43,9 @@ func main() { fmt.Print(">>> ") text, err := reader.ReadString('\n') if err != nil { + if err == io.EOF { + os.Exit(0) + } fmt.Printf("Reading the prompt failed: %s", err) os.Exit(1) } From 86770550bdb999515e87ba7608e357195538a67f Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 18:30:56 -0600 Subject: [PATCH 11/44] minor improvements --- Makefile | 9 +++++---- main.cpp | 45 ++++++++++++++++++++++----------------------- main.go | 6 +++--- main.h | 2 +- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/Makefile b/Makefile index b96d694e799bb..d2862bb6f05d1 100644 --- a/Makefile +++ b/Makefile @@ -189,10 +189,11 @@ utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o clean: - rm -f *.o *.a quantize llama-go + rm -f *.o main quantize + rm -f *.a llama-go -main.o: ggml.o utils.o - $(CXX) $(CXXFLAGS) -c main.cpp -o main.o $(LDFLAGS) +main.o: main.cpp ggml.o utils.o + $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main.o -c $(LDFLAGS) libllama.a: main.o ggml.o utils.o ar src libllama.a main.o ggml.o utils.o @@ -200,7 +201,7 @@ libllama.a: main.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) -llama-go: +llama-go: main.go main.cpp main.h CGO_CFLAGS_ALLOW='-mf.*' go build . 
# # Tests diff --git a/main.cpp b/main.cpp index 56c319d72b9ec..28074175675c7 100644 --- a/main.cpp +++ b/main.cpp @@ -827,37 +827,21 @@ int main(int argc, char ** argv) { */ -void* llama_allocate_state() { - return new llama_state; -} - -void* llama_allocate_params(const char *input, int threads, int tokens) { - gpt_params* params = new gpt_params; - params->prompt = input; - params->n_threads = threads; - params->n_predict = tokens; - return params; -} - -void llama_free_params(void* params_ptr) { - gpt_params* params = (gpt_params*) params_ptr; - delete params; -} - -bool llama_bootstrap(const char *model_path, void* state_pr) +int llama_bootstrap(const char *model_path, void* state_pr) // load the model { ggml_time_init(); llama_state* state = (llama_state*) state_pr; + const int64_t t_start_us = ggml_time_us(); if (!llama_model_load(model_path, state->model, state->vocab, 512)) { // TODO: set context from user input ?? fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); - return false; + return 1; } state->timing.t_load_us = ggml_time_us() - t_start_us; - return true; + return 0; } /* // print system information @@ -887,7 +871,7 @@ int llama_predict(void* params_ptr, void* state_pr) { std::vector logits; // tokenize the prompt - std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); + std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); @@ -928,8 +912,6 @@ int llama_predict(void* params_ptr, void* state_pr) { // determine the required inference memory per token: size_t mem_per_token = 0; - llama_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); int last_n_size = params.repeat_last_n; @@ -1101,3 +1083,20 @@ int llama_predict(void* params_ptr, void* state_pr) { */ return 0; } + +void* llama_allocate_state() { + return new llama_state; +} + +void* llama_allocate_params(const char *input, int threads, int tokens) { + gpt_params* params = new gpt_params; + params->prompt = input; + params->n_threads = threads; + params->n_predict = tokens; + return params; +} + +void llama_free_params(void* params_ptr) { + gpt_params* params = (gpt_params*) params_ptr; + delete params; +} diff --git a/main.go b/main.go index cde0d43b45680..c3f265ab30fe4 100644 --- a/main.go +++ b/main.go @@ -31,8 +31,8 @@ func main() { fmt.Printf("Loading model %s...\n", model) modelPath := C.CString(model) - success := C.llama_bootstrap(modelPath, state) - if !success { + result := C.llama_bootstrap(modelPath, state) + if result != 0 { fmt.Println("Loading the model failed") os.Exit(1) } @@ -52,7 +52,7 @@ func main() { input := C.CString(text) params := C.llama_allocate_params(input, C.int(threads), C.int(tokens)) - result := C.llama_predict(params, state) + result = C.llama_predict(params, state) if result == 2 { fmt.Println("Predicting failed") os.Exit(1) diff --git a/main.h b/main.h index 536ebd209018b..5cf54c62ce3d5 100644 --- a/main.h +++ b/main.h @@ -6,7 +6,7 @@ extern "C" { void *llama_allocate_state(); -bool llama_bootstrap(const char *model_path, void *state_pr); +int llama_bootstrap(const char *model_path, void *state_pr); void* llama_allocate_params(const char *input, int threads, int tokens); void llama_free_params(void* params_ptr); From b1f9a795505ad5db984c24ceed43a537ac63b836 Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 21:05:40 -0600 Subject: 
[PATCH 12/44] allow modifying parameters at runtime --- README.md | 6 +++ main.cpp | 11 +++++- main.go | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- main.h | 3 +- 4 files changed, 124 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6ba7de8db8b6e..e42c1a86f6c15 100644 --- a/README.md +++ b/README.md @@ -73,3 +73,9 @@ Duck and Cover, Two Fleas, One Duck >>> ``` + +The settings can be changed at runtime, multiple values are possible: +```bash +>>> seed=1234 threads=8 +Current settings: repeat_penalty=1.3 seed=1234 temp=0.8 threads=8 tokens=128 top_k=40 top_p=0.95 +``` diff --git a/main.cpp b/main.cpp index 28074175675c7..415b1d5bde62c 100644 --- a/main.cpp +++ b/main.cpp @@ -1088,11 +1088,18 @@ void* llama_allocate_state() { return new llama_state; } -void* llama_allocate_params(const char *input, int threads, int tokens) { +void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, + float top_p, float temp, float repeat_penalty) { gpt_params* params = new gpt_params; - params->prompt = input; + params->seed = seed; params->n_threads = threads; params->n_predict = tokens; + params->top_k = top_k; + params->top_p = top_p; + params->n_predict = tokens; + params->temp = temp; + params->repeat_penalty = repeat_penalty; + params->prompt = prompt; return params; } diff --git a/main.go b/main.go index c3f265ab30fe4..9226c59a06d70 100644 --- a/main.go +++ b/main.go @@ -10,11 +10,35 @@ import ( "fmt" "io" "os" + "reflect" + "sort" + "strconv" + "strings" +) + +var ( + seed = -1 + threads = 0 + tokens = 0 + + topK = 40 + topP = 0.95 + temp = 0.80 + repeatPenalty = 1.30 + + options = map[string]any{ + "repeat_penalty": &repeatPenalty, + "seed": &seed, + "temp": &temp, + "threads": &threads, + "tokens": &tokens, + "top_k": &topK, + "top_p": &topP, + } ) func main() { var model string - var threads, tokens int flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") @@ -36,9 +60,11 @@ func main() { fmt.Println("Loading the model failed") os.Exit(1) } - fmt.Printf("Model loaded successfully.\n\n") + fmt.Printf("Model loaded successfully.\n") + printSettings() reader := bufio.NewReader(os.Stdin) + for { fmt.Print(">>> ") text, err := reader.ReadString('\n') @@ -50,8 +76,18 @@ func main() { os.Exit(1) } + optionChanged, err := handleParameterChange(text) + if err != nil { + fmt.Printf("Reading the prompt failed: %s", err) + os.Exit(1) + } + if optionChanged { + continue + } + input := C.CString(text) - params := C.llama_allocate_params(input, C.int(threads), C.int(tokens)) + params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), + C.float(topP), C.float(temp), C.float(repeatPenalty)) result = C.llama_predict(params, state) if result == 2 { fmt.Println("Predicting failed") @@ -63,3 +99,71 @@ func main() { fmt.Printf("\n\n") } } + +// handleParameterChange parses the input for any parameter changes. +// This is a generic function that can handle int and float type parameters. +// The parameters need to be referenced by pointer in the options map. 
+func handleParameterChange(input string) (bool, error) { + optionChanged := false + words := strings.Split(input, " ") + + for _, word := range words { + parsed := strings.Split(word, "=") + + if len(parsed) < 2 { + break + } + + s := strings.TrimSpace(parsed[0]) + opt, ok := options[s] + if !ok { + break + } + + val := reflect.ValueOf(opt) + if val.Kind() != reflect.Ptr { + return false, fmt.Errorf("option %s is not a pointer", s) + } + val = val.Elem() + argument := strings.TrimSpace(parsed[1]) + optionChanged = true + + switch val.Kind() { + case reflect.Int: + i, err := strconv.ParseInt(argument, 10, 64) + if err != nil { + return false, fmt.Errorf("parsing value '%s' as int: %w", argument, err) + } + val.SetInt(i) + + case reflect.Float32, reflect.Float64: + f, err := strconv.ParseFloat(argument, 64) + if err != nil { + return false, fmt.Errorf("parsing value '%s' as float: %w", argument, err) + } + val.SetFloat(f) + + default: + return false, fmt.Errorf("unsupported option %s type %T", s, opt) + } + } + + if optionChanged { + printSettings() + } + return optionChanged, nil +} + +func printSettings() { + var settings sort.StringSlice + for setting, value := range options { + val := reflect.ValueOf(value) + if val.Kind() == reflect.Ptr { + val = val.Elem() + } + settings = append(settings, fmt.Sprintf("%s=%v", setting, val.Interface())) + } + sort.Sort(settings) + s := strings.Join(settings, " ") + fmt.Printf("Current settings: %s\n\n", s) +} diff --git a/main.h b/main.h index 5cf54c62ce3d5..f8881d82479f2 100644 --- a/main.h +++ b/main.h @@ -8,7 +8,8 @@ void *llama_allocate_state(); int llama_bootstrap(const char *model_path, void *state_pr); -void* llama_allocate_params(const char *input, int threads, int tokens); +void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, + float top_p, float temp, float repeat_penalty); void llama_free_params(void* params_ptr); int llama_predict(void* params_ptr, void* state_pr); From 65e4616f6f8670e691fb418a8d9288015bc940ba Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 21:10:25 -0600 Subject: [PATCH 13/44] change settings string and fix ci macos compilation --- README.md | 2 +- main.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e42c1a86f6c15..63e462356a5f6 100644 --- a/README.md +++ b/README.md @@ -77,5 +77,5 @@ Duck and Cover, Two Fleas, One Duck The settings can be changed at runtime, multiple values are possible: ```bash >>> seed=1234 threads=8 -Current settings: repeat_penalty=1.3 seed=1234 temp=0.8 threads=8 tokens=128 top_k=40 top_p=0.95 +Settings: repeat_penalty=1.3 seed=1234 temp=0.8 threads=8 tokens=128 top_k=40 top_p=0.95 ``` diff --git a/main.go b/main.go index 9226c59a06d70..11a16fb7d7877 100644 --- a/main.go +++ b/main.go @@ -26,7 +26,7 @@ var ( temp = 0.80 repeatPenalty = 1.30 - options = map[string]any{ + options = map[string]interface{}{ "repeat_penalty": &repeatPenalty, "seed": &seed, "temp": &temp, @@ -165,5 +165,5 @@ func printSettings() { } sort.Sort(settings) s := strings.Join(settings, " ") - fmt.Printf("Current settings: %s\n\n", s) + fmt.Printf("Settings: %s\n\n", s) } From 5594cda385e753928e1dc702b3ed2f57e1a99f8c Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 22:56:04 -0600 Subject: [PATCH 14/44] fix prediction result error handling --- main.cpp | 2 +- main.go | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/main.cpp b/main.cpp index 415b1d5bde62c..9817108821fca 100644 --- 
a/main.cpp +++ b/main.cpp @@ -948,7 +948,7 @@ int llama_predict(void* params_ptr, void* state_pr) { const int64_t t_start_us = ggml_time_us(); if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - fprintf(stderr, "Failed to predict\n"); +// fprintf(stderr, "Failed to predict\n"); return 1; } diff --git a/main.go b/main.go index 11a16fb7d7877..39790ec9fe326 100644 --- a/main.go +++ b/main.go @@ -89,8 +89,10 @@ func main() { params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), C.float(topP), C.float(temp), C.float(repeatPenalty)) result = C.llama_predict(params, state) - if result == 2 { - fmt.Println("Predicting failed") + switch result { + case 0, 2: + case 1: + fmt.Println("\nPredicting failed") os.Exit(1) } From 20080e8cdbc62f192cf944fe9b35da352cce059e Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 23:13:27 -0600 Subject: [PATCH 15/44] allow multiline input for prompts --- main.go | 56 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/main.go b/main.go index 39790ec9fe326..7af938f626c7f 100644 --- a/main.go +++ b/main.go @@ -66,8 +66,34 @@ func main() { reader := bufio.NewReader(os.Stdin) for { - fmt.Print(">>> ") - text, err := reader.ReadString('\n') + text := readMultiLineInput(reader) + + input := C.CString(text) + params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), + C.float(topP), C.float(temp), C.float(repeatPenalty)) + result = C.llama_predict(params, state) + switch result { + case 0: + case 1: + fmt.Println("\nPredicting failed") + os.Exit(1) + case 2: + fmt.Printf(" ") + } + + C.llama_free_params(params) + + fmt.Printf("\n\n") + } +} + +// readMultiLineInput reads input until an empty line is entered. +func readMultiLineInput(reader *bufio.Reader) string { + var lines []string + fmt.Print(">>> ") + + for { + line, err := reader.ReadString('\n') if err != nil { if err == io.EOF { os.Exit(0) @@ -76,30 +102,26 @@ func main() { os.Exit(1) } - optionChanged, err := handleParameterChange(text) + if len(strings.TrimSpace(line)) == 0 { + break + } + + optionChanged, err := handleParameterChange(line) if err != nil { fmt.Printf("Reading the prompt failed: %s", err) os.Exit(1) } if optionChanged { + lines = nil + fmt.Print(">>> ") continue } - input := C.CString(text) - params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), - C.float(topP), C.float(temp), C.float(repeatPenalty)) - result = C.llama_predict(params, state) - switch result { - case 0, 2: - case 1: - fmt.Println("\nPredicting failed") - os.Exit(1) - } - - C.llama_free_params(params) - - fmt.Printf("\n\n") + lines = append(lines, line) } + + text := strings.Join(lines, "") + return text } // handleParameterChange parses the input for any parameter changes. 
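Taken together, patches 12-15 give the interactive loop its runtime-settings mechanism: option names map to pointers, a line such as `seed=1234 threads=8` is split into `key=value` pairs, and reflection writes each parsed value through the matching pointer before the next prompt is read. The standalone sketch below illustrates just that pointer-map-plus-reflection idea outside the cgo REPL; the `applySettings` name and the trimmed-down option set are chosen here for illustration and do not appear in the repository.

```go
package main

import (
	"fmt"
	"reflect"
	"strconv"
	"strings"
)

// A reduced model of the options map from main.go: every entry is a pointer,
// so reflection can write the parsed value back into the package-level variable.
var (
	seed    = -1
	threads = 4
	topP    = 0.95
)

var options = map[string]interface{}{
	"seed":    &seed,
	"threads": &threads,
	"top_p":   &topP,
}

// applySettings parses "key=value" pairs and updates the pointed-to variables,
// mirroring what handleParameterChange does for the REPL.
func applySettings(input string) error {
	for _, word := range strings.Fields(input) {
		kv := strings.SplitN(word, "=", 2)
		if len(kv) != 2 {
			continue // not a key=value pair, ignore
		}
		ptr, ok := options[kv[0]]
		if !ok {
			continue // unknown option, ignore
		}
		val := reflect.ValueOf(ptr).Elem()
		switch val.Kind() {
		case reflect.Int:
			i, err := strconv.ParseInt(kv[1], 10, 64)
			if err != nil {
				return fmt.Errorf("parsing %q as int: %w", kv[1], err)
			}
			val.SetInt(i)
		case reflect.Float64:
			f, err := strconv.ParseFloat(kv[1], 64)
			if err != nil {
				return fmt.Errorf("parsing %q as float: %w", kv[1], err)
			}
			val.SetFloat(f)
		}
	}
	return nil
}

func main() {
	if err := applySettings("seed=1234 threads=8 top_p=0.9"); err != nil {
		panic(err)
	}
	fmt.Println(seed, threads, topP) // prints: 1234 8 0.9
}
```

Keeping the values behind pointers in a single map is what lets one generic parser serve every option and lets printSettings render them all in one sorted pass.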
From b94e476c3d353861fb1c4537af6af913efdf581c Mon Sep 17 00:00:00 2001 From: cornelk Date: Tue, 14 Mar 2023 18:43:15 -0600 Subject: [PATCH 16/44] allow setting of repeat_last_n --- main.cpp | 6 ++++-- main.go | 15 +++++++++------ main.h | 4 ++-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/main.cpp b/main.cpp index 9817108821fca..4aff0c90b813a 100644 --- a/main.cpp +++ b/main.cpp @@ -1089,16 +1089,18 @@ void* llama_allocate_state() { } void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, - float top_p, float temp, float repeat_penalty) { + float top_p, float temp, float repeat_penalty, int repeat_last_n) { gpt_params* params = new gpt_params; params->seed = seed; params->n_threads = threads; params->n_predict = tokens; + params->repeat_last_n = repeat_last_n; + params->top_k = top_k; params->top_p = top_p; - params->n_predict = tokens; params->temp = temp; params->repeat_penalty = repeat_penalty; + params->prompt = prompt; return params; } diff --git a/main.go b/main.go index 7af938f626c7f..7513a74db0abf 100644 --- a/main.go +++ b/main.go @@ -17,9 +17,10 @@ import ( ) var ( - seed = -1 - threads = 0 - tokens = 0 + repeatLastN = 64 + seed = -1 + threads = 4 + tokens = 128 topK = 40 topP = 0.95 @@ -27,11 +28,12 @@ var ( repeatPenalty = 1.30 options = map[string]interface{}{ + "repeat_last_n": &repeatLastN, // last n tokens to penalize "repeat_penalty": &repeatPenalty, - "seed": &seed, + "seed": &seed, // RNG seed, -1 will seed based on current time "temp": &temp, "threads": &threads, - "tokens": &tokens, + "tokens": &tokens, // new tokens to predict "top_k": &topK, "top_p": &topP, } @@ -70,7 +72,7 @@ func main() { input := C.CString(text) params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), - C.float(topP), C.float(temp), C.float(repeatPenalty)) + C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) result = C.llama_predict(params, state) switch result { case 0: @@ -178,6 +180,7 @@ func handleParameterChange(input string) (bool, error) { return optionChanged, nil } +// printSettings outputs the current settings, alphabetically sorted. 
func printSettings() { var settings sort.StringSlice for setting, value := range options { diff --git a/main.h b/main.h index f8881d82479f2..9e2ad5203de30 100644 --- a/main.h +++ b/main.h @@ -8,8 +8,8 @@ void *llama_allocate_state(); int llama_bootstrap(const char *model_path, void *state_pr); -void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, - float top_p, float temp, float repeat_penalty); +void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, + int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n); void llama_free_params(void* params_ptr); int llama_predict(void* params_ptr, void* state_pr); From 9f57e389da313e0f9fe74841963e41e00df5c03d Mon Sep 17 00:00:00 2001 From: cornelk Date: Tue, 14 Mar 2023 18:56:26 -0600 Subject: [PATCH 17/44] disable windows build in ci --- .github/workflows/build.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1a068ae75f966..c4377bef8fd4b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,19 +33,19 @@ jobs: run: | make - windows-latest: - runs-on: windows-latest - - steps: - - name: Clone - uses: actions/checkout@v1 - - - name: Build - run: | - mkdir build - cd build - cmake .. - cmake --build . --config Release +# windows-latest: +# runs-on: windows-latest +# +# steps: +# - name: Clone +# uses: actions/checkout@v1 +# +# - name: Build +# run: | +# mkdir build +# cd build +# cmake .. +# cmake --build . --config Release # ubuntu-latest-gcc: # runs-on: ubuntu-latest From 3406313d5a591dc9f693c1b3f9edf51038874059 Mon Sep 17 00:00:00 2001 From: Cornel Date: Tue, 14 Mar 2023 20:28:18 -0600 Subject: [PATCH 18/44] fix and reenable windows ci build (#1) --- .github/workflows/build.yml | 26 +++++++++++++------------- CMakeLists.txt | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c4377bef8fd4b..1a068ae75f966 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,19 +33,19 @@ jobs: run: | make -# windows-latest: -# runs-on: windows-latest -# -# steps: -# - name: Clone -# uses: actions/checkout@v1 -# -# - name: Build -# run: | -# mkdir build -# cd build -# cmake .. -# cmake --build . --config Release + windows-latest: + runs-on: windows-latest + + steps: + - name: Clone + uses: actions/checkout@v1 + + - name: Build + run: | + mkdir build + cd build + cmake .. + cmake --build . --config Release # ubuntu-latest-gcc: # runs-on: ubuntu-latest diff --git a/CMakeLists.txt b/CMakeLists.txt index ca3be38a55740..fc488f9eb715f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,7 +104,7 @@ endif() # set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF) # endif() -add_executable(llama +add_library(llama main.cpp utils.cpp utils.h) @@ -120,7 +120,7 @@ add_library(ggml target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS}) target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS}) -target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS}) +target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS} -DQUANTIZE) target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS}) target_include_directories(ggml PUBLIC .) 
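The next patch ("Make model return strings") is the largest change to the wrapper so far: main.cpp becomes lama.cpp, the interactive/ANSI-console code is stripped out, and llama_predict accumulates the generated tokens in a std::string and strcpy()s it into a buffer supplied by the caller instead of printing as it goes. The fragment below is a condensed, commented view of the matching Go call site; it is not self-contained and assumes the surrounding main.go from that patch (the `import "C"` preamble, the `state` pointer and the option variables), exactly as in the diff.

```go
// Condensed from the main.go changes in the next patch: the C++ side now
// copies its output into a caller-owned buffer rather than writing to stdout.
out := make([]byte, tokens) // buffer that llama_predict strcpy()s the result into
params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK),
	C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN))
C.llama_predict(params, state, (*C.char)(unsafe.Pointer(&out[0])))

// C.GoString stops at the first NUL byte; the model echoes the prompt back,
// so it is trimmed off before printing.
res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
res = strings.TrimPrefix(res, text)
C.llama_free_params(params)
```

Passing `(*C.char)(unsafe.Pointer(&out[0]))` hands the C++ side a pointer into Go-allocated memory for the duration of the call; sizing that buffer from the requested token count is the assumption the patch itself makes.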
From 6a646ea7515d8b6651f53a195f9a827c542591cb Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 21:19:22 +0100 Subject: [PATCH 19/44] Make model return strings --- Makefile | 14 ++--- main.cpp => lama.cpp | 121 ++++++------------------------------------- main.h => lama.h | 2 +- main.go | 22 ++++---- 4 files changed, 35 insertions(+), 124 deletions(-) rename main.cpp => lama.cpp (88%) rename main.h => lama.h (86%) diff --git a/Makefile b/Makefile index d2862bb6f05d1..6281674cce1bc 100644 --- a/Makefile +++ b/Makefile @@ -176,7 +176,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main.o quantize libllama.a llama-go +default: lama.o quantize libllama.a llama-go # # Build library @@ -189,19 +189,19 @@ utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o clean: - rm -f *.o main quantize + rm -f *.o lama quantize rm -f *.a llama-go -main.o: main.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main.o -c $(LDFLAGS) +lama.o: lama.cpp ggml.o utils.o + $(CXX) $(CXXFLAGS) lama.cpp ggml.o utils.o -o lama.o -c $(LDFLAGS) -libllama.a: main.o ggml.o utils.o - ar src libllama.a main.o ggml.o utils.o +libllama.a: lama.o ggml.o utils.o + ar src libllama.a lama.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) -llama-go: main.go main.cpp main.h +llama-go: main.go lama.cpp main.h CGO_CFLAGS_ALLOW='-mf.*' go build . # # Tests diff --git a/main.cpp b/lama.cpp similarity index 88% rename from main.cpp rename to lama.cpp index e9293c63a3428..efd2ac28ceff0 100644 --- a/main.cpp +++ b/lama.cpp @@ -1,5 +1,5 @@ #include "ggml.h" -#include "main.h" +#include "lama.h" #include "utils.h" #include @@ -855,7 +855,7 @@ int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx) params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } */ -int llama_predict(void* params_ptr, void* state_pr) { +int llama_predict(void* params_ptr, void* state_pr, char* result) { gpt_params params = *(gpt_params*) params_ptr; llama_state state = *(llama_state*) state_pr; gpt_vocab vocab = state.vocab; @@ -882,38 +882,8 @@ int llama_predict(void* params_ptr, void* state_pr) { // tokenize the reverse prompt std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); - fprintf(stderr, "\n"); - /*fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); - } - fprintf(stderr, "\n"); - if (params.interactive) { -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = sigint_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -#elif defined (_WIN32) - signal(SIGINT, sigint_handler); -#endif - - fprintf(stderr, "%s: interactive mode on.\n", __func__); - - if(antiprompt_inp.size()) { - fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); - fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); - for (int i = 0; i < (int) antiprompt_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); - } - fprintf(stderr, "\n"); - 
} - } - fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); - fprintf(stderr, "\n\n"); -*/ + //fprintf(stderr, "\n"); + std::vector embd; // determine the required inference memory per token: @@ -925,36 +895,25 @@ int llama_predict(void* params_ptr, void* state_pr) { std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - if (params.interactive) { - fprintf(stderr, "== Running in interactive mode. ==\n" -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - " - Press Ctrl+C to interject at any time.\n" -#endif - " - Press Return to return control to LLaMa.\n" - " - If you want to submit another line, end your input in '\\'.\n"); - } + int remaining_tokens = params.n_predict; int input_consumed = 0; bool input_noecho = false; - // prompt user immediately after the starting prompt has been loaded - if (params.interactive_start) { - is_interacting = true; - } + std::string res = ""; - // set the color for the prompt which will be output initially - if (params.use_color) { - printf(ANSI_COLOR_YELLOW); - } - - while (remaining_tokens > 0) { + while (true) { + if (params.n_predict != 0 && remaining_tokens <= 0) { + break; + } // predict if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { // fprintf(stderr, "Failed to predict\n"); + strcpy(result, res.c_str()); return 1; } @@ -1005,70 +964,21 @@ int llama_predict(void* params_ptr, void* state_pr) { break; } } - - // reset color to default if we there is no pending user input - if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) { - printf(ANSI_COLOR_RESET); - } } // display text if (!input_noecho) { for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - } - - // in interactive mode, and not currently processing queued inputs; - // check if we should prompt the user for more - if (params.interactive && embd_inp.size() <= input_consumed) { - // check for reverse prompt - if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) { - // reverse prompt found - is_interacting = true; - } - if (is_interacting) { - // currently being interactive - bool another_line=true; - while (another_line) { - fflush(stdout); - char buf[256] = {0}; - int n_read; - if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); - if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { - // presumable empty line, consume the newline - scanf("%*c"); - n_read=0; - } - if(params.use_color) printf(ANSI_COLOR_RESET); - - if (n_read > 0 && buf[n_read-1]=='\\') { - another_line = true; - buf[n_read-1] = '\n'; - buf[n_read] = 0; - } else { - another_line = false; - buf[n_read] = '\n'; - buf[n_read+1] = 0; - } - - std::vector line_inp = ::llama_tokenize(vocab, buf, false); - embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); - - remaining_tokens -= line_inp.size(); - - input_noecho = true; // do not echo this again - } - - is_interacting = false; + res += vocab.id_to_token[id].c_str(); } } + // end of text token if (embd.back() == 2) { // fprintf(stderr, " [end of text]\n"); - return 2; + // return 2; + break; } } /* @@ -1094,6 +1004,7 @@ int llama_predict(void* params_ptr, void* state_pr) { printf(ANSI_COLOR_RESET); } */ + strcpy(result, res.c_str()); return 0; } diff --git 
a/main.h b/lama.h similarity index 86% rename from main.h rename to lama.h index 2b6b77f5f4d43..fc708bce51031 100644 --- a/main.h +++ b/lama.h @@ -12,7 +12,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n); void llama_free_params(void* params_ptr); -int llama_predict(void* params_ptr, void* state_pr); +int llama_predict(void* params_ptr, void* state_pr, char* result); #ifdef __cplusplus } diff --git a/main.go b/main.go index afa210e164393..7a484ba961d49 100644 --- a/main.go +++ b/main.go @@ -2,7 +2,7 @@ package main // #cgo CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mavx -mavx2 -mfma -mf16c -msse3 // #cgo CXXFLAGS: -O3 -DNDEBUG -std=c++11 -fPIC -pthread -I. -// #include "main.h" +// #include "lama.h" import "C" import ( "bufio" @@ -11,9 +11,11 @@ import ( "io" "os" "reflect" + "runtime" "sort" "strconv" "strings" + "unsafe" ) var ( @@ -46,7 +48,7 @@ func main() { flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") - flags.IntVar(&threads, "t", 4, "number of threads to use during computation") + flags.IntVar(&threads, "t", runtime.NumCPU(), "number of threads to use during computation") flags.IntVar(&tokens, "n", 128, "number of tokens to predict") err := flags.Parse(os.Args[1:]) @@ -73,17 +75,14 @@ func main() { text := readMultiLineInput(reader) input := C.CString(text) + out := make([]byte, tokens) params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) - result = C.llama_predict(params, state) - switch result { - case 0: - case 1: - fmt.Println("\nPredicting failed") - os.Exit(1) - case 2: - fmt.Printf(" ") - } + C.llama_predict(params, state, (*C.char)(unsafe.Pointer(&out[0]))) + res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) + + res = strings.TrimPrefix(res, text) + fmt.Printf("\ngolang: %s\n", res) C.llama_free_params(params) @@ -125,6 +124,7 @@ func readMultiLineInput(reader *bufio.Reader) string { } text := strings.Join(lines, "") + fmt.Println("Sending", text) return text } From 00f34c1452aa2ee5a423efffd1dc6b21d5e659c8 Mon Sep 17 00:00:00 2001 From: Matvey Soloviev Date: Fri, 17 Mar 2023 05:48:39 +0100 Subject: [PATCH 20/44] Q4_1 quantization (#193) * Add AVX2 version of ggml_vec_dot_q4_1 * Small optimisations to q4_1 dot product (@Const-me) * Rearrange Q4_1 quantization to work for multipart models. 
(Fix #152) * Fix ggml_vec_mad_q4_1 too * Fix non-vectorised q4_1 vec mul --- ggml.c | 149 ++++++++++++++++++++++++++++++++++++++++++------------ utils.cpp | 20 +++++--- 2 files changed, 130 insertions(+), 39 deletions(-) diff --git a/ggml.c b/ggml.c index 535c7b7d281dd..c4f8389171026 100644 --- a/ggml.c +++ b/ggml.c @@ -607,10 +607,11 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { assert(k % QK == 0); const int nb = k / QK; + const size_t bs = 2*sizeof(float) + QK/2; - float * restrict pm = (float *) (y); - float * restrict pd = (float *) (pm + nb); - uint8_t * restrict pb = (uint8_t *) (pd + nb); + uint8_t * restrict pd = ((uint8_t *)y + 0*bs); + uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float)); + uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float)); uint8_t pp[QK/2]; @@ -627,8 +628,10 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - pm[i] = min; - pd[i] = d; + *(float *)pm = min; + *(float *)pd = d; + pm += bs; + pd += bs; for (int l = 0; l < QK; l += 2) { const float v0 = (x[i*QK + l + 0] - min)*id; @@ -643,7 +646,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb + i*QK/2, pp, sizeof(pp)); + memcpy(pb, pp, sizeof(pp)); + pb += bs; } } @@ -687,16 +691,17 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { assert(k % QK == 0); const int nb = k / QK; + const size_t bs = 2*sizeof(float) + QK/2; - const float * restrict pm = (const float *) (x); - const float * restrict pd = (const float *) (pm + nb); - const uint8_t * restrict pb = (const uint8_t *) (pd + nb); + const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { - const float m = pm[i]; - const float d = pd[i]; + const float d = *(const float *) (pd + i*bs); + const float m = *(const float *) (pm + i*bs); - const uint8_t * restrict pp = pb + i*QK/2; + const uint8_t * restrict pp = pb + i*bs; for (int l = 0; l < QK; l += 2) { const uint8_t vi = pp[l/2]; @@ -1584,28 +1589,109 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) { const int nb = n / QK; - const float * restrict pm0 = (const float *) x; - const float * restrict pm1 = (const float *) y; + const size_t bs = 2*sizeof(float) + QK/2; - const float * restrict pd0 = (const float *) (pm0 + nb); - const float * restrict pd1 = (const float *) (pm1 + nb); + const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs); + + const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float)); - const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb); - const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb); + const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); + const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float)); float sumf = 0.0; -#if 1 +#if defined(__AVX2__) +#if QK == 32 + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + // Accumulator for constant 
offsets + float acc_offset = 0.0f; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float * m0 = (const float *) (pm0 + i*bs); + const float * m1 = (const float *) (pm1 + i*bs); + + const float * d0 = (const float *) (pd0 + i*bs); + const float * d1 = (const float *) (pd1 + i*bs); + + const uint8_t * restrict p0 = pb0 + i*bs; + const uint8_t * restrict p1 = pb1 + i*bs; + + const __m256 d0v = _mm256_broadcast_ss( d0 ); + const __m256 d1v = _mm256_broadcast_ss( d1 ); + const __m256 m0v = _mm256_broadcast_ss( m0 ); + const __m256 m1v = _mm256_broadcast_ss( m1 ); + + + // Compute combined scale for the block + const __m256 scale_01 = _mm256_mul_ps( d0v, d1v ); + + // Compute cross scales for the block + const __m256 scale_0 = _mm256_mul_ps( d0v, m1v ); + const __m256 scale_1 = _mm256_mul_ps( m0v, d1v ); + const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + __m256i bx = bytesFromNibbles( p0 ); + __m256i by = bytesFromNibbles( p1 ); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. + + // Sign-extend first 16 signed bytes into int16_t + __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) ); + __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); + // Compute products of int16_t integers, add pairwise + __m256i i32 = _mm256_madd_epi16( x16, y16 ); + + // Sign-extend last 16 signed bytes into int16_t vectors + __m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) ); + __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); + // Accumulate products of int16_t integers + i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) ); + + // compute sums of unsigned bytes in bx, by in blocks of 8. + // This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000, + // which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400. 
+ // so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ] + __m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() ); + __m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() ); + __m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) ); + __m256 sums = _mm256_cvtepi32_ps( sumsi ); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps( i32 ); + // Apply the scale, and accumulate + // acc += d0*d1*x*y + d0*m1*x + d1*m0*y + acc = _mm256_fmadd_ps( scale_01, p, acc ); + acc = _mm256_fmadd_ps( cross_scales, sums, acc ); + // acc_offset += m0*m1 (for each entry in the block) + acc_offset += (*m0)*(*m1); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm256_extractf128_ps( acc, 1 ); + res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); + res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); + res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); + + sumf = _mm_cvtss_f32( res ) + acc_offset * QK; +#else +#error "not implemented for QK" +#endif +#else // scalar for (int i = 0; i < nb; i++) { - const float m0 = pm0[i]; - const float m1 = pm1[i]; + const float m0 = *(const float *) (pm0 + i*bs); + const float m1 = *(const float *) (pm1 + i*bs); - const float d0 = pd0[i]; - const float d1 = pd1[i]; + const float d0 = *(const float *) (pd0 + i*bs); + const float d1 = *(const float *) (pd1 + i*bs); - const uint8_t * restrict p0 = pb0 + i*QK/2; - const uint8_t * restrict p1 = pb1 + i*QK/2; + const uint8_t * restrict p0 = pb0 + i*bs; + const uint8_t * restrict p1 = pb1 + i*bs; for (int j = 0; j < QK/2; j++) { const uint8_t v0 = p0[j]; @@ -1839,16 +1925,17 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res assert(n % QK == 0); const int nb = n / QK; + const size_t bs = 2*sizeof(float) + QK/2; - const float * restrict pm = (const float *) (x); - const float * restrict pd = (const float *) (pm + nb); - const uint8_t * restrict pb = (const uint8_t *) (pd + nb); + const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { - const float m = pm[i]; - const float d = pd[i]; + const float d = *(const float *) (pd + i*bs); + const float m = *(const float *) (pm + i*bs); - const uint8_t * restrict pp = pb + i*QK/2; + const uint8_t * restrict pp = pb + i*bs; for (int l = 0; l < QK; l += 2) { const uint8_t vi = pp[l/2]; diff --git a/utils.cpp b/utils.cpp index aa3ad1053da02..26e313d5f1bf9 100644 --- a/utils.cpp +++ b/utils.cpp @@ -489,7 +489,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { const int nb = k / qk; - const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2); + const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2); + const size_t row_size = nb*bs; assert(k % qk == 0); @@ -498,10 +499,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t char * pdst = (char *) dst; - for (int j = 0; j < n; j += k) { - float * pm = (float *) (pdst + (j/k)*row_size); - float * pd = (float *) (pm + nb); - uint8_t * pb = (uint8_t *) (pd + nb); + for (int j = 0; j < n; j += k) { + uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); + uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); + uint8_t * pb 
= (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float)); //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); @@ -519,8 +520,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - pm[i] = min; - pd[i] = d; + *(float *) pd = d; + *(float *) pm = min; + pd += bs; + pm += bs; for (int l = 0; l < qk; l += 2) { const float v0 = (src[j + i*qk + l + 0] - min)*id; @@ -538,7 +541,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb + i*qk/2, pp, pp_size); + memcpy(pb, pp, pp_size); + pb += bs; } } } From d79f4df0d55a3153121d6e200fa7334cc2be0d4d Mon Sep 17 00:00:00 2001 From: thement <40525767+thement@users.noreply.github.com> Date: Fri, 17 Mar 2023 21:05:58 +0100 Subject: [PATCH 21/44] Implement non-greedy tokenizer that tries to maximize token lengths (#242) * Implement non-greedy tokenizer that tries to maximize token lengths * Insert single space in front of the prompt - this is to match original llama tokenizer behavior --------- Co-authored-by: Jakub Horak --- lama.cpp | 2 ++ utils.cpp | 68 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/lama.cpp b/lama.cpp index efd2ac28ceff0..d79f1e3b37d04 100644 --- a/lama.cpp +++ b/lama.cpp @@ -874,6 +874,8 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { std::vector logits; + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); // tokenize the prompt std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); diff --git a/utils.cpp b/utils.cpp index 26e313d5f1bf9..7539edd86d1a1 100644 --- a/utils.cpp +++ b/utils.cpp @@ -275,41 +275,57 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri return tokens; } +// TODO: Calculate this constant from the vocabulary +#define MAX_TOKEN_LEN 18 +// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { - //auto res = gpt_tokenize(vocab, text); - - //if (bos) { - // res.insert(res.begin(), 1); // TODO: replace with vocab.bos - //} - std::vector res; - - if (bos) { - res.push_back(1); // TODO: replace with vocab.bos - } - - //find the longest token that matches the text - int pos = 0; - while (true) { - int l = 0; - int t = 0; - for (const auto & kv : vocab.id_to_token) { - if (kv.second.size() < l) continue; - if (kv.second.size() > text.size() - pos) continue; - if (text.substr(pos, kv.second.size()) == kv.second) { - l = kv.second.size(); - t = kv.first; + std::vector score; + std::vector prev; + int len = text.length(); + + score.resize(len + 1); + prev.resize(len + 1); + + // Forward pass + for (int i = 0; i < len; i++) { + int max_len = std::min(len - i, MAX_TOKEN_LEN); + for (int sub_len = 1; sub_len <= len - i; sub_len++) { + auto sub = text.substr(i, sub_len); + auto token = vocab.token_to_id.find(sub); + if (token != vocab.token_to_id.end()) { + int token_score = sub.length() * sub.length(); + int local_score = score[i] + token_score; + int next = i + sub_len; + if (score[next] < local_score) { + score[next] = local_score; + prev[next] = (*token).second; + } } } + } - if (l == 0) { - break; + // Backward pass + int i = len; + 
while (i > 0) { + gpt_vocab::id token_id = prev[i]; + if (token_id == 0) { + // TODO: Return error or something more meaningful + printf("failed to tokenize string!\n"); + break; } + res.push_back(token_id); + auto token = (*vocab.id_to_token.find(token_id)).second; + i -= token.length(); + } - res.push_back(t); - pos += l; + if (bos) { + res.push_back(1); // TODO: replace with vocab.bos } + // Pieces are in reverse order so correct that + std::reverse(res.begin(), res.end()); + return res; } From 655d8dd0e6b3bcbf7ad6c6aceeab7ebdac0f7068 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 21:24:34 +0100 Subject: [PATCH 22/44] Fix makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6281674cce1bc..cb9ccd644fd16 100644 --- a/Makefile +++ b/Makefile @@ -201,7 +201,7 @@ libllama.a: lama.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) -llama-go: main.go lama.cpp main.h +llama-go: main.go lama.cpp lama.h CGO_CFLAGS_ALLOW='-mf.*' go build . # # Tests From d04e7fc005e358d52a4a5b27880e83eefc933360 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 22:09:13 +0100 Subject: [PATCH 23/44] Update README --- README.md | 83 ++++++++++--------------------------------------------- 1 file changed, 14 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 63e462356a5f6..b521ce6ee934d 100644 --- a/README.md +++ b/README.md @@ -1,81 +1,26 @@ # llama-go -Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in Golang with embedded C/C++. +This is [llama.cpp](https://github.com/ggerganov/llama.cpp) port in golang to use as a library. -## Description - -This project embeds the work of [llama.cpp](https://github.com/ggerganov/llama.cpp) in a Golang binary. -The main goal is to run the model using 4-bit quantization using CPU on Consumer-Grade hardware. - -At startup, the model is loaded and a prompt is offered to enter a prompt, -after the results have been printed another prompt can be entered. -The program can be quit using ctrl+c. - -This project was tested on Linux but should be able to get to work on macOS as well. - -## Requirements - -The memory requirements for the models are approximately: - -``` -7B -> 4 GB (1 file) -13B -> 8 GB (2 files) -30B -> 16 GB (4 files) -65B -> 32 GB (8 files) -``` - -## Installation - -```bash -# build this repo -git clone https://github.com/cornelk/llama-go -cd llama-go -make +## Usage -# install Python dependencies -python3 -m pip install torch numpy sentencepiece ``` - -Obtain the original LLaMA model weights and place them in ./models - -for example by using the https://github.com/shawwn/llama-dl script to download them. - -Use the following steps to convert the LLaMA-7B model to a format that is compatible: - -```bash -ls ./models -65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model - -# convert the 7B model to ggml FP16 format -python3 convert-pth-to-ggml.py models/7B/ 1 - -# quantize the model to 4-bits -./quantize.sh 7B +git clone XXX +cd XXX +make libllama.a +LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples/main.go -m ggml-alpaca-7b-q4.bin -n 10 ``` -When running the larger models, make sure you have enough disk space to store all the intermediate files. +## Model -## Usage +For a tiny model, you can use https://github.com/antimatter15/alpaca.cpp . 
-```bash -./llama-go -m ./models/13B/ggml-model-q4_0.bin -t 4 -n 128 +## License -Loading model ./models/13B/ggml-model-q4_0.bin... -Model loaded successfully. +MIT ->>> Some good pun names for a pet groomer: +## Acknowledgements -Some good pun names for a pet groomer: -Rub-a-Dub, Scooby Doo -Hair Force One -Duck and Cover, Two Fleas, One Duck -... - ->>> - -``` - -The settings can be changed at runtime, multiple values are possible: -```bash ->>> seed=1234 threads=8 -Settings: repeat_penalty=1.3 seed=1234 temp=0.8 threads=8 tokens=128 top_k=40 top_p=0.95 -``` +- [llama.cpp](https://github.com/ggerganov/llama.cpp) +- https://github.com/cornelk/llama-go for the initial ideas +- https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!) \ No newline at end of file From 2723820af86ca4366553725c20b1df41bcb23388 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 23:43:08 +0100 Subject: [PATCH 24/44] Rename import --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index b5878754e2ace..6a81f5b887b72 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ -module github.com/cornelk/llama-go +module github.com/go-skynet/llama-go go 1.19 From 2a81f9b893650f964dce74e1b61edc0dcd39edf9 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 23:43:15 +0100 Subject: [PATCH 25/44] Move to a lib --- examples/main.go | 81 +++++++++++++++++++ go/llama.go | 65 ++++++++++++++++ main.go | 198 ----------------------------------------------- 3 files changed, 146 insertions(+), 198 deletions(-) create mode 100644 examples/main.go create mode 100644 go/llama.go delete mode 100644 main.go diff --git a/examples/main.go b/examples/main.go new file mode 100644 index 0000000000000..bd1125cbad5f1 --- /dev/null +++ b/examples/main.go @@ -0,0 +1,81 @@ +package main + +import ( + "bufio" + "flag" + "fmt" + "io" + "os" + "runtime" + "strings" + + llama "github.com/go-skynet/llama-go/go" +) + +var ( + threads = 4 + tokens = 128 +) + +func main() { + var model string + + flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) + flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") + flags.IntVar(&threads, "t", runtime.NumCPU(), "number of threads to use during computation") + flags.IntVar(&tokens, "n", 128, "number of tokens to predict") + + err := flags.Parse(os.Args[1:]) + if err != nil { + fmt.Printf("Parsing program arguments failed: %s", err) + os.Exit(1) + } + l := &llama.LLama{} + err = l.Load(model) + if err != nil { + fmt.Println("Loading the model failed:", err.Error()) + os.Exit(1) + } + fmt.Printf("Model loaded successfully.\n") + + reader := bufio.NewReader(os.Stdin) + + for { + text := readMultiLineInput(reader) + + res, err := l.Predict(threads, tokens, text) + if err != nil { + panic(err) + } + fmt.Printf("\ngolang: %s\n", res) + + fmt.Printf("\n\n") + } +} + +// readMultiLineInput reads input until an empty line is entered. 
+func readMultiLineInput(reader *bufio.Reader) string { + var lines []string + fmt.Print(">>> ") + + for { + line, err := reader.ReadString('\n') + if err != nil { + if err == io.EOF { + os.Exit(0) + } + fmt.Printf("Reading the prompt failed: %s", err) + os.Exit(1) + } + + if len(strings.TrimSpace(line)) == 0 { + break + } + + lines = append(lines, line) + } + + text := strings.Join(lines, "") + fmt.Println("Sending", text) + return text +} diff --git a/go/llama.go b/go/llama.go new file mode 100644 index 0000000000000..f8f8c47f5b07c --- /dev/null +++ b/go/llama.go @@ -0,0 +1,65 @@ +package llama + +// #cgo LDFLAGS: -lllama -lm -lstdc++ +// #include +import "C" +import ( + "fmt" + "strings" + "unsafe" +) + +var ( + repeatLastN = 64 + seed = -1 + threads = 4 + tokens = 128 + + topK = 40 + topP = 0.95 + temp = 0.80 + repeatPenalty = 1.30 + + nCtx = 512 // context size + + options = map[string]interface{}{ + "repeat_last_n": &repeatLastN, // last n tokens to penalize + "repeat_penalty": &repeatPenalty, + "seed": &seed, // RNG seed, -1 will seed based on current time + "temp": &temp, + "threads": &threads, + "tokens": &tokens, // new tokens to predict + "top_k": &topK, + "top_p": &topP, + } +) + +type LLama struct { + state unsafe.Pointer +} + +func (l *LLama) Load(model string) error { + state := C.llama_allocate_state() + modelPath := C.CString(model) + result := C.llama_bootstrap(modelPath, state, C.int(nCtx)) + if result != 0 { + return fmt.Errorf("failed loading model") + } + l.state = state + return nil +} + +func (l *LLama) Predict(threads int, tokens int, text string) (string, error) { + input := C.CString(text) + out := make([]byte, tokens) + params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), + C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) + C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) + res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) + + res = strings.TrimPrefix(res, " "+text) + + C.llama_free_params(params) + + return res, nil +} diff --git a/main.go b/main.go deleted file mode 100644 index 7a484ba961d49..0000000000000 --- a/main.go +++ /dev/null @@ -1,198 +0,0 @@ -package main - -// #cgo CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mavx -mavx2 -mfma -mf16c -msse3 -// #cgo CXXFLAGS: -O3 -DNDEBUG -std=c++11 -fPIC -pthread -I. 
-// #include "lama.h" -import "C" -import ( - "bufio" - "flag" - "fmt" - "io" - "os" - "reflect" - "runtime" - "sort" - "strconv" - "strings" - "unsafe" -) - -var ( - repeatLastN = 64 - seed = -1 - threads = 4 - tokens = 128 - - topK = 40 - topP = 0.95 - temp = 0.80 - repeatPenalty = 1.30 - - nCtx = 512 // context size - - options = map[string]interface{}{ - "repeat_last_n": &repeatLastN, // last n tokens to penalize - "repeat_penalty": &repeatPenalty, - "seed": &seed, // RNG seed, -1 will seed based on current time - "temp": &temp, - "threads": &threads, - "tokens": &tokens, // new tokens to predict - "top_k": &topK, - "top_p": &topP, - } -) - -func main() { - var model string - - flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) - flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") - flags.IntVar(&threads, "t", runtime.NumCPU(), "number of threads to use during computation") - flags.IntVar(&tokens, "n", 128, "number of tokens to predict") - - err := flags.Parse(os.Args[1:]) - if err != nil { - fmt.Printf("Parsing program arguments failed: %s", err) - os.Exit(1) - } - - state := C.llama_allocate_state() - - fmt.Printf("Loading model %s...\n", model) - modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(nCtx)) - if result != 0 { - fmt.Println("Loading the model failed") - os.Exit(1) - } - fmt.Printf("Model loaded successfully.\n") - - printSettings() - reader := bufio.NewReader(os.Stdin) - - for { - text := readMultiLineInput(reader) - - input := C.CString(text) - out := make([]byte, tokens) - params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), - C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) - C.llama_predict(params, state, (*C.char)(unsafe.Pointer(&out[0]))) - res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) - - res = strings.TrimPrefix(res, text) - fmt.Printf("\ngolang: %s\n", res) - - C.llama_free_params(params) - - fmt.Printf("\n\n") - } -} - -// readMultiLineInput reads input until an empty line is entered. -func readMultiLineInput(reader *bufio.Reader) string { - var lines []string - fmt.Print(">>> ") - - for { - line, err := reader.ReadString('\n') - if err != nil { - if err == io.EOF { - os.Exit(0) - } - fmt.Printf("Reading the prompt failed: %s", err) - os.Exit(1) - } - - if len(strings.TrimSpace(line)) == 0 { - break - } - - optionChanged, err := handleParameterChange(line) - if err != nil { - fmt.Printf("Reading the prompt failed: %s", err) - os.Exit(1) - } - if optionChanged { - lines = nil - fmt.Print(">>> ") - continue - } - - lines = append(lines, line) - } - - text := strings.Join(lines, "") - fmt.Println("Sending", text) - return text -} - -// handleParameterChange parses the input for any parameter changes. -// This is a generic function that can handle int and float type parameters. -// The parameters need to be referenced by pointer in the options map. 
-func handleParameterChange(input string) (bool, error) { - optionChanged := false - words := strings.Split(input, " ") - - for _, word := range words { - parsed := strings.Split(word, "=") - - if len(parsed) < 2 { - break - } - - s := strings.TrimSpace(parsed[0]) - opt, ok := options[s] - if !ok { - break - } - - val := reflect.ValueOf(opt) - if val.Kind() != reflect.Ptr { - return false, fmt.Errorf("option %s is not a pointer", s) - } - val = val.Elem() - argument := strings.TrimSpace(parsed[1]) - optionChanged = true - - switch val.Kind() { - case reflect.Int: - i, err := strconv.ParseInt(argument, 10, 64) - if err != nil { - return false, fmt.Errorf("parsing value '%s' as int: %w", argument, err) - } - val.SetInt(i) - - case reflect.Float32, reflect.Float64: - f, err := strconv.ParseFloat(argument, 64) - if err != nil { - return false, fmt.Errorf("parsing value '%s' as float: %w", argument, err) - } - val.SetFloat(f) - - default: - return false, fmt.Errorf("unsupported option %s type %T", s, opt) - } - } - - if optionChanged { - printSettings() - } - return optionChanged, nil -} - -// printSettings outputs the current settings, alphabetically sorted. -func printSettings() { - var settings sort.StringSlice - for setting, value := range options { - val := reflect.ValueOf(value) - if val.Kind() == reflect.Ptr { - val = val.Elem() - } - settings = append(settings, fmt.Sprintf("%s=%v", setting, val.Interface())) - } - sort.Sort(settings) - s := strings.Join(settings, " ") - fmt.Printf("Settings: %s\n\n", s) -} From bd93264ef1b07d9cd5f961d5817e1e860a768258 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 23:45:03 +0100 Subject: [PATCH 26/44] Rename imports --- README.md | 4 ++-- examples/main.go | 2 +- go.mod | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b521ce6ee934d..debad35532950 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ This is [llama.cpp](https://github.com/ggerganov/llama.cpp) port in golang to us ## Usage ``` -git clone XXX -cd XXX +git clone https://github.com/go-skynet/llama.git +cd llama make libllama.a LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples/main.go -m ggml-alpaca-7b-q4.bin -n 10 ``` diff --git a/examples/main.go b/examples/main.go index bd1125cbad5f1..d104f8277d02a 100644 --- a/examples/main.go +++ b/examples/main.go @@ -9,7 +9,7 @@ import ( "runtime" "strings" - llama "github.com/go-skynet/llama-go/go" + llama "github.com/go-skynet/llama/go" ) var ( diff --git a/go.mod b/go.mod index 6a81f5b887b72..85c809e1d8e21 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ -module github.com/go-skynet/llama-go +module github.com/go-skynet/llama go 1.19 From 5c6896a14de0dbe0ec9d7d06f08116d4b6d03592 Mon Sep 17 00:00:00 2001 From: mudler Date: Sat, 18 Mar 2023 00:05:25 +0100 Subject: [PATCH 27/44] Add options --- examples/main.go | 5 ++- go/llama.go | 49 +++++++++------------------ go/options.go | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 37 deletions(-) create mode 100644 go/options.go diff --git a/examples/main.go b/examples/main.go index d104f8277d02a..71034b53fd81c 100644 --- a/examples/main.go +++ b/examples/main.go @@ -30,8 +30,7 @@ func main() { fmt.Printf("Parsing program arguments failed: %s", err) os.Exit(1) } - l := &llama.LLama{} - err = l.Load(model) + l, err := llama.New(model, 0) if err != nil { fmt.Println("Loading the model failed:", err.Error()) os.Exit(1) @@ 
-43,7 +42,7 @@ func main() { for { text := readMultiLineInput(reader) - res, err := l.Predict(threads, tokens, text) + res, err := l.Predict(text, llama.SetTokens(tokens), llama.SetThreads(threads)) if err != nil { panic(err) } diff --git a/go/llama.go b/go/llama.go index f8f8c47f5b07c..2e13b5bf67ddd 100644 --- a/go/llama.go +++ b/go/llama.go @@ -9,51 +9,32 @@ import ( "unsafe" ) -var ( - repeatLastN = 64 - seed = -1 - threads = 4 - tokens = 128 - - topK = 40 - topP = 0.95 - temp = 0.80 - repeatPenalty = 1.30 - - nCtx = 512 // context size - - options = map[string]interface{}{ - "repeat_last_n": &repeatLastN, // last n tokens to penalize - "repeat_penalty": &repeatPenalty, - "seed": &seed, // RNG seed, -1 will seed based on current time - "temp": &temp, - "threads": &threads, - "tokens": &tokens, // new tokens to predict - "top_k": &topK, - "top_p": &topP, - } -) - type LLama struct { state unsafe.Pointer } -func (l *LLama) Load(model string) error { +func New(model string, ctxSize int) (*LLama, error) { + if ctxSize == 0 { + ctxSize = 512 + } state := C.llama_allocate_state() modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(nCtx)) + result := C.llama_bootstrap(modelPath, state, C.int(ctxSize)) if result != 0 { - return fmt.Errorf("failed loading model") + return nil, fmt.Errorf("failed loading model") } - l.state = state - return nil + + return &LLama{state: state}, nil } -func (l *LLama) Predict(threads int, tokens int, text string) (string, error) { +func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { + + po := NewPredictOptions(opts...) + input := C.CString(text) - out := make([]byte, tokens) - params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), - C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) + out := make([]byte, po.Tokens) + params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat)) C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) diff --git a/go/options.go b/go/options.go new file mode 100644 index 0000000000000..a98982996177a --- /dev/null +++ b/go/options.go @@ -0,0 +1,86 @@ +package llama + +import "runtime" + +type PredictOptions struct { + Seed, Threads, Tokens, TopK, Repeat int + TopP, Temperature, Penalty float64 +} + +type PredictOption func(p *PredictOptions) + +var DefaultOptions PredictOptions = PredictOptions{ + Seed: -1, + Threads: runtime.NumCPU(), + Tokens: 128, + TopK: 40, + TopP: 0.95, + Temperature: 0.80, + Penalty: 1.3, + Repeat: 64, +} + +// SetSeed sets the random seed for sampling text generation. +func SetSeed(seed int) PredictOption { + return func(p *PredictOptions) { + p.Seed = seed + } +} + +// SetThreads sets the number of threads to use for text generation. +func SetThreads(threads int) PredictOption { + return func(p *PredictOptions) { + p.Threads = threads + } +} + +// SetTokens sets the number of tokens to generate. +func SetTokens(tokens int) PredictOption { + return func(p *PredictOptions) { + p.Tokens = tokens + } +} + +// SetTopK sets the value for top-K sampling. +func SetTopK(topk int) PredictOption { + return func(p *PredictOptions) { + p.TopK = topk + } +} + +// SetTopP sets the value for nucleus sampling. 
+func SetTopP(topp float64) PredictOption { + return func(p *PredictOptions) { + p.TopP = topp + } +} + +// SetTemperature sets the temperature value for text generation. +func SetTemperature(temp float64) PredictOption { + return func(p *PredictOptions) { + p.Temperature = temp + } +} + +// SetPenalty sets the repetition penalty for text generation. +func SetPenalty(penalty float64) PredictOption { + return func(p *PredictOptions) { + p.Penalty = penalty + } +} + +// SetRepeat sets the number of times to repeat text generation. +func SetRepeat(repeat int) PredictOption { + return func(p *PredictOptions) { + p.Repeat = repeat + } +} + +// Create a new PredictOptions object with the given options. +func NewPredictOptions(opts ...PredictOption) PredictOptions { + p := DefaultOptions + for _, opt := range opts { + opt(&p) + } + return p +} From 56080ad745d1807e21ad3db421a005cfdc398dde Mon Sep 17 00:00:00 2001 From: mudler Date: Sat, 18 Mar 2023 11:17:59 +0100 Subject: [PATCH 28/44] Return errors if inference fails Signed-off-by: mudler --- go/llama.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/go/llama.go b/go/llama.go index 2e13b5bf67ddd..ededf51e86631 100644 --- a/go/llama.go +++ b/go/llama.go @@ -35,7 +35,10 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { out := make([]byte, po.Tokens) params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat)) - C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) + ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) + if ret != 0 { + return "", fmt.Errorf("inference failed") + } res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) res = strings.TrimPrefix(res, " "+text) From 8865f10a77344cd06beac89666428c654391a327 Mon Sep 17 00:00:00 2001 From: Gary Linscott Date: Sat, 18 Mar 2023 04:17:19 -0700 Subject: [PATCH 29/44] Fix n^2 loop in tokenization (#254) This causes long prompts to parse very slowly. 
--- utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.cpp b/utils.cpp index 7539edd86d1a1..70115d04ccf23 100644 --- a/utils.cpp +++ b/utils.cpp @@ -290,7 +290,7 @@ std::vector llama_tokenize(const gpt_vocab & vocab, const std::st // Forward pass for (int i = 0; i < len; i++) { int max_len = std::min(len - i, MAX_TOKEN_LEN); - for (int sub_len = 1; sub_len <= len - i; sub_len++) { + for (int sub_len = 1; sub_len <= max_len; sub_len++) { auto sub = text.substr(i, sub_len); auto token = vocab.token_to_id.find(sub); if (token != vocab.token_to_id.end()) { From 8b9e5375998fb2aaac20569d62c1a047a5c20a28 Mon Sep 17 00:00:00 2001 From: Alex Nguyen Date: Sat, 18 Mar 2023 20:51:49 +0700 Subject: [PATCH 30/44] Remove unused code since n_vocab is model.hparams.n_vocab (#262) --- lama.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/lama.cpp b/lama.cpp index d79f1e3b37d04..3696ee0080f97 100644 --- a/lama.cpp +++ b/lama.cpp @@ -153,16 +153,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab // load vocab { - const int32_t n_vocab = model.hparams.n_vocab; - - if (n_vocab != model.hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); - return false; - } - std::string word; - for (int i = 0; i < n_vocab; i++) { + for (int i = 0; i < model.hparams.n_vocab; i++) { uint32_t len; fin.read((char *) &len, sizeof(len)); From e64e29db3709a7d5b1f85b2dc6102730b94ea956 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 19 Mar 2023 17:30:00 +0200 Subject: [PATCH 31/44] Change RMSNorm eps to 1e-6 (#173) I think this is what is used in the Python code --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index c4f8389171026..e1da0c737fa67 100644 --- a/ggml.c +++ b/ggml.c @@ -5556,7 +5556,7 @@ static void ggml_compute_forward_rms_norm_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - const ggml_float eps = 1e-5f; // TODO: make this a parameter + const ggml_float eps = 1e-6f; // TODO: make this a parameter // TODO: optimize for (int i03 = 0; i03 < ne03; i03++) { @@ -5572,7 +5572,7 @@ static void ggml_compute_forward_rms_norm_f32( mean /= ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - + memcpy(y, x, ne00 * sizeof(float)); // for (int i00 = 0; i00 < ne00; i00++) { // y[i00] = x[i00]; From b4ad8f831effa1cd3db70bc6b65c7c436f922c65 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 19:38:00 +0100 Subject: [PATCH 32/44] Fix off-by-one bug --- lama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lama.cpp b/lama.cpp index 3696ee0080f97..9d613c324d353 100644 --- a/lama.cpp +++ b/lama.cpp @@ -954,7 +954,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(embd_inp[input_consumed]); ++input_consumed; - if (embd.size() > params.n_batch) { + if ((int)embd.size() >= params.n_batch) { break; } } From 4fa8e4c4b1a66fdd2d183133d2b0a60bb7f1abc3 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 20:07:41 +0100 Subject: [PATCH 33/44] Allow to use f16memory Although with alpaca it does seem to have a huge impact on quality.
--- examples/main.go | 2 +- go/llama.go | 8 +++----- go/options.go | 35 +++++++++++++++++++++++++++++++++-- lama.cpp | 25 ++++++++++++++++++------- lama.h | 2 +- 5 files changed, 56 insertions(+), 16 deletions(-) diff --git a/examples/main.go b/examples/main.go index 71034b53fd81c..32d5453ffeabf 100644 --- a/examples/main.go +++ b/examples/main.go @@ -30,7 +30,7 @@ func main() { fmt.Printf("Parsing program arguments failed: %s", err) os.Exit(1) } - l, err := llama.New(model, 0) + l, err := llama.New(model, llama.EnableF16Memory) if err != nil { fmt.Println("Loading the model failed:", err.Error()) os.Exit(1) diff --git a/go/llama.go b/go/llama.go index ededf51e86631..291a5c3e827e5 100644 --- a/go/llama.go +++ b/go/llama.go @@ -13,13 +13,11 @@ type LLama struct { state unsafe.Pointer } -func New(model string, ctxSize int) (*LLama, error) { - if ctxSize == 0 { - ctxSize = 512 - } +func New(model string, opts ...ModelOption) (*LLama, error) { + mo := NewModelOptions(opts...) state := C.llama_allocate_state() modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(ctxSize)) + result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory)) if result != 0 { return nil, fmt.Errorf("failed loading model") } diff --git a/go/options.go b/go/options.go index a98982996177a..c5e397f83eae4 100644 --- a/go/options.go +++ b/go/options.go @@ -2,24 +2,55 @@ package llama import "runtime" +type ModelOptions struct { + ContextSize int + F16Memory bool +} + type PredictOptions struct { Seed, Threads, Tokens, TopK, Repeat int TopP, Temperature, Penalty float64 } type PredictOption func(p *PredictOptions) +type ModelOption func(p *ModelOptions) + +var DefaultModelOptions ModelOptions = ModelOptions{ + ContextSize: 512, + F16Memory: false, +} var DefaultOptions PredictOptions = PredictOptions{ Seed: -1, Threads: runtime.NumCPU(), Tokens: 128, TopK: 40, - TopP: 0.95, - Temperature: 0.80, + TopP: 0.90, + Temperature: 0.95, Penalty: 1.3, Repeat: 64, } +// SetContext sets the context size. +func SetContext(c int) ModelOption { + return func(p *ModelOptions) { + p.ContextSize = c + } +} + +var EnableF16Memory ModelOption = func(p *ModelOptions) { + p.F16Memory = true +} + +// Create a new PredictOptions object with the given options. +func NewModelOptions(opts ...ModelOption) ModelOptions { + p := DefaultModelOptions + for _, opt := range opts { + opt(&p) + } + return p +} + // SetSeed sets the random seed for sampling text generation. 
func SetSeed(seed int) PredictOption { return func(p *PredictOptions) { diff --git a/lama.cpp b/lama.cpp index 9d613c324d353..77263af53db05 100644 --- a/lama.cpp +++ b/lama.cpp @@ -96,7 +96,7 @@ struct llama_state { }; // load the model's weights from a file -bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { +bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory) { // fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); std::vector f_buf(1024*1024); @@ -219,8 +219,14 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2 ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3 - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + if (f16memory) { + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + + } else { + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + } ctx_size += (5 + 10*n_layer)*256; // object overhead @@ -306,8 +312,13 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab const int n_mem = n_layer*n_ctx; const int n_elements = n_embd*n_mem; - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + if (f16memory) { + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + } else { + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + } const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); @@ -824,14 +835,14 @@ int main(int argc, char ** argv) { */ -int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx) +int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory) // load the model { ggml_time_init(); llama_state* state = (llama_state*) state_pr; const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(model_path, state->model, state->vocab, n_ctx)) { + if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); return 1; } diff --git a/lama.h b/lama.h index fc708bce51031..60ee6e673e6e2 100644 --- a/lama.h +++ b/lama.h @@ -6,7 +6,7 @@ extern "C" { void *llama_allocate_state(); -int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx); +int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory); void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n); From 47719aabb652e34510afb997697b6cefda8b04f0 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 20:14:59 +0100 Subject: [PATCH 34/44] Add ignore EOS --- go/llama.go | 2 +- go/options.go | 5 +++++ lama.cpp | 13 +++++++++++-- lama.h | 2 +- utils.h | 2 ++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/go/llama.go b/go/llama.go 
index 291a5c3e827e5..af4ea3bc8f877 100644 --- a/go/llama.go +++ b/go/llama.go @@ -32,7 +32,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { input := C.CString(text) out := make([]byte, po.Tokens) params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), - C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat)) + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), C.bool(po.IgnoreEOS)) ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) if ret != 0 { return "", fmt.Errorf("inference failed") diff --git a/go/options.go b/go/options.go index c5e397f83eae4..c1be66f1cb477 100644 --- a/go/options.go +++ b/go/options.go @@ -10,6 +10,7 @@ type ModelOptions struct { type PredictOptions struct { Seed, Threads, Tokens, TopK, Repeat int TopP, Temperature, Penalty float64 + IgnoreEOS bool } type PredictOption func(p *PredictOptions) @@ -51,6 +52,10 @@ func NewModelOptions(opts ...ModelOption) ModelOptions { return p } +var IgnoreEOS PredictOption = func(p *PredictOptions) { + p.IgnoreEOS = true +} + // SetSeed sets the random seed for sampling text generation. func SetSeed(seed int) PredictOption { return func(p *PredictOptions) { diff --git a/lama.cpp b/lama.cpp index 77263af53db05..660ba11432494 100644 --- a/lama.cpp +++ b/lama.cpp @@ -27,6 +27,8 @@ #define ANSI_COLOR_RESET "\x1b[0m" #define ANSI_BOLD "\x1b[1m" +static const int EOS_TOKEN_ID = 2; + // determine number of model parts based on the dimension static const std::map LLAMA_N_PARTS = { { 4096, 1 }, @@ -942,6 +944,11 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { { const int64_t t_start_sample_us = ggml_time_us(); + if (params.ignore_eos) { + // set the logit of the eos token to zero to avoid sampling it + logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0; + } + id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); last_n_tokens.erase(last_n_tokens.begin()); @@ -980,7 +987,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { // end of text token - if (embd.back() == 2) { + if (embd.back() == EOS_TOKEN_ID) { // fprintf(stderr, " [end of text]\n"); // return 2; break; @@ -1018,7 +1025,7 @@ void* llama_allocate_state() { } void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, - float top_p, float temp, float repeat_penalty, int repeat_last_n) { + float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos) { gpt_params* params = new gpt_params; params->seed = seed; params->n_threads = threads; @@ -1031,6 +1038,8 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token params->repeat_penalty = repeat_penalty; params->prompt = prompt; + params->ignore_eos = ignore_eos; + return params; } diff --git a/lama.h b/lama.h index 60ee6e673e6e2..6a7b7fc60b36e 100644 --- a/lama.h +++ b/lama.h @@ -9,7 +9,7 @@ void *llama_allocate_state(); int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory); void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, - int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n); + int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos); void llama_free_params(void* params_ptr); int llama_predict(void* params_ptr, void* state_pr, char* 
result); diff --git a/utils.h b/utils.h index 021120b0513c7..946ab312ec792 100644 --- a/utils.h +++ b/utils.h @@ -35,6 +35,8 @@ struct gpt_params { bool interactive = false; // interactive mode bool interactive_start = false; // reverse prompt immediately std::string antiprompt = ""; // string upon seeing which more user input is prompted + + bool ignore_eos = false; // do not stop generating after eos }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); From 12f4f4a39b7a4d83e9c6d2f49ead3fe09b731cba Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 20:15:46 +0100 Subject: [PATCH 35/44] Disable f16 on examples --- examples/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main.go b/examples/main.go index 32d5453ffeabf..66d7386cd7df3 100644 --- a/examples/main.go +++ b/examples/main.go @@ -30,7 +30,7 @@ func main() { fmt.Printf("Parsing program arguments failed: %s", err) os.Exit(1) } - l, err := llama.New(model, llama.EnableF16Memory) + l, err := llama.New(model) if err != nil { fmt.Println("Loading the model failed:", err.Error()) os.Exit(1) From e3c6247a81690e9830cc7f56a31a576a989e5a15 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 23:39:06 +0100 Subject: [PATCH 36/44] Set better defaults --- go/options.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/options.go b/go/options.go index c1be66f1cb477..346fff0dd5f43 100644 --- a/go/options.go +++ b/go/options.go @@ -25,10 +25,10 @@ var DefaultOptions PredictOptions = PredictOptions{ Seed: -1, Threads: runtime.NumCPU(), Tokens: 128, - TopK: 40, + TopK: 10000, TopP: 0.90, - Temperature: 0.95, - Penalty: 1.3, + Temperature: 0.96, + Penalty: 1, Repeat: 64, } From 0076188dd5481a8ca86af992896fbaa58ecbe011 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 23:39:17 +0100 Subject: [PATCH 37/44] Update README with useful links --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index debad35532950..7968ce8f61841 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples/main.go -m ggml-alpaca-7 ## Model -For a tiny model, you can use https://github.com/antimatter15/alpaca.cpp . +For a tiny model, you can use https://github.com/antimatter15/alpaca.cpp . For how to use the prompt, check: https://github.com/tatsu-lab/stanford_alpaca ## License @@ -22,5 +22,6 @@ MIT ## Acknowledgements - [llama.cpp](https://github.com/ggerganov/llama.cpp) +- https://github.com/tatsu-lab/stanford_alpaca - https://github.com/cornelk/llama-go for the initial ideas - https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!) 
\ No newline at end of file From 4b6c39d812bb383aa057d71083891bfb380e0ab6 Mon Sep 17 00:00:00 2001 From: Mack Straight Date: Mon, 20 Mar 2023 03:17:23 -0700 Subject: [PATCH 38/44] sentencepiece bpe compatible tokenizer (#252) * potential out of bounds read * fix quantize * style * Update convert-pth-to-ggml.py * mild cleanup * don't need the space-prefixing here rn since main.cpp already does it * new file magic + version header field * readme notice * missing newlines Co-authored-by: slaren <2141330+slaren@users.noreply.github.com> --- Makefile | 2 +- lama.cpp | 21 ++++++- quantize.cpp | 24 +++++++- utils.cpp | 171 +++++++++++++++++++++++++++++++++++++++------------ utils.h | 3 +- 5 files changed, 176 insertions(+), 45 deletions(-) diff --git a/Makefile b/Makefile index cb9ccd644fd16..9095c586b5254 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ endif # CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC -CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC +CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC LDFLAGS = # OS specific diff --git a/lama.cpp b/lama.cpp index 660ba11432494..12417af0057fc 100644 --- a/lama.cpp +++ b/lama.cpp @@ -3,6 +3,7 @@ #include "utils.h" #include +#include #include #include #include @@ -114,10 +115,24 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); - if (magic != 0x67676d6c) { + if (magic == 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n", + __func__, fname.c_str()); + return false; + } + if (magic != 0x67676d66) { fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); return false; } + + uint32_t format_version; + fin.read((char *) &format_version, sizeof(format_version)); + + if (format_version != 1) { + fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n", + __func__, fname.c_str(), format_version); + return false; + } } int n_ff = 0; @@ -163,8 +178,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab word.resize(len); fin.read((char *) word.data(), len); + float score; + fin.read((char *) &score, sizeof(score)); + vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; + vocab.score[i] = score; //if (i < 30000) { // fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); diff --git a/quantize.cpp b/quantize.cpp index 9ff579eb9bafe..1ee8d2d9de862 100644 --- a/quantize.cpp +++ b/quantize.cpp @@ -3,6 +3,7 @@ #include "utils.h" #include +#include #include #include #include @@ -63,12 +64,28 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna { uint32_t magic; finp.read((char *) &magic, sizeof(magic)); - if (magic != 0x67676d6c) { + if (magic == 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n", + __func__, fname_inp.c_str()); + return false; + } + if (magic != 0x67676d66) { fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); return false; } fout.write((char *) &magic, sizeof(magic)); + + uint32_t format_version; + finp.read((char *) &format_version, sizeof(format_version)); + + if (format_version != 1) { + fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n", + __func__, fname_inp.c_str(), format_version); + return false; + } + + fout.write((char *) &format_version, sizeof(format_version)); } llama_hparams hparams; 
@@ -122,8 +139,13 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna finp.read ((char *) word.data(), len); fout.write((char *) word.data(), len); + float score; + finp.read ((char *) &score, sizeof(score)); + fout.write((char *) &score, sizeof(score)); + vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; + vocab.score[i] = score; } } diff --git a/utils.cpp b/utils.cpp index 70115d04ccf23..c4903f5ab310b 100644 --- a/utils.cpp +++ b/utils.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -275,58 +276,146 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri return tokens; } -// TODO: Calculate this constant from the vocabulary -#define MAX_TOKEN_LEN 18 -// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece -std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { - std::vector res; - std::vector score; - std::vector prev; - int len = text.length(); - - score.resize(len + 1); - prev.resize(len + 1); - - // Forward pass - for (int i = 0; i < len; i++) { - int max_len = std::min(len - i, MAX_TOKEN_LEN); - for (int sub_len = 1; sub_len <= max_len; sub_len++) { - auto sub = text.substr(i, sub_len); - auto token = vocab.token_to_id.find(sub); - if (token != vocab.token_to_id.end()) { - int token_score = sub.length() * sub.length(); - int local_score = score[i] + token_score; - int next = i + sub_len; - if (score[next] < local_score) { - score[next] = local_score; - prev[next] = (*token).second; +static size_t utf8_len(char src) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +struct llama_sp_symbol { + using index = int; + index prev; + index next; + std::string_view text; +}; + +struct llama_sp_bigram { + struct comparator { + bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) { + return (l.score < r.score) || (l.score == r.score && l.left > r.left); + } + }; + using queue_storage = std::vector; + using queue = std::priority_queue; + llama_sp_symbol::index left; + llama_sp_symbol::index right; + float score; + size_t size; +}; + +struct llama_tokenizer { + llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {} + + void tokenize(std::string_view text, std::vector & output) { + // split string into utf8 chars + int index = 0; + while (!text.empty()) { + llama_sp_symbol sym; + size_t char_len = std::min(text.size(), utf8_len(text.data()[0])); + sym.text = std::string_view(text.data(), char_len); + sym.prev = index - 1; + text.remove_prefix(char_len); + sym.next = text.empty() ? -1 : index + 1; + index++; + symbols_.emplace_back(std::move(sym)); + } + + // seed the work queue with all possible 2-character tokens. + for (size_t i = 1; i < symbols_.size(); ++i) { + try_add_bigram(i - 1, i); + } + + // keep substituting the highest frequency pairs for as long as we can. + while (!work_queue_.empty()) { + auto bigram = work_queue_.top(); + work_queue_.pop(); + + auto & left_sym = symbols_[bigram.left]; + auto & right_sym = symbols_[bigram.right]; + + // if one of the symbols already got merged, skip it. 
+ if (left_sym.text.empty() || right_sym.text.empty() || + left_sym.text.size() + right_sym.text.size() != bigram.size) { + continue; + } + + // merge the right sym into the left one + left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size()); + right_sym.text = std::string_view(""); + + // remove the right sym from the chain + left_sym.next = right_sym.next; + if (right_sym.next >= 0) { + symbols_[right_sym.next].prev = bigram.left; + } + + // find more substitutions + try_add_bigram(left_sym.prev, bigram.left); + try_add_bigram(bigram.left, left_sym.next); + } + + for (int i = 0; i != -1; i = symbols_[i].next) { + auto& symbol = symbols_[i]; + auto token = vocab_.token_to_id.find(std::string(symbol.text)); + + if (token == vocab_.token_to_id.end()) { + // output any symbols that did not form tokens as bytes. + for (int j = 0; j < symbol.text.size(); ++j) { + gpt_vocab::id token_id = static_cast(symbol.text[j]) + 3; + output.push_back(token_id); } + } else { + output.push_back((*token).second); } } } - // Backward pass - int i = len; - while (i > 0) { - gpt_vocab::id token_id = prev[i]; - if (token_id == 0) { - // TODO: Return error or something more meaningful - printf("failed to tokenize string!\n"); - break; +private: + void try_add_bigram(int left, int right) { + if (left == -1 || right == -1) { + return; + } + + std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size()); + auto token = vocab_.token_to_id.find(std::string(text)); + + if (token == vocab_.token_to_id.end()) { + return; } - res.push_back(token_id); - auto token = (*vocab.id_to_token.find(token_id)).second; - i -= token.length(); + + auto score = vocab_.score.find((*token).second); + + if (score == vocab_.score.end()) { + return; + } + + llama_sp_bigram bigram; + bigram.left = left; + bigram.right = right; + bigram.score = (*score).second; + bigram.size = text.size(); + work_queue_.push(bigram); } - if (bos) { - res.push_back(1); // TODO: replace with vocab.bos + const gpt_vocab & vocab_; + std::vector symbols_; + llama_sp_bigram::queue work_queue_; +}; + +std::vector llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) { + llama_tokenizer tokenizer(vocab); + std::vector output; + + if (text.size() == 0) { + return output; } - // Pieces are in reverse order so correct that - std::reverse(res.begin(), res.end()); + if (bos) { + output.push_back(1); + } - return res; + tokenizer.tokenize(text, output); + return output; } bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { diff --git a/utils.h b/utils.h index 946ab312ec792..7ff8a12a7c68e 100644 --- a/utils.h +++ b/utils.h @@ -55,6 +55,7 @@ struct gpt_vocab { std::map token_to_id; std::map id_to_token; + std::map score; }; void replace(std::string & str, const std::string & needle, const std::string & replacement); @@ -76,7 +77,7 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri // TODO: this is probably wrong, but I cannot figure out how this tokenizer works .. 
// ref: https://github.com/google/sentencepiece -std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos); +std::vector llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos); // load the tokens from encoder.json bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); From 7c2170ecfa55e4def9f5bb81470c9f26a2ccedd7 Mon Sep 17 00:00:00 2001 From: mudler Date: Tue, 21 Mar 2023 18:22:33 +0100 Subject: [PATCH 39/44] Enable loading 13b and 30b alpaca models --- go/llama.go | 2 +- go/options.go | 6 ++++++ lama.cpp | 21 ++++++++++++++++----- lama.h | 2 +- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/go/llama.go b/go/llama.go index af4ea3bc8f877..d815c936284c9 100644 --- a/go/llama.go +++ b/go/llama.go @@ -17,7 +17,7 @@ func New(model string, opts ...ModelOption) (*LLama, error) { mo := NewModelOptions(opts...) state := C.llama_allocate_state() modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory)) + result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory), C.bool(mo.Alpaca)) if result != 0 { return nil, fmt.Errorf("failed loading model") } diff --git a/go/options.go b/go/options.go index 346fff0dd5f43..fd646d4e829a6 100644 --- a/go/options.go +++ b/go/options.go @@ -5,6 +5,7 @@ import "runtime" type ModelOptions struct { ContextSize int F16Memory bool + Alpaca bool } type PredictOptions struct { @@ -19,6 +20,7 @@ type ModelOption func(p *ModelOptions) var DefaultModelOptions ModelOptions = ModelOptions{ ContextSize: 512, F16Memory: false, + Alpaca: false, } var DefaultOptions PredictOptions = PredictOptions{ @@ -43,6 +45,10 @@ var EnableF16Memory ModelOption = func(p *ModelOptions) { p.F16Memory = true } +var EnableAlpaca ModelOption = func(p *ModelOptions) { + p.Alpaca = true +} + // Create a new PredictOptions object with the given options. 
func NewModelOptions(opts ...ModelOption) ModelOptions { p := DefaultModelOptions diff --git a/lama.cpp b/lama.cpp index 12417af0057fc..071b100207e6c 100644 --- a/lama.cpp +++ b/lama.cpp @@ -38,6 +38,14 @@ static const std::map LLAMA_N_PARTS = { { 8192, 8 }, }; +// determine number of model parts based on the dimension +static const std::map ALPACA_N_PARTS = { + { 4096, 1 }, + { 5120, 1 }, + { 6656, 1 }, + { 8192, 1 }, +}; + // default hparams (LLaMA 7B) struct llama_hparams { int32_t n_vocab = 32000; @@ -99,7 +107,7 @@ struct llama_state { }; // load the model's weights from a file -bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory) { +bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory, bool alpaca) { // fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); std::vector f_buf(1024*1024); @@ -154,7 +162,11 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab hparams.n_ctx = n_ctx; n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - n_parts = LLAMA_N_PARTS.at(hparams.n_embd); + if (alpaca) { + n_parts = ALPACA_N_PARTS.at(hparams.n_embd); + } else { + n_parts = LLAMA_N_PARTS.at(hparams.n_embd); + } /* fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); @@ -856,14 +868,14 @@ int main(int argc, char ** argv) { */ -int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory) +int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory, bool alpaca) // load the model { ggml_time_init(); llama_state* state = (llama_state*) state_pr; const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory)) { + if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory, alpaca)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); return 1; } @@ -928,7 +940,6 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { bool input_noecho = false; std::string res = ""; - while (true) { if (params.n_predict != 0 && remaining_tokens <= 0) { break; diff --git a/lama.h b/lama.h index 6a7b7fc60b36e..434a7367945d3 100644 --- a/lama.h +++ b/lama.h @@ -6,7 +6,7 @@ extern "C" { void *llama_allocate_state(); -int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory); +int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory, bool alpaca); void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos); From 7be5326e18ccef816d5cc4486a19653e922c4bc9 Mon Sep 17 00:00:00 2001 From: mudler Date: Tue, 21 Mar 2023 18:22:41 +0100 Subject: [PATCH 40/44] Stab a high number of tokens when 0 is supplied --- go/llama.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/go/llama.go b/go/llama.go index d815c936284c9..217e11421774c 100644 --- a/go/llama.go +++ b/go/llama.go @@ -30,6 +30,9 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { po := NewPredictOptions(opts...) 
input := C.CString(text) + if po.Tokens == 0 { + po.Tokens = 99999999 + } out := make([]byte, po.Tokens) params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), C.bool(po.IgnoreEOS)) From 623e9d24922528cd20b78105dd117f4553ab36ad Mon Sep 17 00:00:00 2001 From: Casey Primozic Date: Tue, 21 Mar 2023 07:35:42 -0700 Subject: [PATCH 41/44] Add initial AVX512 support for dot product on Linux (#320) * Update Makefile to detect AVX512 support and add compiler flags if it's available * Based on existing AVX2 implementation, dot product on one 32-value block of 4-bit quantized ints at a time * Perform 8 bit -> 16 bit sign extension and multiply+add on 32 values at time instead of 16 * Use built-in AVX512 horizontal reduce add to get sum at the end * Manual unrolling on inner dot product loop to reduce loop counter overhead --- Makefile | 32 +++++++++++++++++++++++ ggml.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 109 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 9095c586b5254..1cae8bda9d65d 100644 --- a/Makefile +++ b/Makefile @@ -95,6 +95,38 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) ifneq (,$(findstring sse3,$(SSE3_M))) CFLAGS += -msse3 endif + AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo) + ifneq (,$(findstring avx512f,$(AVX512F_M))) + CFLAGS += -mavx512f + endif + AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo) + ifneq (,$(findstring avx512bw,$(AVX512BW_M))) + CFLAGS += -mavx512bw + endif + AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo) + ifneq (,$(findstring avx512dq,$(AVX512DQ_M))) + CFLAGS += -mavx512dq + endif + AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo) + ifneq (,$(findstring avx512vl,$(AVX512VL_M))) + CFLAGS += -mavx512vl + endif + AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo) + ifneq (,$(findstring avx512cd,$(AVX512CD_M))) + CFLAGS += -mavx512cd + endif + AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo) + ifneq (,$(findstring avx512er,$(AVX512ER_M))) + CFLAGS += -mavx512er + endif + AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo) + ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M))) + CFLAGS += -mavx512ifma + endif + AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo) + ifneq (,$(findstring avx512pf,$(AVX512PF_M))) + CFLAGS += -mavx512pf + endif else ifeq ($(UNAME_S),Haiku) AVX1_M := $(shell sysinfo -cpu | grep "AVX ") ifneq (,$(findstring avx,$(AVX1_M))) diff --git a/ggml.c b/ggml.c index e1da0c737fa67..f5f14c0ebd0da 100644 --- a/ggml.c +++ b/ggml.c @@ -361,7 +361,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // AVX routines provided by GH user Const-me // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600 -#if __AVX2__ +#if __AVX2__ || __AVX512F__ // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval static inline __m256i bytesFromNibbles( const uint8_t* rsi ) @@ -397,7 +397,6 @@ static inline __m128i packNibbles( __m256i bytes ) } #endif - // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) @@ -1262,6 +1261,47 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } +#if __AVX512F__ && QK == 32 +static inline __m512 dot_q4_0_oneblock_avx512( + __m512 acc, + const uint8_t * pd0, + const uint8_t * pd1, + const uint8_t * pb0, + const uint8_t * pb1, + size_t bs, + int i +) { + const float * d0_0 = (const float *) (pd0 + i*bs); + const float * d1_0 = (const float *) (pd1 + i*bs); + + const uint8_t * restrict p0 = pb0 + (i+0)*bs; + const uint8_t * restrict p1 = pb1 + (i+0)*bs; + + // Compute combined scale for the block + float scaleScalar = d0_0[0] * d1_0[0]; + __m512 scale = _mm512_set1_ps( scaleScalar ); + + __m256i bx = bytesFromNibbles( p0 ); + __m256i by = bytesFromNibbles( p1 ); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + by = _mm256_sub_epi8( by, off ); + + // Sign-extend 16 signed bytes into int16_t + __m512i x32 = _mm512_cvtepi8_epi16( bx ); + __m512i y32 = _mm512_cvtepi8_epi16( by ); + // Compute products of int16_t integers, add pairwise + __m512i i64 = _mm512_madd_epi16( x32, y32 ); + + // Convert int32_t to float + __m512 p = _mm512_cvtepi32_ps( i64 ); + // Apply the scale, and accumulate + return _mm512_fmadd_ps( scale, p, acc ); +} +#endif + inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { ggml_float sumf = 0.0; @@ -1417,6 +1457,40 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void #else #error "not implemented for QK" #endif +#elif defined(__AVX512F__) + +#if QK == 32 + // Initialize accumulator with zeros + __m512 acc0 = _mm512_setzero_ps(); + __m512 acc1 = _mm512_setzero_ps(); + + const int superblock_size = 8; + const int superblock_count = nb / superblock_size; + const int remainder = nb % superblock_size; + + for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) { + int i = superblock_ix * superblock_size; + + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+0 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+1 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+2 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+3 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+4 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+5 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+6 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+7 ); + } + + // Remainders + for (int i = superblock_count * superblock_size; i < nb; ++i) { + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i ); + } + + // Horizontal sum of all lanes of the accumulator + sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 ); +#else +#error "not implemented for QK" +#endif #elif defined(__AVX2__) #if QK == 32 const size_t countBlocks = nb; @@ -1928,7 +2002,7 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res const size_t bs = 2*sizeof(float) + QK/2; const uint8_t * restrict pd = ((const uint8_t *)x + 
0*bs); - const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { From 7e0ecbd19c759bf9cc59b86f1ced893f7271a0b8 Mon Sep 17 00:00:00 2001 From: mudler Date: Thu, 23 Mar 2023 21:52:05 +0100 Subject: [PATCH 42/44] Revert "Add initial AVX512 support for dot product on Linux" This reverts commit 623e9d24922528cd20b78105dd117f4553ab36ad. --- Makefile | 32 ----------------------- ggml.c | 80 +++----------------------------------------------------- 2 files changed, 3 insertions(+), 109 deletions(-) diff --git a/Makefile b/Makefile index 1cae8bda9d65d..9095c586b5254 100644 --- a/Makefile +++ b/Makefile @@ -95,38 +95,6 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) ifneq (,$(findstring sse3,$(SSE3_M))) CFLAGS += -msse3 endif - AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo) - ifneq (,$(findstring avx512f,$(AVX512F_M))) - CFLAGS += -mavx512f - endif - AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo) - ifneq (,$(findstring avx512bw,$(AVX512BW_M))) - CFLAGS += -mavx512bw - endif - AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo) - ifneq (,$(findstring avx512dq,$(AVX512DQ_M))) - CFLAGS += -mavx512dq - endif - AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo) - ifneq (,$(findstring avx512vl,$(AVX512VL_M))) - CFLAGS += -mavx512vl - endif - AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo) - ifneq (,$(findstring avx512cd,$(AVX512CD_M))) - CFLAGS += -mavx512cd - endif - AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo) - ifneq (,$(findstring avx512er,$(AVX512ER_M))) - CFLAGS += -mavx512er - endif - AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo) - ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M))) - CFLAGS += -mavx512ifma - endif - AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo) - ifneq (,$(findstring avx512pf,$(AVX512PF_M))) - CFLAGS += -mavx512pf - endif else ifeq ($(UNAME_S),Haiku) AVX1_M := $(shell sysinfo -cpu | grep "AVX ") ifneq (,$(findstring avx,$(AVX1_M))) diff --git a/ggml.c b/ggml.c index f5f14c0ebd0da..e1da0c737fa67 100644 --- a/ggml.c +++ b/ggml.c @@ -361,7 +361,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // AVX routines provided by GH user Const-me // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600 -#if __AVX2__ || __AVX512F__ +#if __AVX2__ // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval static inline __m256i bytesFromNibbles( const uint8_t* rsi ) @@ -397,6 +397,7 @@ static inline __m128i packNibbles( __m256i bytes ) } #endif + // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) @@ -1261,47 +1262,6 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } -#if __AVX512F__ && QK == 32 -static inline __m512 dot_q4_0_oneblock_avx512( - __m512 acc, - const uint8_t * pd0, - const uint8_t * pd1, - const uint8_t * pb0, - const uint8_t * pb1, - size_t bs, - int i -) { - const float * d0_0 = (const float *) (pd0 + i*bs); - const float * d1_0 = (const float *) (pd1 + i*bs); - - const uint8_t * restrict p0 = pb0 + (i+0)*bs; - const uint8_t * restrict p1 = pb1 + (i+0)*bs; - - // Compute combined scale for the block - float scaleScalar = d0_0[0] * d1_0[0]; - __m512 scale = _mm512_set1_ps( scaleScalar ); - - __m256i bx = bytesFromNibbles( p0 ); - __m256i by = bytesFromNibbles( p1 ); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - by = _mm256_sub_epi8( by, off ); - - // Sign-extend 16 signed bytes into int16_t - __m512i x32 = _mm512_cvtepi8_epi16( bx ); - __m512i y32 = _mm512_cvtepi8_epi16( by ); - // Compute products of int16_t integers, add pairwise - __m512i i64 = _mm512_madd_epi16( x32, y32 ); - - // Convert int32_t to float - __m512 p = _mm512_cvtepi32_ps( i64 ); - // Apply the scale, and accumulate - return _mm512_fmadd_ps( scale, p, acc ); -} -#endif - inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { ggml_float sumf = 0.0; @@ -1457,40 +1417,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void #else #error "not implemented for QK" #endif -#elif defined(__AVX512F__) - -#if QK == 32 - // Initialize accumulator with zeros - __m512 acc0 = _mm512_setzero_ps(); - __m512 acc1 = _mm512_setzero_ps(); - - const int superblock_size = 8; - const int superblock_count = nb / superblock_size; - const int remainder = nb % superblock_size; - - for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) { - int i = superblock_ix * superblock_size; - - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+0 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+1 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+2 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+3 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+4 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+5 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+6 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+7 ); - } - - // Remainders - for (int i = superblock_count * superblock_size; i < nb; ++i) { - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i ); - } - - // Horizontal sum of all lanes of the accumulator - sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 ); -#else -#error "not implemented for QK" -#endif #elif defined(__AVX2__) #if QK == 32 const size_t countBlocks = nb; @@ -2002,7 +1928,7 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res const size_t bs = 2*sizeof(float) + QK/2; const uint8_t * restrict pd = ((const uint8_t *)x + 
0*bs); - const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { From a3563a2690baef395edb7ee41a96acf5376862d3 Mon Sep 17 00:00:00 2001 From: mudler Date: Sat, 25 Mar 2023 23:37:42 +0100 Subject: [PATCH 43/44] Trim newline returned by model --- go/llama.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/go/llama.go b/go/llama.go index 217e11421774c..151bbe94047d6 100644 --- a/go/llama.go +++ b/go/llama.go @@ -42,7 +42,9 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { } res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) - res = strings.TrimPrefix(res, " "+text) + res = strings.TrimPrefix(res, " ") + res = strings.TrimPrefix(res, text) + res = strings.TrimPrefix(res, "\n") C.llama_free_params(params) From 84efc8db364743915a17f52fd22e2afb4c2e948d Mon Sep 17 00:00:00 2001 From: mudler Date: Wed, 29 Mar 2023 18:52:01 +0200 Subject: [PATCH 44/44] Add compatibility to gpt4all models --- go/llama.go | 3 ++- go/options.go | 6 ++++++ lama.cpp | 21 ++++++++++++++++----- lama.h | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/go/llama.go b/go/llama.go index 151bbe94047d6..51f336dc34c75 100644 --- a/go/llama.go +++ b/go/llama.go @@ -17,7 +17,7 @@ func New(model string, opts ...ModelOption) (*LLama, error) { mo := NewModelOptions(opts...) state := C.llama_allocate_state() modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory), C.bool(mo.Alpaca)) + result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory), C.bool(mo.Alpaca), C.bool(mo.GPT4all)) if result != 0 { return nil, fmt.Errorf("failed loading model") } @@ -34,6 +34,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { po.Tokens = 99999999 } out := make([]byte, po.Tokens) + params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), C.bool(po.IgnoreEOS)) ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) diff --git a/go/options.go b/go/options.go index fd646d4e829a6..3bcadfbf19d92 100644 --- a/go/options.go +++ b/go/options.go @@ -6,6 +6,7 @@ type ModelOptions struct { ContextSize int F16Memory bool Alpaca bool + GPT4all bool } type PredictOptions struct { @@ -21,6 +22,7 @@ var DefaultModelOptions ModelOptions = ModelOptions{ ContextSize: 512, F16Memory: false, Alpaca: false, + GPT4all: false, } var DefaultOptions PredictOptions = PredictOptions{ @@ -49,6 +51,10 @@ var EnableAlpaca ModelOption = func(p *ModelOptions) { p.Alpaca = true } +var EnableGPT4All ModelOption = func(p *ModelOptions) { + p.GPT4all = true +} + // Create a new PredictOptions object with the given options. 
func NewModelOptions(opts ...ModelOption) ModelOptions { p := DefaultModelOptions diff --git a/lama.cpp b/lama.cpp index 071b100207e6c..500fd7f0a6e43 100644 --- a/lama.cpp +++ b/lama.cpp @@ -107,7 +107,7 @@ struct llama_state { }; // load the model's weights from a file -bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory, bool alpaca) { +bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory, bool alpaca, bool gpt4all) { // fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); std::vector f_buf(1024*1024); @@ -149,7 +149,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab // load hparams { auto & hparams = model.hparams; - + if (gpt4all) { + model.hparams.n_vocab++; + } fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); @@ -183,7 +185,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab // load vocab { std::string word; - for (int i = 0; i < model.hparams.n_vocab; i++) { + int n_vocab = model.hparams.n_vocab; + if (gpt4all) { + n_vocab = n_vocab - 1; + } + + for (int i = 0; i < n_vocab; i++) { uint32_t len; fin.read((char *) &len, sizeof(len)); @@ -197,6 +204,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab vocab.id_to_token[i] = word; vocab.score[i] = score; + if (gpt4all) { + vocab.token_to_id[""] = n_vocab - 1; + vocab.id_to_token[ n_vocab - 1] = ""; + } //if (i < 30000) { // fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); //} @@ -868,14 +879,14 @@ int main(int argc, char ** argv) { */ -int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory, bool alpaca) +int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory, bool alpaca, bool gpt4all) // load the model { ggml_time_init(); llama_state* state = (llama_state*) state_pr; const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory, alpaca)) { + if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory, alpaca, gpt4all)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); return 1; } diff --git a/lama.h b/lama.h index 434a7367945d3..5c9847f299548 100644 --- a/lama.h +++ b/lama.h @@ -6,7 +6,7 @@ extern "C" { void *llama_allocate_state(); -int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory, bool alpaca); +int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory, bool alpaca, bool gpt4all); void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos);
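Taken together, the Go surface at the end of this series can be exercised roughly as follows. This is an illustrative sketch rather than code taken from any patch; the import path and model path are placeholders, while New, Predict, and the option values come from go/llama.go and go/options.go above:

    package main

    import (
        "fmt"
        "os"

        llama "example.com/llama/go" // placeholder: use the module's real import path
    )

    func main() {
        // Load a gpt4all-style checkpoint; EnableAlpaca or EnableF16Memory can be
        // passed instead, depending on the model file being loaded.
        l, err := llama.New("model.bin", llama.EnableGPT4All, llama.SetContext(512))
        if err != nil {
            fmt.Println("Loading the model failed:", err.Error())
            os.Exit(1)
        }

        // IgnoreEOS keeps generating past the end-of-text token,
        // SetSeed fixes the sampling seed.
        out, err := l.Predict("Tell me a joke.", llama.SetSeed(42), llama.IgnoreEOS)
        if err != nil {
            fmt.Println("Prediction failed:", err.Error())
            os.Exit(1)
        }
        fmt.Println(out)
    }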