From bb95f56538559ebde1484e32c80a2aaf00b8940c Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 20:49:42 -0600 Subject: [PATCH 01/44] first go embedded version --- .gitignore | 3 + Makefile | 13 ++-- go.mod | 3 + main.cpp | 177 ++++++++++++++++++++++++++++++++------------------- main.go | 61 ++++++++++++++++++ main.h | 19 ++++++ quantize.cpp | 2 + 7 files changed, 208 insertions(+), 70 deletions(-) create mode 100644 go.mod create mode 100644 main.go create mode 100644 main.h diff --git a/.gitignore b/.gitignore index 5eb1ff1b873f1..4414b8428d40c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ *.o +*.so *.a .cache/ +.idea .vs/ .vscode/ .DS_Store @@ -18,6 +20,7 @@ models/* /main /quantize +/llama-go arm_neon.h compile_commands.json diff --git a/Makefile b/Makefile index 8388c290d75ce..95a2e3b97cf4e 100644 --- a/Makefile +++ b/Makefile @@ -172,7 +172,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize +default: main.o quantize libllama.a # # Build library @@ -185,11 +185,14 @@ utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o clean: - rm -f *.o main quantize + rm -f *.o *.a quantize -main: main.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS) - ./main -h +main.o: ggml.o utils.o + $(CXX) $(CXXFLAGS) -c main.cpp -o main.o $(LDFLAGS) + #./main -h + +libllama.a: main.o ggml.o utils.o + ar src libllama.a main.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) diff --git a/go.mod b/go.mod new file mode 100644 index 0000000000000..b5878754e2ace --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/cornelk/llama-go + +go 1.19 diff --git a/main.cpp b/main.cpp index 2f47480698f1e..a7d312b1bc4d4 100644 --- a/main.cpp +++ b/main.cpp @@ -1,5 +1,5 @@ #include "ggml.h" - +#include "main.h" #include "utils.h" #include @@ -69,9 +69,19 @@ struct llama_model { std::map tensors; }; +struct llama_state { + gpt_vocab vocab; + llama_model model; + struct { + int64_t t_load_us = -1; + int64_t t_sample_us = -1; + int64_t t_predict_us = -1; + } timing; +}; + // load the model's weights from a file bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { - printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); +// printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { @@ -110,16 +120,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; n_parts = LLAMA_N_PARTS.at(hparams.n_embd); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_mult = %d\n", __func__, hparams.n_mult); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: n_rot = %d\n", __func__, hparams.n_rot); - printf("%s: f16 = %d\n", __func__, hparams.f16); - printf("%s: n_ff = %d\n", __func__, n_ff); - printf("%s: n_parts = %d\n", __func__, n_parts); +// printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); +// printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); +// printf("%s: n_embd = %d\n", __func__, hparams.n_embd); +// printf("%s: n_mult = %d\n", __func__, 
hparams.n_mult); +// printf("%s: n_head = %d\n", __func__, hparams.n_head); +// printf("%s: n_layer = %d\n", __func__, hparams.n_layer); +// printf("%s: n_rot = %d\n", __func__, hparams.n_rot); +// printf("%s: f16 = %d\n", __func__, hparams.f16); +// printf("%s: n_ff = %d\n", __func__, n_ff); +// printf("%s: n_parts = %d\n", __func__, n_parts); } // load vocab @@ -203,7 +213,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab ctx_size += (5 + 10*n_layer)*256; // object overhead - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); +// printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } // create the ggml context @@ -290,7 +300,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); +// printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); } const size_t file_offset = fin.tellg(); @@ -308,7 +318,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab fname_part += "." + std::to_string(i); } - printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str()); +// printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str()); fin = std::ifstream(fname_part, std::ios::binary); fin.seekg(file_offset); @@ -318,7 +328,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab int n_tensors = 0; size_t total_size = 0; - printf("%s: ", __func__); +// printf("%s: ", __func__); while (true) { int32_t n_dims; @@ -482,15 +492,15 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab } //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); - if (++n_tensors % 8 == 0) { - printf("."); - fflush(stdout); - } +// if (++n_tensors % 8 == 0) { +// printf("."); +// fflush(stdout); +// } } - printf(" done\n"); +// printf(" done\n"); - printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); +// printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); } fin.close(); @@ -732,6 +742,8 @@ bool llama_eval( return true; } +/* + int main(int argc, char ** argv) { const int64_t t_main_start_us = ggml_time_us(); @@ -761,57 +773,89 @@ int main(int argc, char ** argv) { gpt_vocab vocab; llama_model model; + */ + +void* llama_allocate_state() { + return new llama_state; +} + +void* llama_allocate_params(const char *input, int threads, int tokens) { + gpt_params* params = new gpt_params; + params->prompt = input; + params->n_threads = threads; + params->n_predict = tokens; + return params; +} + +void llama_free_params(void* params_ptr) { + gpt_params* params = (gpt_params*) params_ptr; + delete params; +} + +bool llama_bootstrap(const char *model_path, void* state_pr) // load the model { + llama_state* state = (llama_state*) state_pr; const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ?? 
- fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; + if (!llama_model_load(model_path, state->model, state->vocab, 512)) { // TODO: set context from user input ?? + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); + return false; } - t_load_us = ggml_time_us() - t_start_us; + state->timing.t_load_us = ggml_time_us() - t_start_us; + return true; } +int llama_predict(void* params_ptr, void* state_pr) { + gpt_params* params = (gpt_params*) params_ptr; + llama_state* state = (llama_state*) state_pr; + + const int64_t t_main_start_us = ggml_time_us(); int n_past = 0; - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; + state->timing.t_sample_us = 0; + state->timing.t_predict_us = 0; std::vector logits; // tokenize the prompt - std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); + std::vector embd_inp = ::llama_tokenize(state->vocab, params->prompt, true); - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + params->n_predict = std::min(params->n_predict, state->model.hparams.n_ctx - (int) embd_inp.size()); printf("\n"); - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); +// printf("%s: prompt: '%s'\n", __func__, params->prompt.c_str()); +// printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); +// for (int i = 0; i < (int) embd_inp.size(); i++) { +// printf("%6d -> '%s'\n", embd_inp[i], state->vocab.id_to_token.at(embd_inp[i]).c_str()); +// } +// printf("\n"); +// printf("sampling parameters: temp = %f, top_k = %d, top_p = %f\n", params->temp, params->top_k, params->top_p); +// printf("\n\n"); + + std::vector embd; + + if (params->seed < 0) { + params->seed = time(NULL); } - printf("\n"); - printf("sampling parameters: temp = %f, top_k = %d, top_p = %f\n", params.temp, params.top_k, params.top_p); - printf("\n\n"); - - std::vector embd; + std::mt19937 rng(params->seed); // determine the required inference memory per token: size_t mem_per_token = 0; - llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + llama_eval(state->model, params->n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + for (int i = embd.size(); i < embd_inp.size() + params->n_predict; i++) { // predict if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); - if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + if (!llama_eval(state->model, params->n_threads, n_past, embd, logits, mem_per_token)) { printf("Failed to predict\n"); return 1; } - t_predict_us += ggml_time_us() - t_start_us; + state->timing.t_predict_us += ggml_time_us() - t_start_us; } n_past += embd.size(); @@ -819,19 +863,19 @@ int main(int argc, char ** argv) { if (i >= embd_inp.size()) { // sample next token - const float top_p = params.top_p; - const float temp = params.temp; + const float top_p = params->top_p; + const float temp = params->temp; - const int n_vocab = model.hparams.n_vocab; + const int n_vocab = state->model.hparams.n_vocab; gpt_vocab::id id = 0; { const int64_t t_start_sample_us = ggml_time_us(); - id = llama_sample_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_p, temp, rng); + id = 
llama_sample_top_p(state->vocab, logits.data() + (logits.size() - n_vocab), top_p, temp, rng); - t_sample_us += ggml_time_us() - t_start_sample_us; + state->timing.t_sample_us += ggml_time_us() - t_start_sample_us; } // add it to the context @@ -840,7 +884,7 @@ int main(int argc, char ** argv) { // if here, it means we are still processing the input prompt for (int k = i; k < embd_inp.size(); k++) { embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { + if (embd.size() > params->n_batch) { break; } } @@ -848,31 +892,34 @@ int main(int argc, char ** argv) { } // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); + for (auto id: embd) { + printf("%s", state->vocab.id_to_token[id].c_str()); } fflush(stdout); // end of text token if (embd.back() == 2) { - printf(" [end of text]\n"); - break; +// printf(" [end of text]\n"); + return 2; } } // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } +// { +// const int64_t t_main_end_us = ggml_time_us(); +// +// printf("\n\n"); +// printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); +// printf("%s: load time = %8.2f ms\n", __func__, state->timing.t_load_us / 1000.0f); +// printf("%s: sample time = %8.2f ms\n", __func__, state->timing.t_sample_us / 1000.0f); +// printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, state->timing.t_predict_us / 1000.0f, state->timing.t_predict_us / 1000.0f / n_past); +// printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); +// } + return 0; +} - ggml_free(model.ctx); +void llama_finalize(llama_state &state) { + ggml_free(state.model.ctx); - return 0; +// return 0; } diff --git a/main.go b/main.go new file mode 100644 index 0000000000000..0dbaf2e8b2089 --- /dev/null +++ b/main.go @@ -0,0 +1,61 @@ +package main + +// #cgo CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mavx -mavx2 -mfma -mf16c -msse3 +// #cgo CXXFLAGS: -O3 -DNDEBUG -std=c++11 -fPIC -pthread -I. 
+// #include "main.h" +import "C" +import ( + "bufio" + "flag" + "fmt" + "os" +) + +func main() { + var model string + var threads, tokens int + + flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) + flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") + flags.IntVar(&threads, "t", 4, "number of threads to use during computation") + flags.IntVar(&tokens, "n", 128, "number of tokens to predict") + + err := flags.Parse(os.Args[1:]) + if err != nil { + fmt.Printf("Parsing program arguments failed: %s", err) + os.Exit(1) + } + + state := C.llama_allocate_state() + + fmt.Printf("Loading model %s...", model) + modelPath := C.CString(model) + success := C.llama_bootstrap(modelPath, state) + if !success { + fmt.Println("Loading the model failed") + os.Exit(1) + } + fmt.Printf("Model loaded successfully.\n\n") + + reader := bufio.NewReader(os.Stdin) + for { + fmt.Print("Enter prompt: ") + text, err := reader.ReadString('\n') + if err != nil { + fmt.Printf("Reading the prompt failed: %s", err) + os.Exit(1) + } + + input := C.CString(text) + params := C.llama_allocate_params(input, C.int(threads), C.int(tokens)) + result := C.llama_predict(params, state) + if result == 2 { + fmt.Println("Predicting failed") + os.Exit(1) + } + + C.llama_free_params(params) + + fmt.Printf("\n\n") + } +} diff --git a/main.h b/main.h new file mode 100644 index 0000000000000..a30304372125f --- /dev/null +++ b/main.h @@ -0,0 +1,19 @@ +// num.h +#ifdef __cplusplus +extern "C" { +#endif + +#include + +void *llama_allocate_state(); + +bool llama_bootstrap(const char *model_path, void *state_pr); + +void* llama_allocate_params(const char *input, int threads, int tokens); +void llama_free_params(void* params_ptr); + +int llama_predict(void* params_ptr, void* state_pr); + +#ifdef __cplusplus +} +#endif diff --git a/quantize.cpp b/quantize.cpp index 0ae537339ecf3..a2b2f574369e4 100644 --- a/quantize.cpp +++ b/quantize.cpp @@ -288,6 +288,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna // usage: // ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // +/* int main(int argc, char ** argv) { if (argc != 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); @@ -335,3 +336,4 @@ int main(int argc, char ** argv) { return 0; } +*/ From d0cc36c131f61e8d539156c2130959df631f553f Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 21:29:55 -0600 Subject: [PATCH 02/44] improve prompt --- main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index 0dbaf2e8b2089..718610f62b224 100644 --- a/main.go +++ b/main.go @@ -28,7 +28,7 @@ func main() { state := C.llama_allocate_state() - fmt.Printf("Loading model %s...", model) + fmt.Printf("Loading model %s...\n", model) modelPath := C.CString(model) success := C.llama_bootstrap(modelPath, state) if !success { @@ -39,7 +39,7 @@ func main() { reader := bufio.NewReader(os.Stdin) for { - fmt.Print("Enter prompt: ") + fmt.Print(">>> ") text, err := reader.ReadString('\n') if err != nil { fmt.Printf("Reading the prompt failed: %s", err) From 97a9a9aaa64c96e7819ea2bcf12dee456f4d96d6 Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 21:30:10 -0600 Subject: [PATCH 03/44] fix compilation of quantize --- Makefile | 3 +-- quantize.cpp | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 95a2e3b97cf4e..4dc4fa4d7e1d3 100644 --- a/Makefile +++ b/Makefile @@ 
-189,13 +189,12 @@ clean: main.o: ggml.o utils.o $(CXX) $(CXXFLAGS) -c main.cpp -o main.o $(LDFLAGS) - #./main -h libllama.a: main.o ggml.o utils.o ar src libllama.a main.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) + $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) # # Tests diff --git a/quantize.cpp b/quantize.cpp index a2b2f574369e4..f8b1c4440b3bd 100644 --- a/quantize.cpp +++ b/quantize.cpp @@ -288,7 +288,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna // usage: // ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // -/* +#ifdef QUANTIZE int main(int argc, char ** argv) { if (argc != 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); @@ -336,4 +336,4 @@ int main(int argc, char ** argv) { return 0; } -*/ +#endif From f090fdfb02f38ed7526a6d8d23f67fa53fb7458e Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 21:30:37 -0600 Subject: [PATCH 04/44] update readme --- README.md | 170 +++++++++++++----------------------------------------- 1 file changed, 39 insertions(+), 131 deletions(-) diff --git a/README.md b/README.md index 5194f6efc7b9d..d45b3e78f77c0 100644 --- a/README.md +++ b/README.md @@ -1,128 +1,39 @@ -# llama.cpp +# llama-go -Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in pure C/C++ +Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in Golang with embedded C/C++. -**Hot topics** +## Description -- Running on Windows: https://github.com/ggerganov/llama.cpp/issues/22 +This project embeds the work of [llama.cpp](https://github.com/ggerganov/llama.cpp) in a Golang binary. +The main goal is to run the model using 4-bit quantization using CPU on Consumer-Grade hardware. -## Description +At startup, the model is loaded and a prompt is offered to enter a prompt, +after the results have been printed another prompt can be entered. +The program can be quit using ctrl+c. -The main goal is to run the model using 4-bit quantization on a MacBook. - -- Plain C/C++ implementation without dependencies -- Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework -- AVX2 support for x86 architectures -- Mixed F16 / F32 precision -- 4-bit quantization support -- Runs on the CPU - -This was hacked in an evening - I have no idea if it works correctly. -Please do not make conclusions about the models based on the results from this implementation. -For all I know, it can be completely wrong. This project is for educational purposes and is not going to be maintained properly. -New features will probably be added mostly through community contributions, if any. - ---- - -Here is a typical run using LLaMA-7B: - -```java -make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 -I llama.cpp build info: -I UNAME_S: Darwin -I UNAME_P: arm -I UNAME_M: arm64 -I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -DGGML_USE_ACCELERATE -I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -I LDFLAGS: -framework Accelerate -I CC: Apple clang version 14.0.0 (clang-1400.0.29.202) -I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202) - -make: Nothing to be done for `default'. 
-main: seed = 1678486056 -llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ... -llama_model_load: n_vocab = 32000 -llama_model_load: n_ctx = 512 -llama_model_load: n_embd = 4096 -llama_model_load: n_mult = 256 -llama_model_load: n_head = 32 -llama_model_load: n_layer = 32 -llama_model_load: n_rot = 128 -llama_model_load: f16 = 2 -llama_model_load: n_ff = 11008 -llama_model_load: ggml ctx size = 4529.34 MB -llama_model_load: memory_size = 512.00 MB, n_mem = 16384 -llama_model_load: .................................... done -llama_model_load: model size = 4017.27 MB / num tensors = 291 - -main: prompt: 'Building a website can be done in 10 simple steps:' -main: number of tokens in prompt = 15 - 1 -> '' - 8893 -> 'Build' - 292 -> 'ing' - 263 -> ' a' - 4700 -> ' website' - 508 -> ' can' - 367 -> ' be' - 2309 -> ' done' - 297 -> ' in' - 29871 -> ' ' - 29896 -> '1' - 29900 -> '0' - 2560 -> ' simple' - 6576 -> ' steps' - 29901 -> ':' - -sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000 - - -Building a website can be done in 10 simple steps: -1) Select a domain name and web hosting plan -2) Complete a sitemap -3) List your products -4) Write product descriptions -5) Create a user account -6) Build the template -7) Start building the website -8) Advertise the website -9) Provide email support -10) Submit the website to search engines -A website is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves. -The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user's browser. -The web pages are stored in a web server. The web server is also called a host. When the website is accessed, it is retrieved from the server and displayed on the user's computer. -A website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server. -A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user's screen. -A website can also be viewed on different devices such as desktops, tablets and smartphones. -Hence, to have a website displayed on a browser, the website must be hosted. -A domain name is an address of a website. It is the name of the website. -The website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server. -A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user’s screen. -A website can also be viewed on different devices such as desktops, tablets and smartphones. Hence, to have a website displayed on a browser, the website must be hosted. -A domain name is an address of a website. It is the name of the website. -A website is an address of a website. It is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves. -The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user’s browser. -A website is known as a website when it is hosted - -main: mem per token = 14434244 bytes -main: load time = 1332.48 ms -main: sample time = 1081.40 ms -main: predict time = 31378.77 ms / 61.41 ms per token -main: total time = 34036.74 ms -``` +This project was tested on Linux but should be able to get to work on macOS as well. 
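The flow described above maps onto the small C API that this patch series exports from main.cpp via main.h. Condensed to its essentials, and using the patch-01 signatures (later commits extend them), the Go side looks roughly like the sketch below; `modelPath`, `threads`, `tokens`, and `reader` stand in for the values set up by flag parsing in main.go, and error handling is trimmed:

```go
state := C.llama_allocate_state() // opaque llama_state owned by the C++ side
if !C.llama_bootstrap(C.CString(modelPath), state) {
	fmt.Println("Loading the model failed") // weights are read once, at startup
	os.Exit(1)
}

for { // interactive loop; Ctrl+C (or EOF on stdin) ends the process
	prompt, _ := reader.ReadString('\n')
	params := C.llama_allocate_params(C.CString(prompt), C.int(threads), C.int(tokens))
	C.llama_predict(params, state) // tokens are printed from the C++ side as they are sampled
	C.llama_free_params(params)
}
```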
-And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: +## Requirements -https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4 +The memory requirements for the models are approximately: -## Usage +``` +7B -> 4 GB +13B -> 8 GB +30B -> 16 GB +65B -> 32 GB +``` + +## Installation Here are the step for the LLaMA-7B model: ```bash # build this repo -git clone https://github.com/ggerganov/llama.cpp -cd llama.cpp +git clone https://github.com/cornelk/llama-go +cd llama-go make +CGO_CFLAGS_ALLOW='-mf.*' go build . # obtain the original LLaMA model weights and place them in ./models ls ./models @@ -136,9 +47,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1 # quantize the model to 4-bits ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2 - -# run the inference -./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128 ``` For the bigger models, there are a few extra quantization steps. For example, for LLaMA-13B, converting to FP16 format @@ -156,12 +64,6 @@ You need to quantize each of them separately like this: ./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2 ``` -Everything else is the same. Simply run: - -```bash -./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128 -``` - The number of files generated for each model is as follows: ``` @@ -173,16 +75,22 @@ The number of files generated for each model is as follows: When running the larger models, make sure you have enough disk space to store all the intermediate files. -## Limitations +## Usage + +```bash +./llama-go -m ./models/13B/ggml-model-q4_0.bin -t 4 -n 128 -- Not sure if my tokenizer is correct. There are a few places where we might have a mistake: - - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87 - - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69 - In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that -- I don't know yet how much the quantization affects the quality of the generated text -- Probably the token sampling can be improved -- The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder, - there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simlpy don't - know how to utilize it properly. But in any case, you can even disable it with `LLAMA_NO_ACCELERATE=1 make` and the - performance will be the same, since no BLAS calls are invoked by the current implementation +Loading model ./models/13B/ggml-model-q4_0.bin... +Model loaded successfully. +>>> Some good pun names for a pet groomer: + +Some good pun names for a pet groomer: +Rub-a-Dub, Scooby Doo +Hair Force One +Duck and Cover, Two Fleas, One Duck +... 
+ +>>> + +``` From f86c433d466ea276497e612f44482be55db741ba Mon Sep 17 00:00:00 2001 From: cornelk Date: Sun, 12 Mar 2023 22:30:41 -0600 Subject: [PATCH 05/44] update readme --- README.md | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d45b3e78f77c0..3bfab0f0de264 100644 --- a/README.md +++ b/README.md @@ -18,16 +18,14 @@ This project was tested on Linux but should be able to get to work on macOS as w The memory requirements for the models are approximately: ``` -7B -> 4 GB -13B -> 8 GB -30B -> 16 GB -65B -> 32 GB +7B -> 4 GB (1 file) +13B -> 8 GB (2 files) +30B -> 16 GB (4 files) +65B -> 32 GB (8 files) ``` ## Installation -Here are the step for the LLaMA-7B model: - ```bash # build this repo git clone https://github.com/cornelk/llama-go @@ -35,12 +33,18 @@ cd llama-go make CGO_CFLAGS_ALLOW='-mf.*' go build . -# obtain the original LLaMA model weights and place them in ./models -ls ./models -65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model - # install Python dependencies python3 -m pip install torch numpy sentencepiece +``` + +Obtain the original LLaMA model weights and place them in ./models - +for example by using the https://github.com/shawwn/llama-dl script to download them. + +Use the following steps to convert the LLaMA-7B model to a format that is compatible: + +```bash +ls ./models +65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model # convert the 7B model to ggml FP16 format python3 convert-pth-to-ggml.py models/7B/ 1 @@ -64,15 +68,6 @@ You need to quantize each of them separately like this: ./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2 ``` -The number of files generated for each model is as follows: - -``` -7B -> 1 file -13B -> 2 files -30B -> 4 files -65B -> 8 files -``` - When running the larger models, make sure you have enough disk space to store all the intermediate files. ## Usage From deb304d6dbb3775740e4447951293fafa28ff331 Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 09:00:18 -0600 Subject: [PATCH 06/44] add llama-go compilation to makefile --- Makefile | 6 ++++-- README.md | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 4dc4fa4d7e1d3..e8ea69e737f7c 100644 --- a/Makefile +++ b/Makefile @@ -172,7 +172,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main.o quantize libllama.a +default: main.o quantize libllama.a llama-go # # Build library @@ -185,7 +185,7 @@ utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o clean: - rm -f *.o *.a quantize + rm -f *.o *.a quantize llama-go main.o: ggml.o utils.o $(CXX) $(CXXFLAGS) -c main.cpp -o main.o $(LDFLAGS) @@ -196,6 +196,8 @@ libllama.a: main.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) +llama-go: + CGO_CFLAGS_ALLOW='-mf.*' go build . # # Tests # diff --git a/README.md b/README.md index 3bfab0f0de264..6b1614ae872fb 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,6 @@ The memory requirements for the models are approximately: git clone https://github.com/cornelk/llama-go cd llama-go make -CGO_CFLAGS_ALLOW='-mf.*' go build . 
# install Python dependencies python3 -m pip install torch numpy sentencepiece From ea0ff166a0930c8c84c10070cf9f1d12ff9dd82f Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 13:08:37 -0600 Subject: [PATCH 07/44] reduce code changes --- main.cpp | 106 ++++++++++++++++++++++++++----------------------------- 1 file changed, 50 insertions(+), 56 deletions(-) diff --git a/main.cpp b/main.cpp index d1ec26e7cbc30..ea17e5f1dd3a2 100644 --- a/main.cpp +++ b/main.cpp @@ -136,18 +136,18 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; n_parts = LLAMA_N_PARTS.at(hparams.n_embd); - -// fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); -// fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); -// fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); -// fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); -// fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); -// fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); -// fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); -// fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); -// fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); -// fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); - } +/* + fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); + fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); + fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); +*/ } // load vocab { @@ -515,11 +515,11 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab fflush(stderr); }*/ } +/* + fprintf(stderr, " done\n"); -// fprintf(stderr, " done\n"); - -// fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); - } + fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); +*/ } fin.close(); } @@ -867,27 +867,27 @@ bool llama_bootstrap(const char *model_path, void* state_pr) } */ int llama_predict(void* params_ptr, void* state_pr) { - gpt_params* params = (gpt_params*) params_ptr; - llama_state* state = (llama_state*) state_pr; - if (params->seed < 0) { - params->seed = time(NULL); + gpt_params params = *(gpt_params*) params_ptr; + llama_state state = *(llama_state*) state_pr; + if (params.seed < 0) { + params.seed = time(NULL); } - std::mt19937 rng(params->seed); + std::mt19937 rng(params.seed); int n_past = 0; - state->timing.t_sample_us = 0; - state->timing.t_predict_us = 0; + state.timing.t_sample_us = 0; + state.timing.t_predict_us = 0; std::vector logits; // tokenize the prompt - std::vector embd_inp = ::llama_tokenize(state->vocab, params->prompt, true); + std::vector embd_inp = ::llama_tokenize(state.vocab, params.prompt, true); - params->n_predict = std::min(params->n_predict, state->model.hparams.n_ctx - (int) embd_inp.size()); + params.n_predict = std::min(params.n_predict, state.model.hparams.n_ctx - (int) embd_inp.size()); // 
tokenize the reverse prompt - std::vector antiprompt_inp = ::llama_tokenize(state->vocab, params->antiprompt, false); + std::vector antiprompt_inp = ::llama_tokenize(state.vocab, params.antiprompt, false); fprintf(stderr, "\n"); /*fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); @@ -896,7 +896,7 @@ int llama_predict(void* params_ptr, void* state_pr) { fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); } fprintf(stderr, "\n"); - if (params->interactive) { + if (params.interactive) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = sigint_handler; @@ -923,16 +923,16 @@ int llama_predict(void* params_ptr, void* state_pr) { // determine the required inference memory per token: size_t mem_per_token = 0; - llama_eval(state->model, params->n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); + llama_eval(state.model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - llama_eval(state->model, params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + llama_eval(state.model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - int last_n_size = params->repeat_last_n; + int last_n_size = params.repeat_last_n; std::vector last_n_tokens(last_n_size); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - if (params->interactive) { + if (params.interactive) { fprintf(stderr, "== Running in interactive mode. ==\n" #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) " - Press Ctrl+C to interject at any time.\n" @@ -941,17 +941,17 @@ int llama_predict(void* params_ptr, void* state_pr) { " - If you want to submit another line, end your input in '\\'.\n"); } - int remaining_tokens = params->n_predict; + int remaining_tokens = params.n_predict; int input_consumed = 0; bool input_noecho = false; // prompt user immediately after the starting prompt has been loaded - if (params->interactive_start) { + if (params.interactive_start) { is_interacting = true; } // set the color for the prompt which will be output initially - if (params->use_color) { + if (params.use_color) { printf(ANSI_COLOR_YELLOW); } @@ -960,12 +960,12 @@ int llama_predict(void* params_ptr, void* state_pr) { if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); - if (!llama_eval(state->model, params->n_threads, n_past, embd, logits, mem_per_token)) { + if (!llama_eval(state.model, params.n_threads, n_past, embd, logits, mem_per_token)) { fprintf(stderr, "Failed to predict\n"); return 1; } - state->timing.t_predict_us += ggml_time_us() - t_start_us; + state.timing.t_predict_us += ggml_time_us() - t_start_us; } n_past += embd.size(); @@ -973,24 +973,24 @@ int llama_predict(void* params_ptr, void* state_pr) { if (embd_inp.size() <= input_consumed) { // out of user input, sample next token - const float top_k = params->top_k; - const float top_p = params->top_p; - const float temp = params->temp; - const float repeat_penalty = params->repeat_penalty; + const float top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + const float repeat_penalty = params.repeat_penalty; - const int n_vocab = state->model.hparams.n_vocab; + const int n_vocab = state.model.hparams.n_vocab; gpt_vocab::id id = 0; { const int64_t t_start_sample_us = ggml_time_us(); - id = llama_sample_top_p_top_k(state->vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); + id = 
llama_sample_top_p_top_k(state.vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); - state->timing.t_sample_us += ggml_time_us() - t_start_sample_us; + state.timing.t_sample_us += ggml_time_us() - t_start_sample_us; } // add it to the context @@ -1008,13 +1008,13 @@ int llama_predict(void* params_ptr, void* state_pr) { last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(embd_inp[input_consumed]); ++input_consumed; - if (embd.size() > params->n_batch) { + if (embd.size() > params.n_batch) { break; } } // reset color to default if we there is no pending user input - if (!input_noecho && params->use_color && embd_inp.size() == input_consumed) { + if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) { printf(ANSI_COLOR_RESET); } } @@ -1022,14 +1022,14 @@ int llama_predict(void* params_ptr, void* state_pr) { // display text if (!input_noecho) { for (auto id : embd) { - printf("%s", state->vocab.id_to_token[id].c_str()); + printf("%s", state.vocab.id_to_token[id].c_str()); } fflush(stdout); } // in interactive mode, and not currently processing queued inputs; // check if we should prompt the user for more - if (params->interactive && embd_inp.size() <= input_consumed) { + if (params.interactive && embd_inp.size() <= input_consumed) { // check for reverse prompt if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) { // reverse prompt found @@ -1042,13 +1042,13 @@ int llama_predict(void* params_ptr, void* state_pr) { fflush(stdout); char buf[256] = {0}; int n_read; - if(params->use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); + if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { // presumable empty line, consume the newline scanf("%*c"); n_read=0; } - if(params->use_color) printf(ANSI_COLOR_RESET); + if(params.use_color) printf(ANSI_COLOR_RESET); if (n_read > 0 && buf[n_read-1]=='\\') { another_line = true; @@ -1060,7 +1060,7 @@ int llama_predict(void* params_ptr, void* state_pr) { buf[n_read+1] = 0; } - std::vector line_inp = ::llama_tokenize(state->vocab, buf, false); + std::vector line_inp = ::llama_tokenize(state.vocab, buf, false); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); remaining_tokens -= line_inp.size(); @@ -1096,9 +1096,3 @@ int llama_predict(void* params_ptr, void* state_pr) { */ return 0; } - -void llama_finalize(llama_state &state) { - ggml_free(state.model.ctx); - -// return 0; -} From f8f93b8d77d07c0ed15356838787f13eee9f532b Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 13:16:46 -0600 Subject: [PATCH 08/44] reduce code changes --- main.cpp | 27 ++++++++++++++++----------- main.h | 1 - 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/main.cpp b/main.cpp index ea17e5f1dd3a2..56c319d72b9ec 100644 --- a/main.cpp +++ b/main.cpp @@ -509,7 +509,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab total_size += ggml_nbytes(tensor)/n_parts; } /* - fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); if (++n_tensors % 8 == 0) { fprintf(stderr, "."); fflush(stderr); @@ -762,6 +762,7 @@ bool llama_eval( } static bool is_interacting = false; + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) void sigint_handler(int signo) { if (signo == SIGINT) { @@ -869,6 +870,10 @@ bool llama_bootstrap(const char *model_path, void* state_pr) int llama_predict(void* params_ptr, void* state_pr) { gpt_params params = *(gpt_params*) params_ptr; llama_state state = *(llama_state*) state_pr; + gpt_vocab vocab = state.vocab; + llama_model model = state.model; + + if (params.seed < 0) { params.seed = time(NULL); } @@ -882,12 +887,12 @@ int llama_predict(void* params_ptr, void* state_pr) { std::vector logits; // tokenize the prompt - std::vector embd_inp = ::llama_tokenize(state.vocab, params.prompt, true); + std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); - params.n_predict = std::min(params.n_predict, state.model.hparams.n_ctx - (int) embd_inp.size()); + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); // tokenize the reverse prompt - std::vector antiprompt_inp = ::llama_tokenize(state.vocab, params.antiprompt, false); + std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); fprintf(stderr, "\n"); /*fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); @@ -923,9 +928,9 @@ int llama_predict(void* params_ptr, void* state_pr) { // determine the required inference memory per token: size_t mem_per_token = 0; - llama_eval(state.model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); + llama_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - llama_eval(state.model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); int last_n_size = params.repeat_last_n; std::vector last_n_tokens(last_n_size); @@ -960,7 +965,7 @@ int llama_predict(void* params_ptr, void* state_pr) { if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); - if (!llama_eval(state.model, params.n_threads, n_past, embd, logits, mem_per_token)) { + if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { fprintf(stderr, "Failed to predict\n"); return 1; } @@ -978,14 +983,14 @@ int llama_predict(void* params_ptr, void* state_pr) { const float temp = params.temp; const float repeat_penalty = params.repeat_penalty; - const int n_vocab = state.model.hparams.n_vocab; + const int n_vocab = model.hparams.n_vocab; gpt_vocab::id id = 0; { const int64_t t_start_sample_us = ggml_time_us(); - id = llama_sample_top_p_top_k(state.vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); + id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); @@ -1022,7 +1027,7 @@ int llama_predict(void* params_ptr, void* state_pr) { // display text if (!input_noecho) { for (auto id : embd) { - printf("%s", state.vocab.id_to_token[id].c_str()); + printf("%s", vocab.id_to_token[id].c_str()); } fflush(stdout); } @@ -1060,7 +1065,7 @@ int llama_predict(void* params_ptr, void* state_pr) { buf[n_read+1] = 0; } - std::vector line_inp = ::llama_tokenize(state.vocab, buf, false); + std::vector line_inp = ::llama_tokenize(vocab, buf, false); 
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); remaining_tokens -= line_inp.size(); diff --git a/main.h b/main.h index a30304372125f..536ebd209018b 100644 --- a/main.h +++ b/main.h @@ -1,4 +1,3 @@ -// num.h #ifdef __cplusplus extern "C" { #endif From bf383913ab43b5845dce9da8982ade91a334af6f Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 13:21:31 -0600 Subject: [PATCH 09/44] simplify quantize instructions --- README.md | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/README.md b/README.md index 6b1614ae872fb..6ba7de8db8b6e 100644 --- a/README.md +++ b/README.md @@ -49,22 +49,7 @@ ls ./models python3 convert-pth-to-ggml.py models/7B/ 1 # quantize the model to 4-bits -./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2 -``` - -For the bigger models, there are a few extra quantization steps. For example, for LLaMA-13B, converting to FP16 format -will create 2 ggml files, instead of one: - -```bash -ggml-model-f16.bin -ggml-model-f16.bin.1 -``` - -You need to quantize each of them separately like this: - -```bash -./quantize ./models/13B/ggml-model-f16.bin ./models/13B/ggml-model-q4_0.bin 2 -./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2 +./quantize.sh 7B ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. From 9f9d3838913528bb10c17286fb24a4c467eac318 Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 13:42:25 -0600 Subject: [PATCH 10/44] do not print error on ctrl-c --- main.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/main.go b/main.go index 718610f62b224..cde0d43b45680 100644 --- a/main.go +++ b/main.go @@ -8,6 +8,7 @@ import ( "bufio" "flag" "fmt" + "io" "os" ) @@ -42,6 +43,9 @@ func main() { fmt.Print(">>> ") text, err := reader.ReadString('\n') if err != nil { + if err == io.EOF { + os.Exit(0) + } fmt.Printf("Reading the prompt failed: %s", err) os.Exit(1) } From 86770550bdb999515e87ba7608e357195538a67f Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 18:30:56 -0600 Subject: [PATCH 11/44] minor improvements --- Makefile | 9 +++++---- main.cpp | 45 ++++++++++++++++++++++----------------------- main.go | 6 +++--- main.h | 2 +- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/Makefile b/Makefile index b96d694e799bb..d2862bb6f05d1 100644 --- a/Makefile +++ b/Makefile @@ -189,10 +189,11 @@ utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o clean: - rm -f *.o *.a quantize llama-go + rm -f *.o main quantize + rm -f *.a llama-go -main.o: ggml.o utils.o - $(CXX) $(CXXFLAGS) -c main.cpp -o main.o $(LDFLAGS) +main.o: main.cpp ggml.o utils.o + $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main.o -c $(LDFLAGS) libllama.a: main.o ggml.o utils.o ar src libllama.a main.o ggml.o utils.o @@ -200,7 +201,7 @@ libllama.a: main.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) -llama-go: +llama-go: main.go main.cpp main.h CGO_CFLAGS_ALLOW='-mf.*' go build . 
# # Tests diff --git a/main.cpp b/main.cpp index 56c319d72b9ec..28074175675c7 100644 --- a/main.cpp +++ b/main.cpp @@ -827,37 +827,21 @@ int main(int argc, char ** argv) { */ -void* llama_allocate_state() { - return new llama_state; -} - -void* llama_allocate_params(const char *input, int threads, int tokens) { - gpt_params* params = new gpt_params; - params->prompt = input; - params->n_threads = threads; - params->n_predict = tokens; - return params; -} - -void llama_free_params(void* params_ptr) { - gpt_params* params = (gpt_params*) params_ptr; - delete params; -} - -bool llama_bootstrap(const char *model_path, void* state_pr) +int llama_bootstrap(const char *model_path, void* state_pr) // load the model { ggml_time_init(); llama_state* state = (llama_state*) state_pr; + const int64_t t_start_us = ggml_time_us(); if (!llama_model_load(model_path, state->model, state->vocab, 512)) { // TODO: set context from user input ?? fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); - return false; + return 1; } state->timing.t_load_us = ggml_time_us() - t_start_us; - return true; + return 0; } /* // print system information @@ -887,7 +871,7 @@ int llama_predict(void* params_ptr, void* state_pr) { std::vector logits; // tokenize the prompt - std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); + std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); @@ -928,8 +912,6 @@ int llama_predict(void* params_ptr, void* state_pr) { // determine the required inference memory per token: size_t mem_per_token = 0; - llama_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); int last_n_size = params.repeat_last_n; @@ -1101,3 +1083,20 @@ int llama_predict(void* params_ptr, void* state_pr) { */ return 0; } + +void* llama_allocate_state() { + return new llama_state; +} + +void* llama_allocate_params(const char *input, int threads, int tokens) { + gpt_params* params = new gpt_params; + params->prompt = input; + params->n_threads = threads; + params->n_predict = tokens; + return params; +} + +void llama_free_params(void* params_ptr) { + gpt_params* params = (gpt_params*) params_ptr; + delete params; +} diff --git a/main.go b/main.go index cde0d43b45680..c3f265ab30fe4 100644 --- a/main.go +++ b/main.go @@ -31,8 +31,8 @@ func main() { fmt.Printf("Loading model %s...\n", model) modelPath := C.CString(model) - success := C.llama_bootstrap(modelPath, state) - if !success { + result := C.llama_bootstrap(modelPath, state) + if result != 0 { fmt.Println("Loading the model failed") os.Exit(1) } @@ -52,7 +52,7 @@ func main() { input := C.CString(text) params := C.llama_allocate_params(input, C.int(threads), C.int(tokens)) - result := C.llama_predict(params, state) + result = C.llama_predict(params, state) if result == 2 { fmt.Println("Predicting failed") os.Exit(1) diff --git a/main.h b/main.h index 536ebd209018b..5cf54c62ce3d5 100644 --- a/main.h +++ b/main.h @@ -6,7 +6,7 @@ extern "C" { void *llama_allocate_state(); -bool llama_bootstrap(const char *model_path, void *state_pr); +int llama_bootstrap(const char *model_path, void *state_pr); void* llama_allocate_params(const char *input, int threads, int tokens); void llama_free_params(void* params_ptr); From b1f9a795505ad5db984c24ceed43a537ac63b836 Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 21:05:40 -0600 Subject: 
[PATCH 12/44] allow modifying parameters at runtime --- README.md | 6 +++ main.cpp | 11 +++++- main.go | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- main.h | 3 +- 4 files changed, 124 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6ba7de8db8b6e..e42c1a86f6c15 100644 --- a/README.md +++ b/README.md @@ -73,3 +73,9 @@ Duck and Cover, Two Fleas, One Duck >>> ``` + +The settings can be changed at runtime, multiple values are possible: +```bash +>>> seed=1234 threads=8 +Current settings: repeat_penalty=1.3 seed=1234 temp=0.8 threads=8 tokens=128 top_k=40 top_p=0.95 +``` diff --git a/main.cpp b/main.cpp index 28074175675c7..415b1d5bde62c 100644 --- a/main.cpp +++ b/main.cpp @@ -1088,11 +1088,18 @@ void* llama_allocate_state() { return new llama_state; } -void* llama_allocate_params(const char *input, int threads, int tokens) { +void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, + float top_p, float temp, float repeat_penalty) { gpt_params* params = new gpt_params; - params->prompt = input; + params->seed = seed; params->n_threads = threads; params->n_predict = tokens; + params->top_k = top_k; + params->top_p = top_p; + params->n_predict = tokens; + params->temp = temp; + params->repeat_penalty = repeat_penalty; + params->prompt = prompt; return params; } diff --git a/main.go b/main.go index c3f265ab30fe4..9226c59a06d70 100644 --- a/main.go +++ b/main.go @@ -10,11 +10,35 @@ import ( "fmt" "io" "os" + "reflect" + "sort" + "strconv" + "strings" +) + +var ( + seed = -1 + threads = 0 + tokens = 0 + + topK = 40 + topP = 0.95 + temp = 0.80 + repeatPenalty = 1.30 + + options = map[string]any{ + "repeat_penalty": &repeatPenalty, + "seed": &seed, + "temp": &temp, + "threads": &threads, + "tokens": &tokens, + "top_k": &topK, + "top_p": &topP, + } ) func main() { var model string - var threads, tokens int flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") @@ -36,9 +60,11 @@ func main() { fmt.Println("Loading the model failed") os.Exit(1) } - fmt.Printf("Model loaded successfully.\n\n") + fmt.Printf("Model loaded successfully.\n") + printSettings() reader := bufio.NewReader(os.Stdin) + for { fmt.Print(">>> ") text, err := reader.ReadString('\n') @@ -50,8 +76,18 @@ func main() { os.Exit(1) } + optionChanged, err := handleParameterChange(text) + if err != nil { + fmt.Printf("Reading the prompt failed: %s", err) + os.Exit(1) + } + if optionChanged { + continue + } + input := C.CString(text) - params := C.llama_allocate_params(input, C.int(threads), C.int(tokens)) + params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), + C.float(topP), C.float(temp), C.float(repeatPenalty)) result = C.llama_predict(params, state) if result == 2 { fmt.Println("Predicting failed") @@ -63,3 +99,71 @@ func main() { fmt.Printf("\n\n") } } + +// handleParameterChange parses the input for any parameter changes. +// This is a generic function that can handle int and float type parameters. +// The parameters need to be referenced by pointer in the options map. 
+func handleParameterChange(input string) (bool, error) { + optionChanged := false + words := strings.Split(input, " ") + + for _, word := range words { + parsed := strings.Split(word, "=") + + if len(parsed) < 2 { + break + } + + s := strings.TrimSpace(parsed[0]) + opt, ok := options[s] + if !ok { + break + } + + val := reflect.ValueOf(opt) + if val.Kind() != reflect.Ptr { + return false, fmt.Errorf("option %s is not a pointer", s) + } + val = val.Elem() + argument := strings.TrimSpace(parsed[1]) + optionChanged = true + + switch val.Kind() { + case reflect.Int: + i, err := strconv.ParseInt(argument, 10, 64) + if err != nil { + return false, fmt.Errorf("parsing value '%s' as int: %w", argument, err) + } + val.SetInt(i) + + case reflect.Float32, reflect.Float64: + f, err := strconv.ParseFloat(argument, 64) + if err != nil { + return false, fmt.Errorf("parsing value '%s' as float: %w", argument, err) + } + val.SetFloat(f) + + default: + return false, fmt.Errorf("unsupported option %s type %T", s, opt) + } + } + + if optionChanged { + printSettings() + } + return optionChanged, nil +} + +func printSettings() { + var settings sort.StringSlice + for setting, value := range options { + val := reflect.ValueOf(value) + if val.Kind() == reflect.Ptr { + val = val.Elem() + } + settings = append(settings, fmt.Sprintf("%s=%v", setting, val.Interface())) + } + sort.Sort(settings) + s := strings.Join(settings, " ") + fmt.Printf("Current settings: %s\n\n", s) +} diff --git a/main.h b/main.h index 5cf54c62ce3d5..f8881d82479f2 100644 --- a/main.h +++ b/main.h @@ -8,7 +8,8 @@ void *llama_allocate_state(); int llama_bootstrap(const char *model_path, void *state_pr); -void* llama_allocate_params(const char *input, int threads, int tokens); +void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, + float top_p, float temp, float repeat_penalty); void llama_free_params(void* params_ptr); int llama_predict(void* params_ptr, void* state_pr); From 65e4616f6f8670e691fb418a8d9288015bc940ba Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 21:10:25 -0600 Subject: [PATCH 13/44] change settings string and fix ci macos compilation --- README.md | 2 +- main.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e42c1a86f6c15..63e462356a5f6 100644 --- a/README.md +++ b/README.md @@ -77,5 +77,5 @@ Duck and Cover, Two Fleas, One Duck The settings can be changed at runtime, multiple values are possible: ```bash >>> seed=1234 threads=8 -Current settings: repeat_penalty=1.3 seed=1234 temp=0.8 threads=8 tokens=128 top_k=40 top_p=0.95 +Settings: repeat_penalty=1.3 seed=1234 temp=0.8 threads=8 tokens=128 top_k=40 top_p=0.95 ``` diff --git a/main.go b/main.go index 9226c59a06d70..11a16fb7d7877 100644 --- a/main.go +++ b/main.go @@ -26,7 +26,7 @@ var ( temp = 0.80 repeatPenalty = 1.30 - options = map[string]any{ + options = map[string]interface{}{ "repeat_penalty": &repeatPenalty, "seed": &seed, "temp": &temp, @@ -165,5 +165,5 @@ func printSettings() { } sort.Sort(settings) s := strings.Join(settings, " ") - fmt.Printf("Current settings: %s\n\n", s) + fmt.Printf("Settings: %s\n\n", s) } From 5594cda385e753928e1dc702b3ed2f57e1a99f8c Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 22:56:04 -0600 Subject: [PATCH 14/44] fix prediction result error handling --- main.cpp | 2 +- main.go | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/main.cpp b/main.cpp index 415b1d5bde62c..9817108821fca 100644 --- 
a/main.cpp +++ b/main.cpp @@ -948,7 +948,7 @@ int llama_predict(void* params_ptr, void* state_pr) { const int64_t t_start_us = ggml_time_us(); if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - fprintf(stderr, "Failed to predict\n"); +// fprintf(stderr, "Failed to predict\n"); return 1; } diff --git a/main.go b/main.go index 11a16fb7d7877..39790ec9fe326 100644 --- a/main.go +++ b/main.go @@ -89,8 +89,10 @@ func main() { params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), C.float(topP), C.float(temp), C.float(repeatPenalty)) result = C.llama_predict(params, state) - if result == 2 { - fmt.Println("Predicting failed") + switch result { + case 0, 2: + case 1: + fmt.Println("\nPredicting failed") os.Exit(1) } From 20080e8cdbc62f192cf944fe9b35da352cce059e Mon Sep 17 00:00:00 2001 From: cornelk Date: Mon, 13 Mar 2023 23:13:27 -0600 Subject: [PATCH 15/44] allow multiline input for prompts --- main.go | 56 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/main.go b/main.go index 39790ec9fe326..7af938f626c7f 100644 --- a/main.go +++ b/main.go @@ -66,8 +66,34 @@ func main() { reader := bufio.NewReader(os.Stdin) for { - fmt.Print(">>> ") - text, err := reader.ReadString('\n') + text := readMultiLineInput(reader) + + input := C.CString(text) + params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), + C.float(topP), C.float(temp), C.float(repeatPenalty)) + result = C.llama_predict(params, state) + switch result { + case 0: + case 1: + fmt.Println("\nPredicting failed") + os.Exit(1) + case 2: + fmt.Printf(" ") + } + + C.llama_free_params(params) + + fmt.Printf("\n\n") + } +} + +// readMultiLineInput reads input until an empty line is entered. +func readMultiLineInput(reader *bufio.Reader) string { + var lines []string + fmt.Print(">>> ") + + for { + line, err := reader.ReadString('\n') if err != nil { if err == io.EOF { os.Exit(0) @@ -76,30 +102,26 @@ func main() { os.Exit(1) } - optionChanged, err := handleParameterChange(text) + if len(strings.TrimSpace(line)) == 0 { + break + } + + optionChanged, err := handleParameterChange(line) if err != nil { fmt.Printf("Reading the prompt failed: %s", err) os.Exit(1) } if optionChanged { + lines = nil + fmt.Print(">>> ") continue } - input := C.CString(text) - params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), - C.float(topP), C.float(temp), C.float(repeatPenalty)) - result = C.llama_predict(params, state) - switch result { - case 0, 2: - case 1: - fmt.Println("\nPredicting failed") - os.Exit(1) - } - - C.llama_free_params(params) - - fmt.Printf("\n\n") + lines = append(lines, line) } + + text := strings.Join(lines, "") + return text } // handleParameterChange parses the input for any parameter changes. 
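Taken together, patches 12-15 give the interactive loop its runtime-settings mechanism: option names map to pointers, a line such as `seed=1234 threads=8` is split into `key=value` pairs, and reflection writes each parsed value through the matching pointer before the next prompt is read. The standalone sketch below illustrates just that pointer-map-plus-reflection idea outside the cgo REPL; the `applySettings` name and the trimmed-down option set are chosen here for illustration and do not appear in the repository.

```go
package main

import (
	"fmt"
	"reflect"
	"strconv"
	"strings"
)

// A reduced model of the options map from main.go: every entry is a pointer,
// so reflection can write the parsed value back into the package-level variable.
var (
	seed    = -1
	threads = 4
	topP    = 0.95
)

var options = map[string]interface{}{
	"seed":    &seed,
	"threads": &threads,
	"top_p":   &topP,
}

// applySettings parses "key=value" pairs and updates the pointed-to variables,
// mirroring what handleParameterChange does for the REPL.
func applySettings(input string) error {
	for _, word := range strings.Fields(input) {
		kv := strings.SplitN(word, "=", 2)
		if len(kv) != 2 {
			continue // not a key=value pair, ignore
		}
		ptr, ok := options[kv[0]]
		if !ok {
			continue // unknown option, ignore
		}
		val := reflect.ValueOf(ptr).Elem()
		switch val.Kind() {
		case reflect.Int:
			i, err := strconv.ParseInt(kv[1], 10, 64)
			if err != nil {
				return fmt.Errorf("parsing %q as int: %w", kv[1], err)
			}
			val.SetInt(i)
		case reflect.Float64:
			f, err := strconv.ParseFloat(kv[1], 64)
			if err != nil {
				return fmt.Errorf("parsing %q as float: %w", kv[1], err)
			}
			val.SetFloat(f)
		}
	}
	return nil
}

func main() {
	if err := applySettings("seed=1234 threads=8 top_p=0.9"); err != nil {
		panic(err)
	}
	fmt.Println(seed, threads, topP) // prints: 1234 8 0.9
}
```

Keeping the values behind pointers in a single map is what lets one generic parser serve every option and lets printSettings render them all in one sorted pass.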
From b94e476c3d353861fb1c4537af6af913efdf581c Mon Sep 17 00:00:00 2001 From: cornelk Date: Tue, 14 Mar 2023 18:43:15 -0600 Subject: [PATCH 16/44] allow setting of repeat_last_n --- main.cpp | 6 ++++-- main.go | 15 +++++++++------ main.h | 4 ++-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/main.cpp b/main.cpp index 9817108821fca..4aff0c90b813a 100644 --- a/main.cpp +++ b/main.cpp @@ -1089,16 +1089,18 @@ void* llama_allocate_state() { } void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, - float top_p, float temp, float repeat_penalty) { + float top_p, float temp, float repeat_penalty, int repeat_last_n) { gpt_params* params = new gpt_params; params->seed = seed; params->n_threads = threads; params->n_predict = tokens; + params->repeat_last_n = repeat_last_n; + params->top_k = top_k; params->top_p = top_p; - params->n_predict = tokens; params->temp = temp; params->repeat_penalty = repeat_penalty; + params->prompt = prompt; return params; } diff --git a/main.go b/main.go index 7af938f626c7f..7513a74db0abf 100644 --- a/main.go +++ b/main.go @@ -17,9 +17,10 @@ import ( ) var ( - seed = -1 - threads = 0 - tokens = 0 + repeatLastN = 64 + seed = -1 + threads = 4 + tokens = 128 topK = 40 topP = 0.95 @@ -27,11 +28,12 @@ var ( repeatPenalty = 1.30 options = map[string]interface{}{ + "repeat_last_n": &repeatLastN, // last n tokens to penalize "repeat_penalty": &repeatPenalty, - "seed": &seed, + "seed": &seed, // RNG seed, -1 will seed based on current time "temp": &temp, "threads": &threads, - "tokens": &tokens, + "tokens": &tokens, // new tokens to predict "top_k": &topK, "top_p": &topP, } @@ -70,7 +72,7 @@ func main() { input := C.CString(text) params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), - C.float(topP), C.float(temp), C.float(repeatPenalty)) + C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) result = C.llama_predict(params, state) switch result { case 0: @@ -178,6 +180,7 @@ func handleParameterChange(input string) (bool, error) { return optionChanged, nil } +// printSettings outputs the current settings, alphabetically sorted. 
func printSettings() { var settings sort.StringSlice for setting, value := range options { diff --git a/main.h b/main.h index f8881d82479f2..9e2ad5203de30 100644 --- a/main.h +++ b/main.h @@ -8,8 +8,8 @@ void *llama_allocate_state(); int llama_bootstrap(const char *model_path, void *state_pr); -void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, - float top_p, float temp, float repeat_penalty); +void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, + int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n); void llama_free_params(void* params_ptr); int llama_predict(void* params_ptr, void* state_pr); From 9f57e389da313e0f9fe74841963e41e00df5c03d Mon Sep 17 00:00:00 2001 From: cornelk Date: Tue, 14 Mar 2023 18:56:26 -0600 Subject: [PATCH 17/44] disable windows build in ci --- .github/workflows/build.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1a068ae75f966..c4377bef8fd4b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,19 +33,19 @@ jobs: run: | make - windows-latest: - runs-on: windows-latest - - steps: - - name: Clone - uses: actions/checkout@v1 - - - name: Build - run: | - mkdir build - cd build - cmake .. - cmake --build . --config Release +# windows-latest: +# runs-on: windows-latest +# +# steps: +# - name: Clone +# uses: actions/checkout@v1 +# +# - name: Build +# run: | +# mkdir build +# cd build +# cmake .. +# cmake --build . --config Release # ubuntu-latest-gcc: # runs-on: ubuntu-latest From 3406313d5a591dc9f693c1b3f9edf51038874059 Mon Sep 17 00:00:00 2001 From: Cornel Date: Tue, 14 Mar 2023 20:28:18 -0600 Subject: [PATCH 18/44] fix and reenable windows ci build (#1) --- .github/workflows/build.yml | 26 +++++++++++++------------- CMakeLists.txt | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c4377bef8fd4b..1a068ae75f966 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,19 +33,19 @@ jobs: run: | make -# windows-latest: -# runs-on: windows-latest -# -# steps: -# - name: Clone -# uses: actions/checkout@v1 -# -# - name: Build -# run: | -# mkdir build -# cd build -# cmake .. -# cmake --build . --config Release + windows-latest: + runs-on: windows-latest + + steps: + - name: Clone + uses: actions/checkout@v1 + + - name: Build + run: | + mkdir build + cd build + cmake .. + cmake --build . --config Release # ubuntu-latest-gcc: # runs-on: ubuntu-latest diff --git a/CMakeLists.txt b/CMakeLists.txt index ca3be38a55740..fc488f9eb715f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,7 +104,7 @@ endif() # set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF) # endif() -add_executable(llama +add_library(llama main.cpp utils.cpp utils.h) @@ -120,7 +120,7 @@ add_library(ggml target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS}) target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS}) -target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS}) +target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS} -DQUANTIZE) target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS}) target_include_directories(ggml PUBLIC .) 
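The next patch ("Make model return strings") is the largest change to the wrapper so far: main.cpp becomes lama.cpp, the interactive/ANSI-console code is stripped out, and llama_predict accumulates the generated tokens in a std::string and strcpy()s it into a buffer supplied by the caller instead of printing as it goes. The fragment below is a condensed, commented view of the matching Go call site; it is not self-contained and assumes the surrounding main.go from that patch (the `import "C"` preamble, the `state` pointer and the option variables), exactly as in the diff.

```go
// Condensed from the main.go changes in the next patch: the C++ side now
// copies its output into a caller-owned buffer rather than writing to stdout.
out := make([]byte, tokens) // buffer that llama_predict strcpy()s the result into
params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK),
	C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN))
C.llama_predict(params, state, (*C.char)(unsafe.Pointer(&out[0])))

// C.GoString stops at the first NUL byte; the model echoes the prompt back,
// so it is trimmed off before printing.
res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
res = strings.TrimPrefix(res, text)
C.llama_free_params(params)
```

Passing `(*C.char)(unsafe.Pointer(&out[0]))` hands the C++ side a pointer into Go-allocated memory for the duration of the call; sizing that buffer from the requested token count is the assumption the patch itself makes.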
From 6a646ea7515d8b6651f53a195f9a827c542591cb Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 21:19:22 +0100 Subject: [PATCH 19/44] Make model return strings --- Makefile | 14 ++--- main.cpp => lama.cpp | 121 ++++++------------------------------------- main.h => lama.h | 2 +- main.go | 22 ++++---- 4 files changed, 35 insertions(+), 124 deletions(-) rename main.cpp => lama.cpp (88%) rename main.h => lama.h (86%) diff --git a/Makefile b/Makefile index d2862bb6f05d1..6281674cce1bc 100644 --- a/Makefile +++ b/Makefile @@ -176,7 +176,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main.o quantize libllama.a llama-go +default: lama.o quantize libllama.a llama-go # # Build library @@ -189,19 +189,19 @@ utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o clean: - rm -f *.o main quantize + rm -f *.o lama quantize rm -f *.a llama-go -main.o: main.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main.o -c $(LDFLAGS) +lama.o: lama.cpp ggml.o utils.o + $(CXX) $(CXXFLAGS) lama.cpp ggml.o utils.o -o lama.o -c $(LDFLAGS) -libllama.a: main.o ggml.o utils.o - ar src libllama.a main.o ggml.o utils.o +libllama.a: lama.o ggml.o utils.o + ar src libllama.a lama.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) -llama-go: main.go main.cpp main.h +llama-go: main.go lama.cpp main.h CGO_CFLAGS_ALLOW='-mf.*' go build . # # Tests diff --git a/main.cpp b/lama.cpp similarity index 88% rename from main.cpp rename to lama.cpp index e9293c63a3428..efd2ac28ceff0 100644 --- a/main.cpp +++ b/lama.cpp @@ -1,5 +1,5 @@ #include "ggml.h" -#include "main.h" +#include "lama.h" #include "utils.h" #include @@ -855,7 +855,7 @@ int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx) params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } */ -int llama_predict(void* params_ptr, void* state_pr) { +int llama_predict(void* params_ptr, void* state_pr, char* result) { gpt_params params = *(gpt_params*) params_ptr; llama_state state = *(llama_state*) state_pr; gpt_vocab vocab = state.vocab; @@ -882,38 +882,8 @@ int llama_predict(void* params_ptr, void* state_pr) { // tokenize the reverse prompt std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); - fprintf(stderr, "\n"); - /*fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); - } - fprintf(stderr, "\n"); - if (params.interactive) { -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = sigint_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -#elif defined (_WIN32) - signal(SIGINT, sigint_handler); -#endif - - fprintf(stderr, "%s: interactive mode on.\n", __func__); - - if(antiprompt_inp.size()) { - fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); - fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); - for (int i = 0; i < (int) antiprompt_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); - } - fprintf(stderr, "\n"); - 
} - } - fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); - fprintf(stderr, "\n\n"); -*/ + //fprintf(stderr, "\n"); + std::vector embd; // determine the required inference memory per token: @@ -925,36 +895,25 @@ int llama_predict(void* params_ptr, void* state_pr) { std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - if (params.interactive) { - fprintf(stderr, "== Running in interactive mode. ==\n" -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - " - Press Ctrl+C to interject at any time.\n" -#endif - " - Press Return to return control to LLaMa.\n" - " - If you want to submit another line, end your input in '\\'.\n"); - } + int remaining_tokens = params.n_predict; int input_consumed = 0; bool input_noecho = false; - // prompt user immediately after the starting prompt has been loaded - if (params.interactive_start) { - is_interacting = true; - } + std::string res = ""; - // set the color for the prompt which will be output initially - if (params.use_color) { - printf(ANSI_COLOR_YELLOW); - } - - while (remaining_tokens > 0) { + while (true) { + if (params.n_predict != 0 && remaining_tokens <= 0) { + break; + } // predict if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { // fprintf(stderr, "Failed to predict\n"); + strcpy(result, res.c_str()); return 1; } @@ -1005,70 +964,21 @@ int llama_predict(void* params_ptr, void* state_pr) { break; } } - - // reset color to default if we there is no pending user input - if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) { - printf(ANSI_COLOR_RESET); - } } // display text if (!input_noecho) { for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - } - - // in interactive mode, and not currently processing queued inputs; - // check if we should prompt the user for more - if (params.interactive && embd_inp.size() <= input_consumed) { - // check for reverse prompt - if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) { - // reverse prompt found - is_interacting = true; - } - if (is_interacting) { - // currently being interactive - bool another_line=true; - while (another_line) { - fflush(stdout); - char buf[256] = {0}; - int n_read; - if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); - if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { - // presumable empty line, consume the newline - scanf("%*c"); - n_read=0; - } - if(params.use_color) printf(ANSI_COLOR_RESET); - - if (n_read > 0 && buf[n_read-1]=='\\') { - another_line = true; - buf[n_read-1] = '\n'; - buf[n_read] = 0; - } else { - another_line = false; - buf[n_read] = '\n'; - buf[n_read+1] = 0; - } - - std::vector line_inp = ::llama_tokenize(vocab, buf, false); - embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); - - remaining_tokens -= line_inp.size(); - - input_noecho = true; // do not echo this again - } - - is_interacting = false; + res += vocab.id_to_token[id].c_str(); } } + // end of text token if (embd.back() == 2) { // fprintf(stderr, " [end of text]\n"); - return 2; + // return 2; + break; } } /* @@ -1094,6 +1004,7 @@ int llama_predict(void* params_ptr, void* state_pr) { printf(ANSI_COLOR_RESET); } */ + strcpy(result, res.c_str()); return 0; } diff --git 
a/main.h b/lama.h similarity index 86% rename from main.h rename to lama.h index 2b6b77f5f4d43..fc708bce51031 100644 --- a/main.h +++ b/lama.h @@ -12,7 +12,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n); void llama_free_params(void* params_ptr); -int llama_predict(void* params_ptr, void* state_pr); +int llama_predict(void* params_ptr, void* state_pr, char* result); #ifdef __cplusplus } diff --git a/main.go b/main.go index afa210e164393..7a484ba961d49 100644 --- a/main.go +++ b/main.go @@ -2,7 +2,7 @@ package main // #cgo CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mavx -mavx2 -mfma -mf16c -msse3 // #cgo CXXFLAGS: -O3 -DNDEBUG -std=c++11 -fPIC -pthread -I. -// #include "main.h" +// #include "lama.h" import "C" import ( "bufio" @@ -11,9 +11,11 @@ import ( "io" "os" "reflect" + "runtime" "sort" "strconv" "strings" + "unsafe" ) var ( @@ -46,7 +48,7 @@ func main() { flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") - flags.IntVar(&threads, "t", 4, "number of threads to use during computation") + flags.IntVar(&threads, "t", runtime.NumCPU(), "number of threads to use during computation") flags.IntVar(&tokens, "n", 128, "number of tokens to predict") err := flags.Parse(os.Args[1:]) @@ -73,17 +75,14 @@ func main() { text := readMultiLineInput(reader) input := C.CString(text) + out := make([]byte, tokens) params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) - result = C.llama_predict(params, state) - switch result { - case 0: - case 1: - fmt.Println("\nPredicting failed") - os.Exit(1) - case 2: - fmt.Printf(" ") - } + C.llama_predict(params, state, (*C.char)(unsafe.Pointer(&out[0]))) + res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) + + res = strings.TrimPrefix(res, text) + fmt.Printf("\ngolang: %s\n", res) C.llama_free_params(params) @@ -125,6 +124,7 @@ func readMultiLineInput(reader *bufio.Reader) string { } text := strings.Join(lines, "") + fmt.Println("Sending", text) return text } From 00f34c1452aa2ee5a423efffd1dc6b21d5e659c8 Mon Sep 17 00:00:00 2001 From: Matvey Soloviev Date: Fri, 17 Mar 2023 05:48:39 +0100 Subject: [PATCH 20/44] Q4_1 quantization (#193) * Add AVX2 version of ggml_vec_dot_q4_1 * Small optimisations to q4_1 dot product (@Const-me) * Rearrange Q4_1 quantization to work for multipart models. 
(Fix #152) * Fix ggml_vec_mad_q4_1 too * Fix non-vectorised q4_1 vec mul --- ggml.c | 149 ++++++++++++++++++++++++++++++++++++++++++------------ utils.cpp | 20 +++++--- 2 files changed, 130 insertions(+), 39 deletions(-) diff --git a/ggml.c b/ggml.c index 535c7b7d281dd..c4f8389171026 100644 --- a/ggml.c +++ b/ggml.c @@ -607,10 +607,11 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { assert(k % QK == 0); const int nb = k / QK; + const size_t bs = 2*sizeof(float) + QK/2; - float * restrict pm = (float *) (y); - float * restrict pd = (float *) (pm + nb); - uint8_t * restrict pb = (uint8_t *) (pd + nb); + uint8_t * restrict pd = ((uint8_t *)y + 0*bs); + uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float)); + uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float)); uint8_t pp[QK/2]; @@ -627,8 +628,10 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - pm[i] = min; - pd[i] = d; + *(float *)pm = min; + *(float *)pd = d; + pm += bs; + pd += bs; for (int l = 0; l < QK; l += 2) { const float v0 = (x[i*QK + l + 0] - min)*id; @@ -643,7 +646,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb + i*QK/2, pp, sizeof(pp)); + memcpy(pb, pp, sizeof(pp)); + pb += bs; } } @@ -687,16 +691,17 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { assert(k % QK == 0); const int nb = k / QK; + const size_t bs = 2*sizeof(float) + QK/2; - const float * restrict pm = (const float *) (x); - const float * restrict pd = (const float *) (pm + nb); - const uint8_t * restrict pb = (const uint8_t *) (pd + nb); + const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { - const float m = pm[i]; - const float d = pd[i]; + const float d = *(const float *) (pd + i*bs); + const float m = *(const float *) (pm + i*bs); - const uint8_t * restrict pp = pb + i*QK/2; + const uint8_t * restrict pp = pb + i*bs; for (int l = 0; l < QK; l += 2) { const uint8_t vi = pp[l/2]; @@ -1584,28 +1589,109 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) { const int nb = n / QK; - const float * restrict pm0 = (const float *) x; - const float * restrict pm1 = (const float *) y; + const size_t bs = 2*sizeof(float) + QK/2; - const float * restrict pd0 = (const float *) (pm0 + nb); - const float * restrict pd1 = (const float *) (pm1 + nb); + const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs); + + const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float)); - const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb); - const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb); + const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); + const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float)); float sumf = 0.0; -#if 1 +#if defined(__AVX2__) +#if QK == 32 + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + // Accumulator for constant 
offsets + float acc_offset = 0.0f; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float * m0 = (const float *) (pm0 + i*bs); + const float * m1 = (const float *) (pm1 + i*bs); + + const float * d0 = (const float *) (pd0 + i*bs); + const float * d1 = (const float *) (pd1 + i*bs); + + const uint8_t * restrict p0 = pb0 + i*bs; + const uint8_t * restrict p1 = pb1 + i*bs; + + const __m256 d0v = _mm256_broadcast_ss( d0 ); + const __m256 d1v = _mm256_broadcast_ss( d1 ); + const __m256 m0v = _mm256_broadcast_ss( m0 ); + const __m256 m1v = _mm256_broadcast_ss( m1 ); + + + // Compute combined scale for the block + const __m256 scale_01 = _mm256_mul_ps( d0v, d1v ); + + // Compute cross scales for the block + const __m256 scale_0 = _mm256_mul_ps( d0v, m1v ); + const __m256 scale_1 = _mm256_mul_ps( m0v, d1v ); + const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + __m256i bx = bytesFromNibbles( p0 ); + __m256i by = bytesFromNibbles( p1 ); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. + + // Sign-extend first 16 signed bytes into int16_t + __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) ); + __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); + // Compute products of int16_t integers, add pairwise + __m256i i32 = _mm256_madd_epi16( x16, y16 ); + + // Sign-extend last 16 signed bytes into int16_t vectors + __m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) ); + __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); + // Accumulate products of int16_t integers + i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) ); + + // compute sums of unsigned bytes in bx, by in blocks of 8. + // This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000, + // which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400. 
+ // so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ] + __m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() ); + __m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() ); + __m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) ); + __m256 sums = _mm256_cvtepi32_ps( sumsi ); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps( i32 ); + // Apply the scale, and accumulate + // acc += d0*d1*x*y + d0*m1*x + d1*m0*y + acc = _mm256_fmadd_ps( scale_01, p, acc ); + acc = _mm256_fmadd_ps( cross_scales, sums, acc ); + // acc_offset += m0*m1 (for each entry in the block) + acc_offset += (*m0)*(*m1); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm256_extractf128_ps( acc, 1 ); + res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); + res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); + res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); + + sumf = _mm_cvtss_f32( res ) + acc_offset * QK; +#else +#error "not implemented for QK" +#endif +#else // scalar for (int i = 0; i < nb; i++) { - const float m0 = pm0[i]; - const float m1 = pm1[i]; + const float m0 = *(const float *) (pm0 + i*bs); + const float m1 = *(const float *) (pm1 + i*bs); - const float d0 = pd0[i]; - const float d1 = pd1[i]; + const float d0 = *(const float *) (pd0 + i*bs); + const float d1 = *(const float *) (pd1 + i*bs); - const uint8_t * restrict p0 = pb0 + i*QK/2; - const uint8_t * restrict p1 = pb1 + i*QK/2; + const uint8_t * restrict p0 = pb0 + i*bs; + const uint8_t * restrict p1 = pb1 + i*bs; for (int j = 0; j < QK/2; j++) { const uint8_t v0 = p0[j]; @@ -1839,16 +1925,17 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res assert(n % QK == 0); const int nb = n / QK; + const size_t bs = 2*sizeof(float) + QK/2; - const float * restrict pm = (const float *) (x); - const float * restrict pd = (const float *) (pm + nb); - const uint8_t * restrict pb = (const uint8_t *) (pd + nb); + const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { - const float m = pm[i]; - const float d = pd[i]; + const float d = *(const float *) (pd + i*bs); + const float m = *(const float *) (pm + i*bs); - const uint8_t * restrict pp = pb + i*QK/2; + const uint8_t * restrict pp = pb + i*bs; for (int l = 0; l < QK; l += 2) { const uint8_t vi = pp[l/2]; diff --git a/utils.cpp b/utils.cpp index aa3ad1053da02..26e313d5f1bf9 100644 --- a/utils.cpp +++ b/utils.cpp @@ -489,7 +489,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { const int nb = k / qk; - const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2); + const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2); + const size_t row_size = nb*bs; assert(k % qk == 0); @@ -498,10 +499,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t char * pdst = (char *) dst; - for (int j = 0; j < n; j += k) { - float * pm = (float *) (pdst + (j/k)*row_size); - float * pd = (float *) (pm + nb); - uint8_t * pb = (uint8_t *) (pd + nb); + for (int j = 0; j < n; j += k) { + uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); + uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); + uint8_t * pb 
= (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float)); //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); @@ -519,8 +520,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - pm[i] = min; - pd[i] = d; + *(float *) pd = d; + *(float *) pm = min; + pd += bs; + pm += bs; for (int l = 0; l < qk; l += 2) { const float v0 = (src[j + i*qk + l + 0] - min)*id; @@ -538,7 +541,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb + i*qk/2, pp, pp_size); + memcpy(pb, pp, pp_size); + pb += bs; } } } From d79f4df0d55a3153121d6e200fa7334cc2be0d4d Mon Sep 17 00:00:00 2001 From: thement <40525767+thement@users.noreply.github.com> Date: Fri, 17 Mar 2023 21:05:58 +0100 Subject: [PATCH 21/44] Implement non-greedy tokenizer that tries to maximize token lengths (#242) * Implement non-greedy tokenizer that tries to maximize token lengths * Insert single space in front of the prompt - this is to match original llama tokenizer behavior --------- Co-authored-by: Jakub Horak --- lama.cpp | 2 ++ utils.cpp | 68 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/lama.cpp b/lama.cpp index efd2ac28ceff0..d79f1e3b37d04 100644 --- a/lama.cpp +++ b/lama.cpp @@ -874,6 +874,8 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { std::vector logits; + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); // tokenize the prompt std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); diff --git a/utils.cpp b/utils.cpp index 26e313d5f1bf9..7539edd86d1a1 100644 --- a/utils.cpp +++ b/utils.cpp @@ -275,41 +275,57 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri return tokens; } +// TODO: Calculate this constant from the vocabulary +#define MAX_TOKEN_LEN 18 +// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { - //auto res = gpt_tokenize(vocab, text); - - //if (bos) { - // res.insert(res.begin(), 1); // TODO: replace with vocab.bos - //} - std::vector res; - - if (bos) { - res.push_back(1); // TODO: replace with vocab.bos - } - - //find the longest token that matches the text - int pos = 0; - while (true) { - int l = 0; - int t = 0; - for (const auto & kv : vocab.id_to_token) { - if (kv.second.size() < l) continue; - if (kv.second.size() > text.size() - pos) continue; - if (text.substr(pos, kv.second.size()) == kv.second) { - l = kv.second.size(); - t = kv.first; + std::vector score; + std::vector prev; + int len = text.length(); + + score.resize(len + 1); + prev.resize(len + 1); + + // Forward pass + for (int i = 0; i < len; i++) { + int max_len = std::min(len - i, MAX_TOKEN_LEN); + for (int sub_len = 1; sub_len <= len - i; sub_len++) { + auto sub = text.substr(i, sub_len); + auto token = vocab.token_to_id.find(sub); + if (token != vocab.token_to_id.end()) { + int token_score = sub.length() * sub.length(); + int local_score = score[i] + token_score; + int next = i + sub_len; + if (score[next] < local_score) { + score[next] = local_score; + prev[next] = (*token).second; + } } } + } - if (l == 0) { - break; + // Backward pass + int i = len; + 
while (i > 0) { + gpt_vocab::id token_id = prev[i]; + if (token_id == 0) { + // TODO: Return error or something more meaningful + printf("failed to tokenize string!\n"); + break; } + res.push_back(token_id); + auto token = (*vocab.id_to_token.find(token_id)).second; + i -= token.length(); + } - res.push_back(t); - pos += l; + if (bos) { + res.push_back(1); // TODO: replace with vocab.bos } + // Pieces are in reverse order so correct that + std::reverse(res.begin(), res.end()); + return res; } From 655d8dd0e6b3bcbf7ad6c6aceeab7ebdac0f7068 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 21:24:34 +0100 Subject: [PATCH 22/44] Fix makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6281674cce1bc..cb9ccd644fd16 100644 --- a/Makefile +++ b/Makefile @@ -201,7 +201,7 @@ libllama.a: lama.o ggml.o utils.o quantize: quantize.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) -DQUANTIZE quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) -llama-go: main.go lama.cpp main.h +llama-go: main.go lama.cpp lama.h CGO_CFLAGS_ALLOW='-mf.*' go build . # # Tests From d04e7fc005e358d52a4a5b27880e83eefc933360 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 22:09:13 +0100 Subject: [PATCH 23/44] Update README --- README.md | 83 ++++++++++--------------------------------------------- 1 file changed, 14 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 63e462356a5f6..b521ce6ee934d 100644 --- a/README.md +++ b/README.md @@ -1,81 +1,26 @@ # llama-go -Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in Golang with embedded C/C++. +This is [llama.cpp](https://github.com/ggerganov/llama.cpp) port in golang to use as a library. -## Description - -This project embeds the work of [llama.cpp](https://github.com/ggerganov/llama.cpp) in a Golang binary. -The main goal is to run the model using 4-bit quantization using CPU on Consumer-Grade hardware. - -At startup, the model is loaded and a prompt is offered to enter a prompt, -after the results have been printed another prompt can be entered. -The program can be quit using ctrl+c. - -This project was tested on Linux but should be able to get to work on macOS as well. - -## Requirements - -The memory requirements for the models are approximately: - -``` -7B -> 4 GB (1 file) -13B -> 8 GB (2 files) -30B -> 16 GB (4 files) -65B -> 32 GB (8 files) -``` - -## Installation - -```bash -# build this repo -git clone https://github.com/cornelk/llama-go -cd llama-go -make +## Usage -# install Python dependencies -python3 -m pip install torch numpy sentencepiece ``` - -Obtain the original LLaMA model weights and place them in ./models - -for example by using the https://github.com/shawwn/llama-dl script to download them. - -Use the following steps to convert the LLaMA-7B model to a format that is compatible: - -```bash -ls ./models -65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model - -# convert the 7B model to ggml FP16 format -python3 convert-pth-to-ggml.py models/7B/ 1 - -# quantize the model to 4-bits -./quantize.sh 7B +git clone XXX +cd XXX +make libllama.a +LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples/main.go -m ggml-alpaca-7b-q4.bin -n 10 ``` -When running the larger models, make sure you have enough disk space to store all the intermediate files. +## Model -## Usage +For a tiny model, you can use https://github.com/antimatter15/alpaca.cpp . 
-```bash -./llama-go -m ./models/13B/ggml-model-q4_0.bin -t 4 -n 128 +## License -Loading model ./models/13B/ggml-model-q4_0.bin... -Model loaded successfully. +MIT ->>> Some good pun names for a pet groomer: +## Acknowledgements -Some good pun names for a pet groomer: -Rub-a-Dub, Scooby Doo -Hair Force One -Duck and Cover, Two Fleas, One Duck -... - ->>> - -``` - -The settings can be changed at runtime, multiple values are possible: -```bash ->>> seed=1234 threads=8 -Settings: repeat_penalty=1.3 seed=1234 temp=0.8 threads=8 tokens=128 top_k=40 top_p=0.95 -``` +- [llama.cpp](https://github.com/ggerganov/llama.cpp) +- https://github.com/cornelk/llama-go for the initial ideas +- https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!) \ No newline at end of file From 2723820af86ca4366553725c20b1df41bcb23388 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 23:43:08 +0100 Subject: [PATCH 24/44] Rename import --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index b5878754e2ace..6a81f5b887b72 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ -module github.com/cornelk/llama-go +module github.com/go-skynet/llama-go go 1.19 From 2a81f9b893650f964dce74e1b61edc0dcd39edf9 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 23:43:15 +0100 Subject: [PATCH 25/44] Move to a lib --- examples/main.go | 81 +++++++++++++++++++ go/llama.go | 65 ++++++++++++++++ main.go | 198 ----------------------------------------------- 3 files changed, 146 insertions(+), 198 deletions(-) create mode 100644 examples/main.go create mode 100644 go/llama.go delete mode 100644 main.go diff --git a/examples/main.go b/examples/main.go new file mode 100644 index 0000000000000..bd1125cbad5f1 --- /dev/null +++ b/examples/main.go @@ -0,0 +1,81 @@ +package main + +import ( + "bufio" + "flag" + "fmt" + "io" + "os" + "runtime" + "strings" + + llama "github.com/go-skynet/llama-go/go" +) + +var ( + threads = 4 + tokens = 128 +) + +func main() { + var model string + + flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) + flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") + flags.IntVar(&threads, "t", runtime.NumCPU(), "number of threads to use during computation") + flags.IntVar(&tokens, "n", 128, "number of tokens to predict") + + err := flags.Parse(os.Args[1:]) + if err != nil { + fmt.Printf("Parsing program arguments failed: %s", err) + os.Exit(1) + } + l := &llama.LLama{} + err = l.Load(model) + if err != nil { + fmt.Println("Loading the model failed:", err.Error()) + os.Exit(1) + } + fmt.Printf("Model loaded successfully.\n") + + reader := bufio.NewReader(os.Stdin) + + for { + text := readMultiLineInput(reader) + + res, err := l.Predict(threads, tokens, text) + if err != nil { + panic(err) + } + fmt.Printf("\ngolang: %s\n", res) + + fmt.Printf("\n\n") + } +} + +// readMultiLineInput reads input until an empty line is entered. 
+func readMultiLineInput(reader *bufio.Reader) string { + var lines []string + fmt.Print(">>> ") + + for { + line, err := reader.ReadString('\n') + if err != nil { + if err == io.EOF { + os.Exit(0) + } + fmt.Printf("Reading the prompt failed: %s", err) + os.Exit(1) + } + + if len(strings.TrimSpace(line)) == 0 { + break + } + + lines = append(lines, line) + } + + text := strings.Join(lines, "") + fmt.Println("Sending", text) + return text +} diff --git a/go/llama.go b/go/llama.go new file mode 100644 index 0000000000000..f8f8c47f5b07c --- /dev/null +++ b/go/llama.go @@ -0,0 +1,65 @@ +package llama + +// #cgo LDFLAGS: -lllama -lm -lstdc++ +// #include +import "C" +import ( + "fmt" + "strings" + "unsafe" +) + +var ( + repeatLastN = 64 + seed = -1 + threads = 4 + tokens = 128 + + topK = 40 + topP = 0.95 + temp = 0.80 + repeatPenalty = 1.30 + + nCtx = 512 // context size + + options = map[string]interface{}{ + "repeat_last_n": &repeatLastN, // last n tokens to penalize + "repeat_penalty": &repeatPenalty, + "seed": &seed, // RNG seed, -1 will seed based on current time + "temp": &temp, + "threads": &threads, + "tokens": &tokens, // new tokens to predict + "top_k": &topK, + "top_p": &topP, + } +) + +type LLama struct { + state unsafe.Pointer +} + +func (l *LLama) Load(model string) error { + state := C.llama_allocate_state() + modelPath := C.CString(model) + result := C.llama_bootstrap(modelPath, state, C.int(nCtx)) + if result != 0 { + return fmt.Errorf("failed loading model") + } + l.state = state + return nil +} + +func (l *LLama) Predict(threads int, tokens int, text string) (string, error) { + input := C.CString(text) + out := make([]byte, tokens) + params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), + C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) + C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) + res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) + + res = strings.TrimPrefix(res, " "+text) + + C.llama_free_params(params) + + return res, nil +} diff --git a/main.go b/main.go deleted file mode 100644 index 7a484ba961d49..0000000000000 --- a/main.go +++ /dev/null @@ -1,198 +0,0 @@ -package main - -// #cgo CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mavx -mavx2 -mfma -mf16c -msse3 -// #cgo CXXFLAGS: -O3 -DNDEBUG -std=c++11 -fPIC -pthread -I. 
-// #include "lama.h" -import "C" -import ( - "bufio" - "flag" - "fmt" - "io" - "os" - "reflect" - "runtime" - "sort" - "strconv" - "strings" - "unsafe" -) - -var ( - repeatLastN = 64 - seed = -1 - threads = 4 - tokens = 128 - - topK = 40 - topP = 0.95 - temp = 0.80 - repeatPenalty = 1.30 - - nCtx = 512 // context size - - options = map[string]interface{}{ - "repeat_last_n": &repeatLastN, // last n tokens to penalize - "repeat_penalty": &repeatPenalty, - "seed": &seed, // RNG seed, -1 will seed based on current time - "temp": &temp, - "threads": &threads, - "tokens": &tokens, // new tokens to predict - "top_k": &topK, - "top_p": &topP, - } -) - -func main() { - var model string - - flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError) - flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load") - flags.IntVar(&threads, "t", runtime.NumCPU(), "number of threads to use during computation") - flags.IntVar(&tokens, "n", 128, "number of tokens to predict") - - err := flags.Parse(os.Args[1:]) - if err != nil { - fmt.Printf("Parsing program arguments failed: %s", err) - os.Exit(1) - } - - state := C.llama_allocate_state() - - fmt.Printf("Loading model %s...\n", model) - modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(nCtx)) - if result != 0 { - fmt.Println("Loading the model failed") - os.Exit(1) - } - fmt.Printf("Model loaded successfully.\n") - - printSettings() - reader := bufio.NewReader(os.Stdin) - - for { - text := readMultiLineInput(reader) - - input := C.CString(text) - out := make([]byte, tokens) - params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), - C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) - C.llama_predict(params, state, (*C.char)(unsafe.Pointer(&out[0]))) - res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) - - res = strings.TrimPrefix(res, text) - fmt.Printf("\ngolang: %s\n", res) - - C.llama_free_params(params) - - fmt.Printf("\n\n") - } -} - -// readMultiLineInput reads input until an empty line is entered. -func readMultiLineInput(reader *bufio.Reader) string { - var lines []string - fmt.Print(">>> ") - - for { - line, err := reader.ReadString('\n') - if err != nil { - if err == io.EOF { - os.Exit(0) - } - fmt.Printf("Reading the prompt failed: %s", err) - os.Exit(1) - } - - if len(strings.TrimSpace(line)) == 0 { - break - } - - optionChanged, err := handleParameterChange(line) - if err != nil { - fmt.Printf("Reading the prompt failed: %s", err) - os.Exit(1) - } - if optionChanged { - lines = nil - fmt.Print(">>> ") - continue - } - - lines = append(lines, line) - } - - text := strings.Join(lines, "") - fmt.Println("Sending", text) - return text -} - -// handleParameterChange parses the input for any parameter changes. -// This is a generic function that can handle int and float type parameters. -// The parameters need to be referenced by pointer in the options map. 
-func handleParameterChange(input string) (bool, error) { - optionChanged := false - words := strings.Split(input, " ") - - for _, word := range words { - parsed := strings.Split(word, "=") - - if len(parsed) < 2 { - break - } - - s := strings.TrimSpace(parsed[0]) - opt, ok := options[s] - if !ok { - break - } - - val := reflect.ValueOf(opt) - if val.Kind() != reflect.Ptr { - return false, fmt.Errorf("option %s is not a pointer", s) - } - val = val.Elem() - argument := strings.TrimSpace(parsed[1]) - optionChanged = true - - switch val.Kind() { - case reflect.Int: - i, err := strconv.ParseInt(argument, 10, 64) - if err != nil { - return false, fmt.Errorf("parsing value '%s' as int: %w", argument, err) - } - val.SetInt(i) - - case reflect.Float32, reflect.Float64: - f, err := strconv.ParseFloat(argument, 64) - if err != nil { - return false, fmt.Errorf("parsing value '%s' as float: %w", argument, err) - } - val.SetFloat(f) - - default: - return false, fmt.Errorf("unsupported option %s type %T", s, opt) - } - } - - if optionChanged { - printSettings() - } - return optionChanged, nil -} - -// printSettings outputs the current settings, alphabetically sorted. -func printSettings() { - var settings sort.StringSlice - for setting, value := range options { - val := reflect.ValueOf(value) - if val.Kind() == reflect.Ptr { - val = val.Elem() - } - settings = append(settings, fmt.Sprintf("%s=%v", setting, val.Interface())) - } - sort.Sort(settings) - s := strings.Join(settings, " ") - fmt.Printf("Settings: %s\n\n", s) -} From bd93264ef1b07d9cd5f961d5817e1e860a768258 Mon Sep 17 00:00:00 2001 From: mudler Date: Fri, 17 Mar 2023 23:45:03 +0100 Subject: [PATCH 26/44] Rename imports --- README.md | 4 ++-- examples/main.go | 2 +- go.mod | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b521ce6ee934d..debad35532950 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ This is [llama.cpp](https://github.com/ggerganov/llama.cpp) port in golang to us ## Usage ``` -git clone XXX -cd XXX +git clone https://github.com/go-skynet/llama.git +cd llama make libllama.a LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples/main.go -m ggml-alpaca-7b-q4.bin -n 10 ``` diff --git a/examples/main.go b/examples/main.go index bd1125cbad5f1..d104f8277d02a 100644 --- a/examples/main.go +++ b/examples/main.go @@ -9,7 +9,7 @@ import ( "runtime" "strings" - llama "github.com/go-skynet/llama-go/go" + llama "github.com/go-skynet/llama/go" ) var ( diff --git a/go.mod b/go.mod index 6a81f5b887b72..85c809e1d8e21 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ -module github.com/go-skynet/llama-go +module github.com/go-skynet/llama go 1.19 From 5c6896a14de0dbe0ec9d7d06f08116d4b6d03592 Mon Sep 17 00:00:00 2001 From: mudler Date: Sat, 18 Mar 2023 00:05:25 +0100 Subject: [PATCH 27/44] Add options --- examples/main.go | 5 ++- go/llama.go | 49 +++++++++------------------ go/options.go | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 37 deletions(-) create mode 100644 go/options.go diff --git a/examples/main.go b/examples/main.go index d104f8277d02a..71034b53fd81c 100644 --- a/examples/main.go +++ b/examples/main.go @@ -30,8 +30,7 @@ func main() { fmt.Printf("Parsing program arguments failed: %s", err) os.Exit(1) } - l := &llama.LLama{} - err = l.Load(model) + l, err := llama.New(model, 0) if err != nil { fmt.Println("Loading the model failed:", err.Error()) os.Exit(1) @@ 
-43,7 +42,7 @@ func main() { for { text := readMultiLineInput(reader) - res, err := l.Predict(threads, tokens, text) + res, err := l.Predict(text, llama.SetTokens(tokens), llama.SetThreads(threads)) if err != nil { panic(err) } diff --git a/go/llama.go b/go/llama.go index f8f8c47f5b07c..2e13b5bf67ddd 100644 --- a/go/llama.go +++ b/go/llama.go @@ -9,51 +9,32 @@ import ( "unsafe" ) -var ( - repeatLastN = 64 - seed = -1 - threads = 4 - tokens = 128 - - topK = 40 - topP = 0.95 - temp = 0.80 - repeatPenalty = 1.30 - - nCtx = 512 // context size - - options = map[string]interface{}{ - "repeat_last_n": &repeatLastN, // last n tokens to penalize - "repeat_penalty": &repeatPenalty, - "seed": &seed, // RNG seed, -1 will seed based on current time - "temp": &temp, - "threads": &threads, - "tokens": &tokens, // new tokens to predict - "top_k": &topK, - "top_p": &topP, - } -) - type LLama struct { state unsafe.Pointer } -func (l *LLama) Load(model string) error { +func New(model string, ctxSize int) (*LLama, error) { + if ctxSize == 0 { + ctxSize = 512 + } state := C.llama_allocate_state() modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(nCtx)) + result := C.llama_bootstrap(modelPath, state, C.int(ctxSize)) if result != 0 { - return fmt.Errorf("failed loading model") + return nil, fmt.Errorf("failed loading model") } - l.state = state - return nil + + return &LLama{state: state}, nil } -func (l *LLama) Predict(threads int, tokens int, text string) (string, error) { +func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { + + po := NewPredictOptions(opts...) + input := C.CString(text) - out := make([]byte, tokens) - params := C.llama_allocate_params(input, C.int(seed), C.int(threads), C.int(tokens), C.int(topK), - C.float(topP), C.float(temp), C.float(repeatPenalty), C.int(repeatLastN)) + out := make([]byte, po.Tokens) + params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat)) C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) diff --git a/go/options.go b/go/options.go new file mode 100644 index 0000000000000..a98982996177a --- /dev/null +++ b/go/options.go @@ -0,0 +1,86 @@ +package llama + +import "runtime" + +type PredictOptions struct { + Seed, Threads, Tokens, TopK, Repeat int + TopP, Temperature, Penalty float64 +} + +type PredictOption func(p *PredictOptions) + +var DefaultOptions PredictOptions = PredictOptions{ + Seed: -1, + Threads: runtime.NumCPU(), + Tokens: 128, + TopK: 40, + TopP: 0.95, + Temperature: 0.80, + Penalty: 1.3, + Repeat: 64, +} + +// SetSeed sets the random seed for sampling text generation. +func SetSeed(seed int) PredictOption { + return func(p *PredictOptions) { + p.Seed = seed + } +} + +// SetThreads sets the number of threads to use for text generation. +func SetThreads(threads int) PredictOption { + return func(p *PredictOptions) { + p.Threads = threads + } +} + +// SetTokens sets the number of tokens to generate. +func SetTokens(tokens int) PredictOption { + return func(p *PredictOptions) { + p.Tokens = tokens + } +} + +// SetTopK sets the value for top-K sampling. +func SetTopK(topk int) PredictOption { + return func(p *PredictOptions) { + p.TopK = topk + } +} + +// SetTopP sets the value for nucleus sampling. 
+func SetTopP(topp float64) PredictOption { + return func(p *PredictOptions) { + p.TopP = topp + } +} + +// SetTemperature sets the temperature value for text generation. +func SetTemperature(temp float64) PredictOption { + return func(p *PredictOptions) { + p.Temperature = temp + } +} + +// SetPenalty sets the repetition penalty for text generation. +func SetPenalty(penalty float64) PredictOption { + return func(p *PredictOptions) { + p.Penalty = penalty + } +} + +// SetRepeat sets the number of times to repeat text generation. +func SetRepeat(repeat int) PredictOption { + return func(p *PredictOptions) { + p.Repeat = repeat + } +} + +// Create a new PredictOptions object with the given options. +func NewPredictOptions(opts ...PredictOption) PredictOptions { + p := DefaultOptions + for _, opt := range opts { + opt(&p) + } + return p +} From 56080ad745d1807e21ad3db421a005cfdc398dde Mon Sep 17 00:00:00 2001 From: mudler Date: Sat, 18 Mar 2023 11:17:59 +0100 Subject: [PATCH 28/44] Return errors if inference fails Signed-off-by: mudler --- go/llama.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/go/llama.go b/go/llama.go index 2e13b5bf67ddd..ededf51e86631 100644 --- a/go/llama.go +++ b/go/llama.go @@ -35,7 +35,10 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { out := make([]byte, po.Tokens) params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat)) - C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) + ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) + if ret != 0 { + return "", fmt.Errorf("inference failed") + } res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) res = strings.TrimPrefix(res, " "+text) From 8865f10a77344cd06beac89666428c654391a327 Mon Sep 17 00:00:00 2001 From: Gary Linscott Date: Sat, 18 Mar 2023 04:17:19 -0700 Subject: [PATCH 29/44] Fix n^2 loop in tokenization (#254) This causes long prompts to parse very slowly. 
--- utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.cpp b/utils.cpp index 7539edd86d1a1..70115d04ccf23 100644 --- a/utils.cpp +++ b/utils.cpp @@ -290,7 +290,7 @@ std::vector llama_tokenize(const gpt_vocab & vocab, const std::st // Forward pass for (int i = 0; i < len; i++) { int max_len = std::min(len - i, MAX_TOKEN_LEN); - for (int sub_len = 1; sub_len <= len - i; sub_len++) { + for (int sub_len = 1; sub_len <= max_len; sub_len++) { auto sub = text.substr(i, sub_len); auto token = vocab.token_to_id.find(sub); if (token != vocab.token_to_id.end()) { From 8b9e5375998fb2aaac20569d62c1a047a5c20a28 Mon Sep 17 00:00:00 2001 From: Alex Nguyen Date: Sat, 18 Mar 2023 20:51:49 +0700 Subject: [PATCH 30/44] Remove unused code since n_vocab is model.hparams.n_vocab (#262) --- lama.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/lama.cpp b/lama.cpp index d79f1e3b37d04..3696ee0080f97 100644 --- a/lama.cpp +++ b/lama.cpp @@ -153,16 +153,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab // load vocab { - const int32_t n_vocab = model.hparams.n_vocab; - - if (n_vocab != model.hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); - return false; - } - std::string word; - for (int i = 0; i < n_vocab; i++) { + for (int i = 0; i < model.hparams.n_vocab; i++) { uint32_t len; fin.read((char *) &len, sizeof(len)); From e64e29db3709a7d5b1f85b2dc6102730b94ea956 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 19 Mar 2023 17:30:00 +0200 Subject: [PATCH 31/44] Change RMSNorm eps to 1e-6 (#173) I think this is what is used in the Python code --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index c4f8389171026..e1da0c737fa67 100644 --- a/ggml.c +++ b/ggml.c @@ -5556,7 +5556,7 @@ static void ggml_compute_forward_rms_norm_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - const ggml_float eps = 1e-5f; // TODO: make this a parameter + const ggml_float eps = 1e-6f; // TODO: make this a parameter // TODO: optimize for (int i03 = 0; i03 < ne03; i03++) { @@ -5572,7 +5572,7 @@ static void ggml_compute_forward_rms_norm_f32( mean /= ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - + memcpy(y, x, ne00 * sizeof(float)); // for (int i00 = 0; i00 < ne00; i00++) { // y[i00] = x[i00]; From b4ad8f831effa1cd3db70bc6b65c7c436f922c65 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 19:38:00 +0100 Subject: [PATCH 32/44] Fix off-by-one bug --- lama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lama.cpp b/lama.cpp index 3696ee0080f97..9d613c324d353 100644 --- a/lama.cpp +++ b/lama.cpp @@ -954,7 +954,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(embd_inp[input_consumed]); ++input_consumed; - if (embd.size() > params.n_batch) { + if ((int)embd.size() >= params.n_batch) { break; } } From 4fa8e4c4b1a66fdd2d183133d2b0a60bb7f1abc3 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 20:07:41 +0100 Subject: [PATCH 33/44] Allow to use f16memory Although with alpaca it does seem to have a huge impact on quality.
--- examples/main.go | 2 +- go/llama.go | 8 +++----- go/options.go | 35 +++++++++++++++++++++++++++++++++-- lama.cpp | 25 ++++++++++++++++++------- lama.h | 2 +- 5 files changed, 56 insertions(+), 16 deletions(-) diff --git a/examples/main.go b/examples/main.go index 71034b53fd81c..32d5453ffeabf 100644 --- a/examples/main.go +++ b/examples/main.go @@ -30,7 +30,7 @@ func main() { fmt.Printf("Parsing program arguments failed: %s", err) os.Exit(1) } - l, err := llama.New(model, 0) + l, err := llama.New(model, llama.EnableF16Memory) if err != nil { fmt.Println("Loading the model failed:", err.Error()) os.Exit(1) diff --git a/go/llama.go b/go/llama.go index ededf51e86631..291a5c3e827e5 100644 --- a/go/llama.go +++ b/go/llama.go @@ -13,13 +13,11 @@ type LLama struct { state unsafe.Pointer } -func New(model string, ctxSize int) (*LLama, error) { - if ctxSize == 0 { - ctxSize = 512 - } +func New(model string, opts ...ModelOption) (*LLama, error) { + mo := NewModelOptions(opts...) state := C.llama_allocate_state() modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(ctxSize)) + result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory)) if result != 0 { return nil, fmt.Errorf("failed loading model") } diff --git a/go/options.go b/go/options.go index a98982996177a..c5e397f83eae4 100644 --- a/go/options.go +++ b/go/options.go @@ -2,24 +2,55 @@ package llama import "runtime" +type ModelOptions struct { + ContextSize int + F16Memory bool +} + type PredictOptions struct { Seed, Threads, Tokens, TopK, Repeat int TopP, Temperature, Penalty float64 } type PredictOption func(p *PredictOptions) +type ModelOption func(p *ModelOptions) + +var DefaultModelOptions ModelOptions = ModelOptions{ + ContextSize: 512, + F16Memory: false, +} var DefaultOptions PredictOptions = PredictOptions{ Seed: -1, Threads: runtime.NumCPU(), Tokens: 128, TopK: 40, - TopP: 0.95, - Temperature: 0.80, + TopP: 0.90, + Temperature: 0.95, Penalty: 1.3, Repeat: 64, } +// SetContext sets the context size. +func SetContext(c int) ModelOption { + return func(p *ModelOptions) { + p.ContextSize = c + } +} + +var EnableF16Memory ModelOption = func(p *ModelOptions) { + p.F16Memory = true +} + +// Create a new PredictOptions object with the given options. +func NewModelOptions(opts ...ModelOption) ModelOptions { + p := DefaultModelOptions + for _, opt := range opts { + opt(&p) + } + return p +} + // SetSeed sets the random seed for sampling text generation. 
func SetSeed(seed int) PredictOption { return func(p *PredictOptions) { diff --git a/lama.cpp b/lama.cpp index 9d613c324d353..77263af53db05 100644 --- a/lama.cpp +++ b/lama.cpp @@ -96,7 +96,7 @@ struct llama_state { }; // load the model's weights from a file -bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { +bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory) { // fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); std::vector f_buf(1024*1024); @@ -219,8 +219,14 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2 ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3 - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + if (f16memory) { + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + + } else { + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + } ctx_size += (5 + 10*n_layer)*256; // object overhead @@ -306,8 +312,13 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab const int n_mem = n_layer*n_ctx; const int n_elements = n_embd*n_mem; - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + if (f16memory) { + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + } else { + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + } const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); @@ -824,14 +835,14 @@ int main(int argc, char ** argv) { */ -int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx) +int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory) // load the model { ggml_time_init(); llama_state* state = (llama_state*) state_pr; const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(model_path, state->model, state->vocab, n_ctx)) { + if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); return 1; } diff --git a/lama.h b/lama.h index fc708bce51031..60ee6e673e6e2 100644 --- a/lama.h +++ b/lama.h @@ -6,7 +6,7 @@ extern "C" { void *llama_allocate_state(); -int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx); +int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory); void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n); From 47719aabb652e34510afb997697b6cefda8b04f0 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 20:14:59 +0100 Subject: [PATCH 34/44] Add ignore EOS --- go/llama.go | 2 +- go/options.go | 5 +++++ lama.cpp | 13 +++++++++++-- lama.h | 2 +- utils.h | 2 ++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/go/llama.go b/go/llama.go 
index 291a5c3e827e5..af4ea3bc8f877 100644 --- a/go/llama.go +++ b/go/llama.go @@ -32,7 +32,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { input := C.CString(text) out := make([]byte, po.Tokens) params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), - C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat)) + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), C.bool(po.IgnoreEOS)) ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) if ret != 0 { return "", fmt.Errorf("inference failed") diff --git a/go/options.go b/go/options.go index c5e397f83eae4..c1be66f1cb477 100644 --- a/go/options.go +++ b/go/options.go @@ -10,6 +10,7 @@ type ModelOptions struct { type PredictOptions struct { Seed, Threads, Tokens, TopK, Repeat int TopP, Temperature, Penalty float64 + IgnoreEOS bool } type PredictOption func(p *PredictOptions) @@ -51,6 +52,10 @@ func NewModelOptions(opts ...ModelOption) ModelOptions { return p } +var IgnoreEOS PredictOption = func(p *PredictOptions) { + p.IgnoreEOS = true +} + // SetSeed sets the random seed for sampling text generation. func SetSeed(seed int) PredictOption { return func(p *PredictOptions) { diff --git a/lama.cpp b/lama.cpp index 77263af53db05..660ba11432494 100644 --- a/lama.cpp +++ b/lama.cpp @@ -27,6 +27,8 @@ #define ANSI_COLOR_RESET "\x1b[0m" #define ANSI_BOLD "\x1b[1m" +static const int EOS_TOKEN_ID = 2; + // determine number of model parts based on the dimension static const std::map LLAMA_N_PARTS = { { 4096, 1 }, @@ -942,6 +944,11 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { { const int64_t t_start_sample_us = ggml_time_us(); + if (params.ignore_eos) { + // set the logit of the eos token to zero to avoid sampling it + logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0; + } + id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); last_n_tokens.erase(last_n_tokens.begin()); @@ -980,7 +987,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { // end of text token - if (embd.back() == 2) { + if (embd.back() == EOS_TOKEN_ID) { // fprintf(stderr, " [end of text]\n"); // return 2; break; @@ -1018,7 +1025,7 @@ void* llama_allocate_state() { } void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, - float top_p, float temp, float repeat_penalty, int repeat_last_n) { + float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos) { gpt_params* params = new gpt_params; params->seed = seed; params->n_threads = threads; @@ -1031,6 +1038,8 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token params->repeat_penalty = repeat_penalty; params->prompt = prompt; + params->ignore_eos = ignore_eos; + return params; } diff --git a/lama.h b/lama.h index 60ee6e673e6e2..6a7b7fc60b36e 100644 --- a/lama.h +++ b/lama.h @@ -9,7 +9,7 @@ void *llama_allocate_state(); int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory); void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, - int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n); + int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos); void llama_free_params(void* params_ptr); int llama_predict(void* params_ptr, void* state_pr, char* 
result); diff --git a/utils.h b/utils.h index 021120b0513c7..946ab312ec792 100644 --- a/utils.h +++ b/utils.h @@ -35,6 +35,8 @@ struct gpt_params { bool interactive = false; // interactive mode bool interactive_start = false; // reverse prompt immediately std::string antiprompt = ""; // string upon seeing which more user input is prompted + + bool ignore_eos = false; // do not stop generating after eos }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); From 12f4f4a39b7a4d83e9c6d2f49ead3fe09b731cba Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 20:15:46 +0100 Subject: [PATCH 35/44] Disable f16 on examples --- examples/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main.go b/examples/main.go index 32d5453ffeabf..66d7386cd7df3 100644 --- a/examples/main.go +++ b/examples/main.go @@ -30,7 +30,7 @@ func main() { fmt.Printf("Parsing program arguments failed: %s", err) os.Exit(1) } - l, err := llama.New(model, llama.EnableF16Memory) + l, err := llama.New(model) if err != nil { fmt.Println("Loading the model failed:", err.Error()) os.Exit(1) From e3c6247a81690e9830cc7f56a31a576a989e5a15 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 23:39:06 +0100 Subject: [PATCH 36/44] Set better defaults --- go/options.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/options.go b/go/options.go index c1be66f1cb477..346fff0dd5f43 100644 --- a/go/options.go +++ b/go/options.go @@ -25,10 +25,10 @@ var DefaultOptions PredictOptions = PredictOptions{ Seed: -1, Threads: runtime.NumCPU(), Tokens: 128, - TopK: 40, + TopK: 10000, TopP: 0.90, - Temperature: 0.95, - Penalty: 1.3, + Temperature: 0.96, + Penalty: 1, Repeat: 64, } From 0076188dd5481a8ca86af992896fbaa58ecbe011 Mon Sep 17 00:00:00 2001 From: mudler Date: Sun, 19 Mar 2023 23:39:17 +0100 Subject: [PATCH 37/44] Update README with useful links --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index debad35532950..7968ce8f61841 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples/main.go -m ggml-alpaca-7 ## Model -For a tiny model, you can use https://github.com/antimatter15/alpaca.cpp . +For a tiny model, you can use https://github.com/antimatter15/alpaca.cpp . For how to use the prompt, check: https://github.com/tatsu-lab/stanford_alpaca ## License @@ -22,5 +22,6 @@ MIT ## Acknowledgements - [llama.cpp](https://github.com/ggerganov/llama.cpp) +- https://github.com/tatsu-lab/stanford_alpaca - https://github.com/cornelk/llama-go for the initial ideas - https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!) 
\ No newline at end of file From 4b6c39d812bb383aa057d71083891bfb380e0ab6 Mon Sep 17 00:00:00 2001 From: Mack Straight Date: Mon, 20 Mar 2023 03:17:23 -0700 Subject: [PATCH 38/44] sentencepiece bpe compatible tokenizer (#252) * potential out of bounds read * fix quantize * style * Update convert-pth-to-ggml.py * mild cleanup * don't need the space-prefixing here rn since main.cpp already does it * new file magic + version header field * readme notice * missing newlines Co-authored-by: slaren <2141330+slaren@users.noreply.github.com> --- Makefile | 2 +- lama.cpp | 21 ++++++- quantize.cpp | 24 +++++++- utils.cpp | 171 +++++++++++++++++++++++++++++++++++++++------------ utils.h | 3 +- 5 files changed, 176 insertions(+), 45 deletions(-) diff --git a/Makefile b/Makefile index cb9ccd644fd16..9095c586b5254 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ endif # CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC -CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC +CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC LDFLAGS = # OS specific diff --git a/lama.cpp b/lama.cpp index 660ba11432494..12417af0057fc 100644 --- a/lama.cpp +++ b/lama.cpp @@ -3,6 +3,7 @@ #include "utils.h" #include +#include #include #include #include @@ -114,10 +115,24 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); - if (magic != 0x67676d6c) { + if (magic == 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n", + __func__, fname.c_str()); + return false; + } + if (magic != 0x67676d66) { fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); return false; } + + uint32_t format_version; + fin.read((char *) &format_version, sizeof(format_version)); + + if (format_version != 1) { + fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n", + __func__, fname.c_str(), format_version); + return false; + } } int n_ff = 0; @@ -163,8 +178,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab word.resize(len); fin.read((char *) word.data(), len); + float score; + fin.read((char *) &score, sizeof(score)); + vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; + vocab.score[i] = score; //if (i < 30000) { // fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); diff --git a/quantize.cpp b/quantize.cpp index 9ff579eb9bafe..1ee8d2d9de862 100644 --- a/quantize.cpp +++ b/quantize.cpp @@ -3,6 +3,7 @@ #include "utils.h" #include +#include #include #include #include @@ -63,12 +64,28 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna { uint32_t magic; finp.read((char *) &magic, sizeof(magic)); - if (magic != 0x67676d6c) { + if (magic == 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n", + __func__, fname_inp.c_str()); + return false; + } + if (magic != 0x67676d66) { fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); return false; } fout.write((char *) &magic, sizeof(magic)); + + uint32_t format_version; + finp.read((char *) &format_version, sizeof(format_version)); + + if (format_version != 1) { + fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n", + __func__, fname_inp.c_str(), format_version); + return false; + } + + fout.write((char *) &format_version, sizeof(format_version)); } llama_hparams hparams; 
@@ -122,8 +139,13 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna finp.read ((char *) word.data(), len); fout.write((char *) word.data(), len); + float score; + finp.read ((char *) &score, sizeof(score)); + fout.write((char *) &score, sizeof(score)); + vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; + vocab.score[i] = score; } } diff --git a/utils.cpp b/utils.cpp index 70115d04ccf23..c4903f5ab310b 100644 --- a/utils.cpp +++ b/utils.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -275,58 +276,146 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri return tokens; } -// TODO: Calculate this constant from the vocabulary -#define MAX_TOKEN_LEN 18 -// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece -std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { - std::vector res; - std::vector score; - std::vector prev; - int len = text.length(); - - score.resize(len + 1); - prev.resize(len + 1); - - // Forward pass - for (int i = 0; i < len; i++) { - int max_len = std::min(len - i, MAX_TOKEN_LEN); - for (int sub_len = 1; sub_len <= max_len; sub_len++) { - auto sub = text.substr(i, sub_len); - auto token = vocab.token_to_id.find(sub); - if (token != vocab.token_to_id.end()) { - int token_score = sub.length() * sub.length(); - int local_score = score[i] + token_score; - int next = i + sub_len; - if (score[next] < local_score) { - score[next] = local_score; - prev[next] = (*token).second; +static size_t utf8_len(char src) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +struct llama_sp_symbol { + using index = int; + index prev; + index next; + std::string_view text; +}; + +struct llama_sp_bigram { + struct comparator { + bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) { + return (l.score < r.score) || (l.score == r.score && l.left > r.left); + } + }; + using queue_storage = std::vector; + using queue = std::priority_queue; + llama_sp_symbol::index left; + llama_sp_symbol::index right; + float score; + size_t size; +}; + +struct llama_tokenizer { + llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {} + + void tokenize(std::string_view text, std::vector & output) { + // split string into utf8 chars + int index = 0; + while (!text.empty()) { + llama_sp_symbol sym; + size_t char_len = std::min(text.size(), utf8_len(text.data()[0])); + sym.text = std::string_view(text.data(), char_len); + sym.prev = index - 1; + text.remove_prefix(char_len); + sym.next = text.empty() ? -1 : index + 1; + index++; + symbols_.emplace_back(std::move(sym)); + } + + // seed the work queue with all possible 2-character tokens. + for (size_t i = 1; i < symbols_.size(); ++i) { + try_add_bigram(i - 1, i); + } + + // keep substituting the highest frequency pairs for as long as we can. + while (!work_queue_.empty()) { + auto bigram = work_queue_.top(); + work_queue_.pop(); + + auto & left_sym = symbols_[bigram.left]; + auto & right_sym = symbols_[bigram.right]; + + // if one of the symbols already got merged, skip it. 
+ if (left_sym.text.empty() || right_sym.text.empty() || + left_sym.text.size() + right_sym.text.size() != bigram.size) { + continue; + } + + // merge the right sym into the left one + left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size()); + right_sym.text = std::string_view(""); + + // remove the right sym from the chain + left_sym.next = right_sym.next; + if (right_sym.next >= 0) { + symbols_[right_sym.next].prev = bigram.left; + } + + // find more substitutions + try_add_bigram(left_sym.prev, bigram.left); + try_add_bigram(bigram.left, left_sym.next); + } + + for (int i = 0; i != -1; i = symbols_[i].next) { + auto& symbol = symbols_[i]; + auto token = vocab_.token_to_id.find(std::string(symbol.text)); + + if (token == vocab_.token_to_id.end()) { + // output any symbols that did not form tokens as bytes. + for (int j = 0; j < symbol.text.size(); ++j) { + gpt_vocab::id token_id = static_cast(symbol.text[j]) + 3; + output.push_back(token_id); } + } else { + output.push_back((*token).second); } } } - // Backward pass - int i = len; - while (i > 0) { - gpt_vocab::id token_id = prev[i]; - if (token_id == 0) { - // TODO: Return error or something more meaningful - printf("failed to tokenize string!\n"); - break; +private: + void try_add_bigram(int left, int right) { + if (left == -1 || right == -1) { + return; + } + + std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size()); + auto token = vocab_.token_to_id.find(std::string(text)); + + if (token == vocab_.token_to_id.end()) { + return; } - res.push_back(token_id); - auto token = (*vocab.id_to_token.find(token_id)).second; - i -= token.length(); + + auto score = vocab_.score.find((*token).second); + + if (score == vocab_.score.end()) { + return; + } + + llama_sp_bigram bigram; + bigram.left = left; + bigram.right = right; + bigram.score = (*score).second; + bigram.size = text.size(); + work_queue_.push(bigram); } - if (bos) { - res.push_back(1); // TODO: replace with vocab.bos + const gpt_vocab & vocab_; + std::vector symbols_; + llama_sp_bigram::queue work_queue_; +}; + +std::vector llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) { + llama_tokenizer tokenizer(vocab); + std::vector output; + + if (text.size() == 0) { + return output; } - // Pieces are in reverse order so correct that - std::reverse(res.begin(), res.end()); + if (bos) { + output.push_back(1); + } - return res; + tokenizer.tokenize(text, output); + return output; } bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { diff --git a/utils.h b/utils.h index 946ab312ec792..7ff8a12a7c68e 100644 --- a/utils.h +++ b/utils.h @@ -55,6 +55,7 @@ struct gpt_vocab { std::map token_to_id; std::map id_to_token; + std::map score; }; void replace(std::string & str, const std::string & needle, const std::string & replacement); @@ -76,7 +77,7 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri // TODO: this is probably wrong, but I cannot figure out how this tokenizer works .. 
// ref: https://github.com/google/sentencepiece -std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos); +std::vector llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos); // load the tokens from encoder.json bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); From 7c2170ecfa55e4def9f5bb81470c9f26a2ccedd7 Mon Sep 17 00:00:00 2001 From: mudler Date: Tue, 21 Mar 2023 18:22:33 +0100 Subject: [PATCH 39/44] Enable loading 13b and 30b alpaca models --- go/llama.go | 2 +- go/options.go | 6 ++++++ lama.cpp | 21 ++++++++++++++++----- lama.h | 2 +- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/go/llama.go b/go/llama.go index af4ea3bc8f877..d815c936284c9 100644 --- a/go/llama.go +++ b/go/llama.go @@ -17,7 +17,7 @@ func New(model string, opts ...ModelOption) (*LLama, error) { mo := NewModelOptions(opts...) state := C.llama_allocate_state() modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory)) + result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory), C.bool(mo.Alpaca)) if result != 0 { return nil, fmt.Errorf("failed loading model") } diff --git a/go/options.go b/go/options.go index 346fff0dd5f43..fd646d4e829a6 100644 --- a/go/options.go +++ b/go/options.go @@ -5,6 +5,7 @@ import "runtime" type ModelOptions struct { ContextSize int F16Memory bool + Alpaca bool } type PredictOptions struct { @@ -19,6 +20,7 @@ type ModelOption func(p *ModelOptions) var DefaultModelOptions ModelOptions = ModelOptions{ ContextSize: 512, F16Memory: false, + Alpaca: false, } var DefaultOptions PredictOptions = PredictOptions{ @@ -43,6 +45,10 @@ var EnableF16Memory ModelOption = func(p *ModelOptions) { p.F16Memory = true } +var EnableAlpaca ModelOption = func(p *ModelOptions) { + p.Alpaca = true +} + // Create a new PredictOptions object with the given options. 
func NewModelOptions(opts ...ModelOption) ModelOptions { p := DefaultModelOptions diff --git a/lama.cpp b/lama.cpp index 12417af0057fc..071b100207e6c 100644 --- a/lama.cpp +++ b/lama.cpp @@ -38,6 +38,14 @@ static const std::map LLAMA_N_PARTS = { { 8192, 8 }, }; +// determine number of model parts based on the dimension +static const std::map ALPACA_N_PARTS = { + { 4096, 1 }, + { 5120, 1 }, + { 6656, 1 }, + { 8192, 1 }, +}; + // default hparams (LLaMA 7B) struct llama_hparams { int32_t n_vocab = 32000; @@ -99,7 +107,7 @@ struct llama_state { }; // load the model's weights from a file -bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory) { +bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory, bool alpaca) { // fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); std::vector f_buf(1024*1024); @@ -154,7 +162,11 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab hparams.n_ctx = n_ctx; n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - n_parts = LLAMA_N_PARTS.at(hparams.n_embd); + if (alpaca) { + n_parts = ALPACA_N_PARTS.at(hparams.n_embd); + } else { + n_parts = LLAMA_N_PARTS.at(hparams.n_embd); + } /* fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); @@ -856,14 +868,14 @@ int main(int argc, char ** argv) { */ -int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory) +int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory, bool alpaca) // load the model { ggml_time_init(); llama_state* state = (llama_state*) state_pr; const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory)) { + if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory, alpaca)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); return 1; } @@ -928,7 +940,6 @@ int llama_predict(void* params_ptr, void* state_pr, char* result) { bool input_noecho = false; std::string res = ""; - while (true) { if (params.n_predict != 0 && remaining_tokens <= 0) { break; diff --git a/lama.h b/lama.h index 6a7b7fc60b36e..434a7367945d3 100644 --- a/lama.h +++ b/lama.h @@ -6,7 +6,7 @@ extern "C" { void *llama_allocate_state(); -int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory); +int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory, bool alpaca); void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos); From 7be5326e18ccef816d5cc4486a19653e922c4bc9 Mon Sep 17 00:00:00 2001 From: mudler Date: Tue, 21 Mar 2023 18:22:41 +0100 Subject: [PATCH 40/44] Stab a high number of tokens when 0 is supplied --- go/llama.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/go/llama.go b/go/llama.go index d815c936284c9..217e11421774c 100644 --- a/go/llama.go +++ b/go/llama.go @@ -30,6 +30,9 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { po := NewPredictOptions(opts...) 
input := C.CString(text) + if po.Tokens == 0 { + po.Tokens = 99999999 + } out := make([]byte, po.Tokens) params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), C.bool(po.IgnoreEOS)) From 623e9d24922528cd20b78105dd117f4553ab36ad Mon Sep 17 00:00:00 2001 From: Casey Primozic Date: Tue, 21 Mar 2023 07:35:42 -0700 Subject: [PATCH 41/44] Add initial AVX512 support for dot product on Linux (#320) * Update Makefile to detect AVX512 support and add compiler flags if it's available * Based on existing AVX2 implementation, dot product on one 32-value block of 4-bit quantized ints at a time * Perform 8 bit -> 16 bit sign extension and multiply+add on 32 values at time instead of 16 * Use built-in AVX512 horizontal reduce add to get sum at the end * Manual unrolling on inner dot product loop to reduce loop counter overhead --- Makefile | 32 +++++++++++++++++++++++ ggml.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 109 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 9095c586b5254..1cae8bda9d65d 100644 --- a/Makefile +++ b/Makefile @@ -95,6 +95,38 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) ifneq (,$(findstring sse3,$(SSE3_M))) CFLAGS += -msse3 endif + AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo) + ifneq (,$(findstring avx512f,$(AVX512F_M))) + CFLAGS += -mavx512f + endif + AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo) + ifneq (,$(findstring avx512bw,$(AVX512BW_M))) + CFLAGS += -mavx512bw + endif + AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo) + ifneq (,$(findstring avx512dq,$(AVX512DQ_M))) + CFLAGS += -mavx512dq + endif + AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo) + ifneq (,$(findstring avx512vl,$(AVX512VL_M))) + CFLAGS += -mavx512vl + endif + AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo) + ifneq (,$(findstring avx512cd,$(AVX512CD_M))) + CFLAGS += -mavx512cd + endif + AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo) + ifneq (,$(findstring avx512er,$(AVX512ER_M))) + CFLAGS += -mavx512er + endif + AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo) + ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M))) + CFLAGS += -mavx512ifma + endif + AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo) + ifneq (,$(findstring avx512pf,$(AVX512PF_M))) + CFLAGS += -mavx512pf + endif else ifeq ($(UNAME_S),Haiku) AVX1_M := $(shell sysinfo -cpu | grep "AVX ") ifneq (,$(findstring avx,$(AVX1_M))) diff --git a/ggml.c b/ggml.c index e1da0c737fa67..f5f14c0ebd0da 100644 --- a/ggml.c +++ b/ggml.c @@ -361,7 +361,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // AVX routines provided by GH user Const-me // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600 -#if __AVX2__ +#if __AVX2__ || __AVX512F__ // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval static inline __m256i bytesFromNibbles( const uint8_t* rsi ) @@ -397,7 +397,6 @@ static inline __m128i packNibbles( __m256i bytes ) } #endif - // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) @@ -1262,6 +1261,47 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } +#if __AVX512F__ && QK == 32 +static inline __m512 dot_q4_0_oneblock_avx512( + __m512 acc, + const uint8_t * pd0, + const uint8_t * pd1, + const uint8_t * pb0, + const uint8_t * pb1, + size_t bs, + int i +) { + const float * d0_0 = (const float *) (pd0 + i*bs); + const float * d1_0 = (const float *) (pd1 + i*bs); + + const uint8_t * restrict p0 = pb0 + (i+0)*bs; + const uint8_t * restrict p1 = pb1 + (i+0)*bs; + + // Compute combined scale for the block + float scaleScalar = d0_0[0] * d1_0[0]; + __m512 scale = _mm512_set1_ps( scaleScalar ); + + __m256i bx = bytesFromNibbles( p0 ); + __m256i by = bytesFromNibbles( p1 ); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + by = _mm256_sub_epi8( by, off ); + + // Sign-extend 16 signed bytes into int16_t + __m512i x32 = _mm512_cvtepi8_epi16( bx ); + __m512i y32 = _mm512_cvtepi8_epi16( by ); + // Compute products of int16_t integers, add pairwise + __m512i i64 = _mm512_madd_epi16( x32, y32 ); + + // Convert int32_t to float + __m512 p = _mm512_cvtepi32_ps( i64 ); + // Apply the scale, and accumulate + return _mm512_fmadd_ps( scale, p, acc ); +} +#endif + inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { ggml_float sumf = 0.0; @@ -1417,6 +1457,40 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void #else #error "not implemented for QK" #endif +#elif defined(__AVX512F__) + +#if QK == 32 + // Initialize accumulator with zeros + __m512 acc0 = _mm512_setzero_ps(); + __m512 acc1 = _mm512_setzero_ps(); + + const int superblock_size = 8; + const int superblock_count = nb / superblock_size; + const int remainder = nb % superblock_size; + + for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) { + int i = superblock_ix * superblock_size; + + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+0 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+1 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+2 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+3 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+4 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+5 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+6 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+7 ); + } + + // Remainders + for (int i = superblock_count * superblock_size; i < nb; ++i) { + acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i ); + } + + // Horizontal sum of all lanes of the accumulator + sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 ); +#else +#error "not implemented for QK" +#endif #elif defined(__AVX2__) #if QK == 32 const size_t countBlocks = nb; @@ -1928,7 +2002,7 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res const size_t bs = 2*sizeof(float) + QK/2; const uint8_t * restrict pd = ((const uint8_t *)x + 
0*bs); - const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { From 7e0ecbd19c759bf9cc59b86f1ced893f7271a0b8 Mon Sep 17 00:00:00 2001 From: mudler Date: Thu, 23 Mar 2023 21:52:05 +0100 Subject: [PATCH 42/44] Revert "Add initial AVX512 support for dot product on Linux" This reverts commit 623e9d24922528cd20b78105dd117f4553ab36ad. --- Makefile | 32 ----------------------- ggml.c | 80 +++----------------------------------------------------- 2 files changed, 3 insertions(+), 109 deletions(-) diff --git a/Makefile b/Makefile index 1cae8bda9d65d..9095c586b5254 100644 --- a/Makefile +++ b/Makefile @@ -95,38 +95,6 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) ifneq (,$(findstring sse3,$(SSE3_M))) CFLAGS += -msse3 endif - AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo) - ifneq (,$(findstring avx512f,$(AVX512F_M))) - CFLAGS += -mavx512f - endif - AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo) - ifneq (,$(findstring avx512bw,$(AVX512BW_M))) - CFLAGS += -mavx512bw - endif - AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo) - ifneq (,$(findstring avx512dq,$(AVX512DQ_M))) - CFLAGS += -mavx512dq - endif - AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo) - ifneq (,$(findstring avx512vl,$(AVX512VL_M))) - CFLAGS += -mavx512vl - endif - AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo) - ifneq (,$(findstring avx512cd,$(AVX512CD_M))) - CFLAGS += -mavx512cd - endif - AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo) - ifneq (,$(findstring avx512er,$(AVX512ER_M))) - CFLAGS += -mavx512er - endif - AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo) - ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M))) - CFLAGS += -mavx512ifma - endif - AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo) - ifneq (,$(findstring avx512pf,$(AVX512PF_M))) - CFLAGS += -mavx512pf - endif else ifeq ($(UNAME_S),Haiku) AVX1_M := $(shell sysinfo -cpu | grep "AVX ") ifneq (,$(findstring avx,$(AVX1_M))) diff --git a/ggml.c b/ggml.c index f5f14c0ebd0da..e1da0c737fa67 100644 --- a/ggml.c +++ b/ggml.c @@ -361,7 +361,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // AVX routines provided by GH user Const-me // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600 -#if __AVX2__ || __AVX512F__ +#if __AVX2__ // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval static inline __m256i bytesFromNibbles( const uint8_t* rsi ) @@ -397,6 +397,7 @@ static inline __m128i packNibbles( __m256i bytes ) } #endif + // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) @@ -1261,47 +1262,6 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } -#if __AVX512F__ && QK == 32 -static inline __m512 dot_q4_0_oneblock_avx512( - __m512 acc, - const uint8_t * pd0, - const uint8_t * pd1, - const uint8_t * pb0, - const uint8_t * pb1, - size_t bs, - int i -) { - const float * d0_0 = (const float *) (pd0 + i*bs); - const float * d1_0 = (const float *) (pd1 + i*bs); - - const uint8_t * restrict p0 = pb0 + (i+0)*bs; - const uint8_t * restrict p1 = pb1 + (i+0)*bs; - - // Compute combined scale for the block - float scaleScalar = d0_0[0] * d1_0[0]; - __m512 scale = _mm512_set1_ps( scaleScalar ); - - __m256i bx = bytesFromNibbles( p0 ); - __m256i by = bytesFromNibbles( p1 ); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - by = _mm256_sub_epi8( by, off ); - - // Sign-extend 16 signed bytes into int16_t - __m512i x32 = _mm512_cvtepi8_epi16( bx ); - __m512i y32 = _mm512_cvtepi8_epi16( by ); - // Compute products of int16_t integers, add pairwise - __m512i i64 = _mm512_madd_epi16( x32, y32 ); - - // Convert int32_t to float - __m512 p = _mm512_cvtepi32_ps( i64 ); - // Apply the scale, and accumulate - return _mm512_fmadd_ps( scale, p, acc ); -} -#endif - inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { ggml_float sumf = 0.0; @@ -1457,40 +1417,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void #else #error "not implemented for QK" #endif -#elif defined(__AVX512F__) - -#if QK == 32 - // Initialize accumulator with zeros - __m512 acc0 = _mm512_setzero_ps(); - __m512 acc1 = _mm512_setzero_ps(); - - const int superblock_size = 8; - const int superblock_count = nb / superblock_size; - const int remainder = nb % superblock_size; - - for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) { - int i = superblock_ix * superblock_size; - - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+0 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+1 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+2 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+3 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+4 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+5 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+6 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+7 ); - } - - // Remainders - for (int i = superblock_count * superblock_size; i < nb; ++i) { - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i ); - } - - // Horizontal sum of all lanes of the accumulator - sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 ); -#else -#error "not implemented for QK" -#endif #elif defined(__AVX2__) #if QK == 32 const size_t countBlocks = nb; @@ -2002,7 +1928,7 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res const size_t bs = 2*sizeof(float) + QK/2; const uint8_t * restrict pd = ((const uint8_t *)x + 
0*bs); - const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { From a3563a2690baef395edb7ee41a96acf5376862d3 Mon Sep 17 00:00:00 2001 From: mudler Date: Sat, 25 Mar 2023 23:37:42 +0100 Subject: [PATCH 43/44] Trim newline returned by model --- go/llama.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/go/llama.go b/go/llama.go index 217e11421774c..151bbe94047d6 100644 --- a/go/llama.go +++ b/go/llama.go @@ -42,7 +42,9 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { } res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) - res = strings.TrimPrefix(res, " "+text) + res = strings.TrimPrefix(res, " ") + res = strings.TrimPrefix(res, text) + res = strings.TrimPrefix(res, "\n") C.llama_free_params(params) From 84efc8db364743915a17f52fd22e2afb4c2e948d Mon Sep 17 00:00:00 2001 From: mudler Date: Wed, 29 Mar 2023 18:52:01 +0200 Subject: [PATCH 44/44] Add compatibility to gpt4all models --- go/llama.go | 3 ++- go/options.go | 6 ++++++ lama.cpp | 21 ++++++++++++++++----- lama.h | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/go/llama.go b/go/llama.go index 151bbe94047d6..51f336dc34c75 100644 --- a/go/llama.go +++ b/go/llama.go @@ -17,7 +17,7 @@ func New(model string, opts ...ModelOption) (*LLama, error) { mo := NewModelOptions(opts...) state := C.llama_allocate_state() modelPath := C.CString(model) - result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory), C.bool(mo.Alpaca)) + result := C.llama_bootstrap(modelPath, state, C.int(mo.ContextSize), C.bool(mo.F16Memory), C.bool(mo.Alpaca), C.bool(mo.GPT4all)) if result != 0 { return nil, fmt.Errorf("failed loading model") } @@ -34,6 +34,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { po.Tokens = 99999999 } out := make([]byte, po.Tokens) + params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), C.bool(po.IgnoreEOS)) ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0]))) diff --git a/go/options.go b/go/options.go index fd646d4e829a6..3bcadfbf19d92 100644 --- a/go/options.go +++ b/go/options.go @@ -6,6 +6,7 @@ type ModelOptions struct { ContextSize int F16Memory bool Alpaca bool + GPT4all bool } type PredictOptions struct { @@ -21,6 +22,7 @@ var DefaultModelOptions ModelOptions = ModelOptions{ ContextSize: 512, F16Memory: false, Alpaca: false, + GPT4all: false, } var DefaultOptions PredictOptions = PredictOptions{ @@ -49,6 +51,10 @@ var EnableAlpaca ModelOption = func(p *ModelOptions) { p.Alpaca = true } +var EnableGPT4All ModelOption = func(p *ModelOptions) { + p.GPT4all = true +} + // Create a new PredictOptions object with the given options. 
func NewModelOptions(opts ...ModelOption) ModelOptions { p := DefaultModelOptions diff --git a/lama.cpp b/lama.cpp index 071b100207e6c..500fd7f0a6e43 100644 --- a/lama.cpp +++ b/lama.cpp @@ -107,7 +107,7 @@ struct llama_state { }; // load the model's weights from a file -bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory, bool alpaca) { +bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, bool f16memory, bool alpaca, bool gpt4all) { // fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); std::vector f_buf(1024*1024); @@ -149,7 +149,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab // load hparams { auto & hparams = model.hparams; - + if (gpt4all) { + model.hparams.n_vocab++; + } fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); @@ -183,7 +185,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab // load vocab { std::string word; - for (int i = 0; i < model.hparams.n_vocab; i++) { + int n_vocab = model.hparams.n_vocab; + if (gpt4all) { + n_vocab = n_vocab - 1; + } + + for (int i = 0; i < n_vocab; i++) { uint32_t len; fin.read((char *) &len, sizeof(len)); @@ -197,6 +204,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab vocab.id_to_token[i] = word; vocab.score[i] = score; + if (gpt4all) { + vocab.token_to_id[""] = n_vocab - 1; + vocab.id_to_token[ n_vocab - 1] = ""; + } //if (i < 30000) { // fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); //} @@ -868,14 +879,14 @@ int main(int argc, char ** argv) { */ -int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory, bool alpaca) +int llama_bootstrap(const char *model_path, void* state_pr, int32_t n_ctx, bool f16memory, bool alpaca, bool gpt4all) // load the model { ggml_time_init(); llama_state* state = (llama_state*) state_pr; const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory, alpaca)) { + if (!llama_model_load(model_path, state->model, state->vocab, n_ctx, f16memory, alpaca, gpt4all)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, model_path); return 1; } diff --git a/lama.h b/lama.h index 434a7367945d3..5c9847f299548 100644 --- a/lama.h +++ b/lama.h @@ -6,7 +6,7 @@ extern "C" { void *llama_allocate_state(); -int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory, bool alpaca); +int llama_bootstrap(const char *model_path, void *state_pr, int n_ctx, bool f16memory, bool alpaca, bool gpt4all); void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos);
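Taken together, the Go surface at the end of this series can be exercised roughly as follows. This is an illustrative sketch rather than code taken from any patch; the import path and model path are placeholders, while New, Predict, and the option values come from go/llama.go and go/options.go above:

    package main

    import (
        "fmt"
        "os"

        llama "example.com/llama/go" // placeholder: use the module's real import path
    )

    func main() {
        // Load a gpt4all-style checkpoint; EnableAlpaca or EnableF16Memory can be
        // passed instead, depending on the model file being loaded.
        l, err := llama.New("model.bin", llama.EnableGPT4All, llama.SetContext(512))
        if err != nil {
            fmt.Println("Loading the model failed:", err.Error())
            os.Exit(1)
        }

        // IgnoreEOS keeps generating past the end-of-text token,
        // SetSeed fixes the sampling seed.
        out, err := l.Predict("Tell me a joke.", llama.SetSeed(42), llama.IgnoreEOS)
        if err != nil {
            fmt.Println("Prediction failed:", err.Error())
            os.Exit(1)
        }
        fmt.Println(out)
    }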