Merged
Changes from 1 commit
53 commits
8498429
`main`/`server`: rename to `llama` / `llama-server` for consistency w…
Jun 6, 2024
f298cc6
server: update refs -> llama-server
Jun 6, 2024
f5f19a2
server: simplify nix package
Jun 6, 2024
8b7c734
main: update refs -> llama
Jun 6, 2024
9a03341
main/server: fix targets
Jun 6, 2024
8695bae
update more names
Jun 6, 2024
a0a7f2b
Update build.yml
Jun 6, 2024
fbd8313
Merge remote-tracking branch 'origin/master' into bins
Jun 6, 2024
99df4cc
rm accidentally checked in bins
Jun 7, 2024
7fbe600
update straggling refs
Jun 7, 2024
af8f016
Update .gitignore
Jun 7, 2024
0dba582
Update server-llm.sh
Jun 7, 2024
fe93cc9
Merge remote-tracking branch 'origin/master' into bins
Jun 8, 2024
23d0df5
main: target name -> llama-cli
Jun 8, 2024
ab5efbb
Prefix all example bins w/ llama-
Jun 8, 2024
78bca8c
fix main refs
Jun 8, 2024
10650b6
rename {main->llama}-cmake-pkg binary
Jun 8, 2024
81222f0
prefix more cmake targets w/ llama-
Jun 8, 2024
b648243
add/fix gbnf-validator subfolder to cmake
Jun 8, 2024
eef922e
sort cmake example subdirs
Jun 8, 2024
b0eb3b8
rm bin files
Jun 8, 2024
efaa441
fix llama-lookup-* Makefile rules
Jun 8, 2024
78eae7f
gitignore /llama-*
Jun 8, 2024
347f308
rename Dockerfiles
Jun 8, 2024
5265c15
rename llama|main -> llama-cli; consistent RPM bin prefixes
Jun 10, 2024
daeaeb1
Merge remote-tracking branch 'origin/master' into bins
Jun 10, 2024
0bb2a3f
fix some missing -cli suffixes
Jun 10, 2024
0fcf2c3
rename dockerfile w/ llama-cli
Jun 10, 2024
1cc6514
rename(make): llama-baby-llama
Jun 10, 2024
051633e
update dockerfile refs
Jun 10, 2024
b8cb44e
more llama-cli(.exe)
Jun 10, 2024
4881a94
fix test-eval-callback
Jun 10, 2024
b843639
rename: llama-cli-cmake-pkg(.exe)
Jun 10, 2024
f9cfd04
address gbnf-validator unused fread warning (switched to C++ / ifstream)
Jun 10, 2024
0be5f39
add two missing llama- prefixes
Jun 10, 2024
e7e0373
Updating docs for eval-callback binary to use new `llama-` prefix.
HanClinto Jun 10, 2024
2fd66b2
Updating a few lingering doc references for rename of main to llama-cli
HanClinto Jun 10, 2024
72660c3
Updating `run-with-preset.py` to use new binary names.
HanClinto Jun 10, 2024
70de0de
Updating documentation references for lookup-merge and export-lora
HanClinto Jun 10, 2024
82df7f9
Merge pull request #1 from HanClinto/bins-rename-nits
ochafik Jun 10, 2024
1f5ec2c
Updating two small `main` references missed earlier in the finetune d…
HanClinto Jun 10, 2024
8cf8c12
Update apps.nix
Jun 10, 2024
2a9c4cd
Merge remote-tracking branch 'origin/master' into bins
Jun 11, 2024
166397f
update grammar/README.md w/ new llama-* names
Jun 11, 2024
ee3a086
Merge pull request #2 from HanClinto/bins-nits-2
ochafik Jun 11, 2024
e474ef1
update llama-rpc-server bin name + doc
Jun 11, 2024
be66f9e
Revert "update llama-rpc-server bin name + doc"
Jun 12, 2024
ceb2859
Merge remote-tracking branch 'origin/master' into bins
Jun 12, 2024
08da184
add hot topic notice to README.md
Jun 12, 2024
ecdde74
Update README.md
ochafik Jun 12, 2024
1910241
Update README.md
ochafik Jun 12, 2024
48e5009
rename gguf-split & quantize bins refs in **/tests.sh
Jun 12, 2024
73d4a4a
Merge branch 'bins' of https://github.com/ochafik/llama.cpp into bins
Jun 12, 2024
server: update refs -> llama-server
gitignore llama-server
Olivier Chafik committed Jun 6, 2024
commit f298cc63d2cec6cfa72446b8e7f4ec5448f3fd54
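In practical terms, this commit renames the build target and the produced binary together, so both the `make` invocation and the command line change. A minimal before/after sketch based on the README changes below (the model path is illustrative):

```bash
# Before this commit:
make server
./server -m models/7B/ggml-model.gguf -c 2048

# After this commit:
make llama-server
./llama-server -m models/7B/ggml-model.gguf -c 2048
```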
6 changes: 3 additions & 3 deletions .devops/server-cuda.Dockerfile
@@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

RUN make -j$(nproc) server
RUN make -j$(nproc) llama-server

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1

COPY --from=build /app/server /server
COPY --from=build /app/llama-server /llama-server

ENTRYPOINT [ "/server" ]
ENTRYPOINT [ "/llama-server" ]
4 changes: 2 additions & 2 deletions .devops/server-intel.Dockerfile
@@ -38,8 +38,8 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev

COPY --from=build /app/build/bin/server /server
COPY --from=build /app/build/bin/llama-server /llama-server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
ENTRYPOINT [ "/llama-server" ]
4 changes: 2 additions & 2 deletions .devops/server-rocm.Dockerfile
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev

RUN make -j$(nproc)
RUN make -j$(nproc) llama-server

ENTRYPOINT [ "/app/server" ]
ENTRYPOINT [ "/app/llama-server" ]
4 changes: 2 additions & 2 deletions .devops/server-vulkan.Dockerfile
@@ -23,9 +23,9 @@ RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \

# Clean up
WORKDIR /
RUN cp /app/build/bin/server /server && \
RUN cp /app/build/bin/llama-server /llama-server && \
rm -rf /app

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
ENTRYPOINT [ "/llama-server" ]
6 changes: 3 additions & 3 deletions .devops/server.Dockerfile
@@ -11,15 +11,15 @@ COPY . .

ENV LLAMA_CURL=1

RUN make -j$(nproc) server
RUN make -j$(nproc) llama-server

FROM ubuntu:$UBUNTU_VERSION as runtime

RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1

COPY --from=build /app/server /server
COPY --from=build /app/llama-server /llama-server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
ENTRYPOINT [ "/llama-server" ]
2 changes: 1 addition & 1 deletion .devops/tools.sh
@@ -26,7 +26,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
fi
done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./server "$@"
./llama-server "$@"
else
echo "Unknown command: $arg1"
echo "Available commands: "
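`tools.sh` dispatches on its first argument, so its `--server` / `-s` mode now forwards the remaining arguments to the renamed binary. A small sketch of what that dispatch amounts to (paths are illustrative):

```bash
# Invoking the helper in server mode...
./.devops/tools.sh --server -m models/7B/ggml-model.gguf -c 2048

# ...is now equivalent to calling the renamed binary directly:
./llama-server -m models/7B/ggml-model.gguf -c 2048
```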
2 changes: 1 addition & 1 deletion .gitignore
@@ -76,7 +76,7 @@ models-mnt
/quantize-stats
/result
/save-load-state
/server
/llama-server
/simple
/batched
/batched-bench
2 changes: 1 addition & 1 deletion examples/json-schema-pydantic-example.py
@@ -1,5 +1,5 @@
# Usage:
#! ./server -m some-model.gguf &
#! ./llama-server -m some-model.gguf &
#! pip install pydantic
#! python json-schema-pydantic-example.py

2 changes: 1 addition & 1 deletion examples/server-llama2-13B.sh
@@ -16,7 +16,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"


# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./server $GEN_OPTIONS \
./llama-server $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--rope-freq-scale 1.0 \
22 changes: 11 additions & 11 deletions examples/server/README.md
@@ -80,41 +80,41 @@ The project is under active development, and we are [looking for feedback and co

## Build

`server` is built alongside everything else from the root of the project
`llama-server` is built alongside everything else from the root of the project

- Using `make`:

```bash
make server
make llama-server
```

- Using `CMake`:

```bash
cmake -B build
cmake --build build --config Release -t server
cmake --build build --config Release -t llama-server
```

Binary is at `./build/bin/server`
Binary is at `./build/bin/llama-server`

## Build with SSL

`server` can also be built with SSL support using OpenSSL 3
`llama-server` can also be built with SSL support using OpenSSL 3

- Using `make`:

```bash
# NOTE: For non-system openssl, use the following:
# CXXFLAGS="-I /path/to/openssl/include"
# LDFLAGS="-L /path/to/openssl/lib"
make LLAMA_SERVER_SSL=true server
make LLAMA_SERVER_SSL=true llama-server
```

- Using `CMake`:

```bash
cmake -B build -DLLAMA_SERVER_SSL=ON
cmake --build build --config Release -t server
cmake --build build --config Release -t llama-server
```

## Quick Start
@@ -124,13 +124,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.)

```bash
./server -m models/7B/ggml-model.gguf -c 2048
./llama-server -m models/7B/ggml-model.gguf -c 2048
```

### Windows

```powershell
server.exe -m models\7B\ggml-model.gguf -c 2048
llama-server.exe -m models\7B\ggml-model.gguf -c 2048
```

The above command will start a server that by default listens on `127.0.0.1:8080`.
@@ -629,11 +629,11 @@ bash chat.sh

### OAI-like API

The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi

### API errors

`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi

Example of an error:

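Since the README above notes that `llama-server` exposes an OAI-like API, a quick way to smoke-test the renamed binary is a chat-completions request; the endpoint and payload below follow the OpenAI convention the README references and are meant as a sketch, not an exhaustive schema (model path illustrative):

```bash
# Start the renamed server, then query its OpenAI-compatible endpoint
./llama-server -m models/7B/ggml-model.gguf -c 2048 &

curl http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "Say hello in one word."}], "temperature": 0.7}'
```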
2 changes: 1 addition & 1 deletion examples/server/bench/README.md
@@ -99,7 +99,7 @@ The `bench.py` script does several steps:
It aims to be used in the CI, but you can run it manually:

```shell
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
--runner-label local \
--name local \
--branch `git rev-parse --abbrev-ref HEAD` \
2 changes: 1 addition & 1 deletion examples/server/bench/bench.py
@@ -245,7 +245,7 @@ def start_server(args):

def start_server_background(args):
# Start the server
server_path = '../../../build/bin/server'
server_path = '../../../build/bin/llama-server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_args = [
4 changes: 2 additions & 2 deletions examples/server/public_simplechat/readme.md
@@ -44,12 +44,12 @@ http module.

### running using examples/server

bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]

### running using python3's server module

first run examples/server
* bin/server -m path/model.gguf
* ./llama-server -m path/model.gguf

next run this web front end in examples/server/public_simplechat
* cd ../examples/server/public_simplechat
2 changes: 1 addition & 1 deletion examples/server/tests/README.md
@@ -40,7 +40,7 @@ It's possible to override some scenario steps values with environment variables:
| variable | description |
|--------------------------|------------------------------------------------------------------------------------------------|
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
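For the server test suite, only the default binary path changes; runs that set `LLAMA_SERVER_BIN_PATH` explicitly just need the new name. A sketch of overriding it (the `tests.sh` entry point and build directory are assumed from the surrounding repository layout):

```bash
cd examples/server/tests
# Point the harness at the renamed binary and enable verbose steps
LLAMA_SERVER_BIN_PATH=../../../build/bin/llama-server DEBUG=ON ./tests.sh
```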
4 changes: 2 additions & 2 deletions examples/server/tests/features/steps/steps.py
@@ -1272,9 +1272,9 @@ def context_text(context):

def start_server_background(context):
if os.name == 'nt':
context.server_path = '../../../build/bin/Release/server.exe'
context.server_path = '../../../build/bin/Release/llama-server.exe'
else:
context.server_path = '../../../build/bin/server'
context.server_path = '../../../build/bin/llama-server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_listen_addr = context.server_fqdn
2 changes: 1 addition & 1 deletion grammars/README.md
@@ -1,6 +1,6 @@
# GBNF Guide

GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.
GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/llama-server`.

## Background

Expand Down