diff --git a/llama.cpp.patches/patches/main_main.cpp.patch b/llama.cpp.patches/patches/main_main.cpp.patch
index c6d7126405..7c1bdbbe81 100644
--- a/llama.cpp.patches/patches/main_main.cpp.patch
+++ b/llama.cpp.patches/patches/main_main.cpp.patch
@@ -107,7 +107,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
-@@ -128,7 +146,91 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg>
[... content lost in extraction: the remainder of this hunk, plus the diff header and opening lines of the --url-prefix flag handler ...]
+            while (url_prefix.length() > 1 && url_prefix.back() == '/') {
+                url_prefix.pop_back();
+            }
+
+            // If only a single slash remains, convert to empty string
+            if (url_prefix == "/") {
+                url_prefix = "";
+            }
+
+            // Validate the normalized path
+            if (!url_prefix.empty() && !IsAcceptablePath(url_prefix.c_str(), url_prefix.length())) {
+                tinyprint(2, "error: --url-prefix must not have /. or /./ or /../ after normalization\n", NULL);
+                exit(1);
+            }
+
+            // Store in static storage (persists for program lifetime)
+            static std::string stored_prefix = url_prefix;
+            FLAG_url_prefix = stored_prefix.c_str();
             continue;
         }
diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp
index e142a5a219..23e8ba4a83 100644
--- a/llamafile/server/client.cpp
+++ b/llamafile/server/client.cpp
@@ -522,12 +522,30 @@ Client::send_response_finish()
 bool
 Client::send_binary(const void* p, size_t n)
 {
-    ssize_t sent;
-    if ((sent = write(fd_, p, n)) != n) {
-        if (sent == -1 && errno != EAGAIN && errno != ECONNRESET)
-            SLOG("write failed %m");
-        close_connection_ = true;
-        return false;
+    size_t total_sent = 0;
+    const char* ptr = (const char*)p;
+
+    while (total_sent < n) {
+        ssize_t sent = write(fd_, ptr + total_sent, n - total_sent);
+
+        if (sent > 0) {
+            total_sent += sent;
+        } else if (sent == 0) {
+            // Connection closed
+            close_connection_ = true;
+            return false;
+        } else {
+            // Error occurred
+            if (errno == EINTR) {
+                // Interrupted by signal, retry
+                continue;
+            }
+            if (errno != EAGAIN && errno != ECONNRESET) {
+                SLOG("write failed %m");
+            }
+            close_connection_ = true;
+            return false;
+        }
     }
     return true;
 }
@@ -775,7 +793,7 @@ Client::dispatcher()
     should_send_error_if_canceled_ = false;
     if (!send(std::string_view(obuf_.p, p - obuf_.p)))
         return false;
-    char buf[512];
+    char buf[16384];
     size_t i, chunk;
     for (i = 0; i < size; i += chunk) {
         chunk = size - i;
diff --git a/llamafile/server/worker.cpp b/llamafile/server/worker.cpp
index a016c62218..7c4d9c0311 100644
--- a/llamafile/server/worker.cpp
+++ b/llamafile/server/worker.cpp
@@ -56,13 +56,9 @@ Worker::begin()
     tokens = tokenbucket_acquire(client_.client_ip_);
     server_->lock();
     dll_remove(&server_->idle_workers, &elem_);
-    if (dll_is_empty(server_->idle_workers)) {
-        Dll* slowbro;
-        if ((slowbro = dll_last(server_->active_workers))) {
-            SLOG("all threads active! dropping oldest client");
-            WORKER(slowbro)->kill();
-        }
-    }
+    // Remove aggressive client cancellation - let TCP backlog handle overflow
+    // The kernel's listen backlog will naturally queue incoming connections
+    // until a worker becomes available, providing better user experience
     working_ = true;
    if (tokens > FLAG_token_burst) {
         dll_make_last(&server_->active_workers, &elem_);
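
The --url-prefix normalization above reduces to a small pure function. A minimal sketch, assuming the handler behaves as the surviving hunk lines show; normalize_url_prefix is a hypothetical name, not part of the patch:

    #include <string>

    // Hypothetical helper mirroring the --url-prefix normalization above;
    // not part of the patch.
    static std::string normalize_url_prefix(std::string p)
    {
        // strip trailing slashes, keeping at least one character
        while (p.length() > 1 && p.back() == '/')
            p.pop_back();
        // a bare "/" means "no prefix at all"
        if (p == "/")
            p.clear();
        return p;
    }

    // normalize_url_prefix("/api//") == "/api"
    // normalize_url_prefix("/")      == ""

One design point worth noting: FLAG_url_prefix is a plain const char*, so the handler parks the normalized string in a function-local static to keep the pointer valid for the life of the process. A local static is initialized exactly once, so if --url-prefix appeared twice on the command line, the second occurrence would silently keep the first normalized value.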
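
The new Client::send_binary loop is the standard POSIX short-write pattern: advance past partial writes, retry on EINTR, and treat everything else as a dead connection. A self-contained sketch of the same pattern; write_all is a hypothetical name, not llamafile API:

    #include <cerrno>
    #include <cstddef>
    #include <unistd.h>

    // Hypothetical standalone equivalent of the loop added to
    // Client::send_binary; writes the whole buffer or reports failure.
    static bool write_all(int fd, const void* p, size_t n)
    {
        const char* ptr = static_cast<const char*>(p);
        size_t total = 0;
        while (total < n) {
            ssize_t sent = write(fd, ptr + total, n - total);
            if (sent > 0) {
                total += sent;      // partial write: advance and keep going
            } else if (sent == 0) {
                return false;       // peer went away
            } else if (errno == EINTR) {
                continue;           // interrupted by a signal: retry
            } else {
                return false;       // EAGAIN, ECONNRESET, or a real error
            }
        }
        return true;
    }

The old code treated any short write as a failure even though write(2) may legitimately transfer fewer than n bytes on a socket; that is the bug this hunk fixes. The companion char buf[512] to char buf[16384] change in dispatcher() is independent: it enlarges the staging buffer used to stream the payload, so each write() carries a bigger chunk and fewer syscalls are needed.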
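
The worker.cpp hunk trades a kill-the-oldest-client policy for the kernel's accept queue. The queue the new comment relies on is the one sized by the backlog argument of listen(2); a generic sketch of where that knob lives, not llamafile's listener code:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    // Generic illustration, not llamafile's listener: while every worker is
    // busy and nobody calls accept(), up to `backlog` completed connections
    // wait in the kernel queue instead of being torn down.
    int make_listener(unsigned short port, int backlog)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd == -1)
            return -1;
        sockaddr_in addr = {};
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port = htons(port);
        if (bind(fd, (const sockaddr*)&addr, sizeof(addr)) == -1 ||
            listen(fd, backlog) == -1) {   // e.g. backlog = SOMAXCONN
            close(fd);
            return -1;
        }
        return fd;  // idle workers block in accept(fd, ...) as they free up
    }

One consequence of the removal: a stuck or very slow request now holds its worker until it finishes, since nothing reaps the oldest active client anymore; queued clients simply wait their turn instead of seeing someone else's request aborted.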