Changes from 1 commit
Commits
29 commits
07b506d
llama-router: multi-model serving with dynamic backends
ServeurpersoCom Nov 29, 2025
25f1433
llama-router: fix logging init via static constructor
ServeurpersoCom Nov 29, 2025
4cbedbc
llama-router: centralize defaults and add mmproj auto-detection
ServeurpersoCom Nov 29, 2025
cb7c489
llama-router: add process grouping for selective VRAM management
ServeurpersoCom Nov 29, 2025
dbf3250
llama-router: add legacy endpoint support for single-model compat
ServeurpersoCom Nov 29, 2025
70eec73
llama-router: add comprehensive debug logging
ServeurpersoCom Nov 29, 2025
0f090e2
llama-router: implement SSE streaming and production safety features
ServeurpersoCom Nov 29, 2025
dac95e8
llama-router: fix segfault from static initialization order fiasco
ServeurpersoCom Nov 29, 2025
ee94bfc
llama-router: auto-detect sibling binary, capture logs, wait for back…
ServeurpersoCom Nov 29, 2025
e472330
llama-router: implement cross-platform subprocess I/O forwarding and …
ServeurpersoCom Nov 30, 2025
728bccc
llama-router: validate binary before spawn, clean child error handling
ServeurpersoCom Nov 30, 2025
cbcc8a8
llama-router: add multi-engine support with configurable spawn and en…
ServeurpersoCom Nov 30, 2025
635b70d
llama-router: fix SSE streaming termination and use-after-free
ServeurpersoCom Nov 30, 2025
232799a
llama-router: auto-rescan, admin endpoints, and fixes
ServeurpersoCom Nov 30, 2025
7f274d5
llama-router: add --import-dir for custom model collections
ServeurpersoCom Nov 30, 2025
bfb3e62
llama-router: add README with CLI reference and configuration guide
ServeurpersoCom Nov 30, 2025
4bc8f69
llama-router: document KISS philosophy, optimization patterns, and sy…
ServeurpersoCom Nov 30, 2025
b14ea20
llama-router: fix PATH binary support and macOS detection
ServeurpersoCom Nov 30, 2025
c5fdd3a
llama-router: separate quick-start guide from technical architecture …
ServeurpersoCom Nov 30, 2025
cb44f59
llama-router: async polling for process termination after SIGKILL
ServeurpersoCom Nov 30, 2025
85f418d
llama-router: separate PROCESS (OS) and BACKEND (HTTP) polling constants
ServeurpersoCom Nov 30, 2025
41f506a
llama-router: add real-time model swap notifications via SSE
ServeurpersoCom Dec 1, 2025
da65c5f
llama-router: document notify_model_swap feature in README and ARCHIT…
ServeurpersoCom Dec 1, 2025
919e581
llama-router: add embedded WebUI support
ServeurpersoCom Dec 1, 2025
b248838
llama-router: add startup_model configuration option
ServeurpersoCom Dec 1, 2025
6e93322
llama-router: document startup_model in README and ARCHITECTURE
ServeurpersoCom Dec 1, 2025
47408bc
llama-router: auto-configure startup_model on first HF download
ServeurpersoCom Dec 1, 2025
1a014b2
llama-router: add --jinja to default spawn configuration
ServeurpersoCom Dec 1, 2025
d99d952
llama-router: replace implicit arg injection with explicit placeholders
ServeurpersoCom Dec 1, 2025
llama-router: fix SSE streaming termination and use-after-free
- Fix use-after-free: capture request data by value (path, method, body)
  instead of by reference, to avoid stack-variable invalidation when
  proxy_request() returns while the upstream thread is still running
  (a minimal sketch of this pattern follows below)
- Use shared_ptr<httplib::Client> so the client outlives async operations
- Fix streaming termination: explicitly call sink.done() when the upstream
  completes, signalling connection closure to httplib (fixes an infinite
  hang; see the provider sketch after the diff)
- Add unlock() before all provider returns to prevent mutex deadlock
- Handle spurious wakeups: pause and retry when the queue is temporarily empty
ServeurpersoCom committed Nov 30, 2025
commit 635b70d481d733428fe6fb9b9b5405bb1c48f038
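To make the first fix concrete, here is a minimal, self-contained C++ sketch of the by-value capture pattern. It is an illustration only, not the router code: the Request struct below is a hypothetical stand-in for httplib::Request.

#include <chrono>
#include <iostream>
#include <string>
#include <thread>

// Hypothetical stand-in for httplib::Request, for illustration only.
struct Request {
    std::string method;
    std::string path;
    std::string body;
};

void handle(const Request & req) {
    // BUG (old pattern): a [&] capture holds `req` by reference; the
    // detached thread can outlive handle(), leaving a dangling reference.

    // FIX (new pattern): copy the fields the thread needs before it starts.
    const std::string method = req.method;
    const std::string path   = req.path;
    const std::string body   = req.body;

    std::thread([method, path, body]() {
        // Safe: the lambda owns these strings, independent of `req`.
        std::cout << method << " " << path << " (" << body.size() << " bytes)\n";
    }).detach();
}

int main() {
    handle({ "POST", "/v1/chat/completions", "{}" });
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    return 0;
}

Copying the three strings up front, exactly as the diff does with path, method, and request_body, removes the race.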
55 changes: 41 additions & 14 deletions tools/router/router-proxy.cpp
@@ -6,6 +6,7 @@
 #include <cpp-httplib/httplib.h>
 
 #include <condition_variable>
+#include <memory>
 #include <mutex>
 #include <queue>
 #include <thread>
@@ -47,14 +48,16 @@ bool proxy_request(const httplib::Request & req,
     }
 
     LOG_INF("Proxying %s %s to upstream %s\n", req.method.c_str(), req.path.c_str(), upstream_base.c_str());
-    httplib::Client client(upstream_base.c_str());
-    client.set_connection_timeout(opts.connection_timeout_s, 0);
-    client.set_read_timeout(opts.read_timeout_s, 0);
+    auto client = std::make_shared<httplib::Client>(upstream_base.c_str());
+    client->set_connection_timeout(opts.connection_timeout_s, 0);
+    client->set_read_timeout(opts.read_timeout_s, 0);
 
     httplib::Headers headers = req.headers;
     headers.erase("Host");
 
-    const std::string path = !req.target.empty() ? req.target : req.path;
+    const std::string path         = !req.target.empty() ? req.target : req.path;
+    const std::string method       = req.method;
+    const std::string request_body = req.body;
 
     if (!matches_any_endpoint(path, proxy_endpoints)) {
         LOG_WRN("Request %s not proxied because it does not match configured endpoints\n", path.c_str());
@@ -63,7 +66,7 @@
         return false;
     }
 
-    std::string content_type = req.get_header_value("Content-Type", "application/json");
+    const std::string content_type = req.get_header_value("Content-Type", "application/json");
 
     const auto accept_header = req.get_header_value("Accept");
     const bool wants_stream = accept_header.find("text/event-stream") != std::string::npos ||
@@ -93,9 +96,17 @@
         return true;
     };
 
-    auto upstream_thread = std::make_shared<std::thread>([&, state_ptr]() {
-        if (req.method == "POST") {
-            result = client.Post(path.c_str(), headers, req.body, content_type.c_str(), content_receiver);
+    auto upstream_thread = std::make_shared<std::thread>([state_ptr,
+                                                          client,
+                                                          path,
+                                                          headers,
+                                                          content_type,
+                                                          method,
+                                                          request_body,
+                                                          content_receiver]() {
+        httplib::Result result;
+        if (method == "POST") {
+            result = client->Post(path.c_str(), headers, request_body, content_type.c_str(), content_receiver);
             if (result) {
                 std::lock_guard<std::mutex> lock(state_ptr->mutex);
                 state_ptr->status = result->status;
@@ -112,7 +123,7 @@
                 state_ptr->content_type = upstream.get_header_value("Content-Type", "text/event-stream");
                 return true;
             };
-            result = client.Get(path.c_str(), headers, response_handler, content_receiver);
+            result = client->Get(path.c_str(), headers, response_handler, content_receiver);
         }
 
         std::lock_guard<std::mutex> lock(state_ptr->mutex);
@@ -134,20 +145,36 @@
         state_ptr->cv.wait(lock, [&] { return !state_ptr->chunks.empty() || state_ptr->done; });
 
         if (!state_ptr->chunks.empty()) {
+
+            // Chunks available: send next chunk to client
             auto chunk = std::move(state_ptr->chunks.front());
             state_ptr->chunks.pop();
             if (!state_ptr->upstream_headers.empty()) {
+
+                // Apply response headers on first chunk
                 res.status = state_ptr->status;
                 res.reason = state_ptr->reason;
                 copy_response_headers(state_ptr->upstream_headers, res);
                 state_ptr->upstream_headers.clear();
                 res.set_header("Content-Type", state_ptr->content_type);
             }
             lock.unlock();
+
+            // sink.write() returns true -> provider continues immediately
             return sink.write(chunk.data(), chunk.size());
         }
 
-        return state_ptr->done;
+        // No chunks available: determine if stream should continue or terminate
+        if (state_ptr->done) {
+            // Upstream finished and all chunks have been sent
+            lock.unlock();
+            sink.done();  // Explicitly signal stream completion to httplib
+            return false; // Stop provider -> httplib closes connection gracefully
+        }
+
+        // Spurious wakeup or transient empty queue: upstream still processing
+        lock.unlock();
+        return false; // Pause provider -> httplib retries after timeout/new data
        },
        [state_ptr, upstream_thread](bool) {
            (void) state_ptr;
@@ -159,14 +186,14 @@
        return true;
    }
 
-   if (req.method == "POST") {
-       result = client.Post(path.c_str(), headers, req.body, content_type.c_str());
+   if (method == "POST") {
+       result = client->Post(path.c_str(), headers, request_body, content_type.c_str());
    } else {
-       result = client.Get(path.c_str(), headers);
+       result = client->Get(path.c_str(), headers);
    }
 
    if (!result) {
-       LOG_ERR("Upstream %s unavailable for %s %s\n", upstream_base.c_str(), req.method.c_str(), path.c_str());
+       LOG_ERR("Upstream %s unavailable for %s %s\n", upstream_base.c_str(), method.c_str(), path.c_str());
        res.status = 502;
        res.set_content("{\"error\":\"upstream unavailable\"}", "application/json");
        return false;
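For context on the termination fix, below is a compact sketch of cpp-httplib's chunked content provider contract that this patch relies on. The endpoint, port, and payload are invented for illustration; only the sink.write()/sink.done() calls and the bool return semantics mirror the change above.

#include <cpp-httplib/httplib.h>

#include <memory>
#include <string>

int main() {
    httplib::Server server;

    // Illustrative endpoint: streams three SSE chunks, then terminates cleanly.
    server.Get("/stream", [](const httplib::Request &, httplib::Response & res) {
        auto counter = std::make_shared<int>(0);
        res.set_chunked_content_provider(
            "text/event-stream",
            [counter](size_t /* offset */, httplib::DataSink & sink) {
                if (*counter < 3) {
                    const std::string chunk = "data: chunk " + std::to_string((*counter)++) + "\n\n";
                    // true -> httplib keeps the provider alive and calls it again
                    return sink.write(chunk.data(), chunk.size());
                }
                sink.done();  // signal end-of-stream, as the fix does on upstream completion
                return false; // stop the provider; the connection closes gracefully
            });
    });

    server.listen("127.0.0.1", 8099);
    return 0;
}

Without the explicit sink.done(), httplib keeps the connection open waiting for more data, which is the infinite hang this commit fixes.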