From 51be1fae5a2de9ad3d501de812dd72f34b829cf0 Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 4 Dec 2025 20:33:33 +0100 Subject: [PATCH 01/19] llama-server: recursive GGUF loading Replace flat directory scan with recursive traversal using std::filesystem::recursive_directory_iterator. Support for nested vendor/model layouts (e.g. vendor/model/*.gguf). Model name now reflects the relative path within --models-dir instead of just the filename. Aggregate files by parent directory via std::map before constructing local_model --- tools/server/README.md | 2 + tools/server/server-models.cpp | 83 ++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 35 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index f98fb44c7bc..089ef5a0cbf 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1383,6 +1383,8 @@ Alternatively, you can point the router to a local directory containing your GGU llama-server --models-dir ./models_directory ``` +The directory is scanned recursively, so nested vendor/model layouts such as `vendor_name/model_name/*.gguf` are supported. The model name in the router UI matches the relative path inside `--models-dir` (for example, `vendor_name/model_name`). + If the model contains multiple GGUF (for multimodal or multi-shard), files should be put into a subdirectory. 
The directory structure should look like this: ```sh diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 6f88e93c4bb..3cf4b412ff3 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -86,49 +87,61 @@ static std::vector list_local_models(const std::string & dir) { } std::vector models; - auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { - auto files = fs_list(subdir_path, false); + + struct dir_model_files { common_file_info model_file; common_file_info first_shard_file; common_file_info mmproj_file; - for (const auto & file : files) { - if (string_ends_with(file.name, ".gguf")) { - if (file.name.find("mmproj") != std::string::npos) { - mmproj_file = file; - } else if (file.name.find("-00001-of-") != std::string::npos) { - first_shard_file = file; - } else { - model_file = file; - } - } + }; + + std::map model_directories; + + for (const auto & entry : std::filesystem::recursive_directory_iterator( + dir, std::filesystem::directory_options::skip_permission_denied)) { + if (!entry.is_regular_file()) { + continue; } - // single file model - local_model model{ - /* name */ name, - /* path */ first_shard_file.path.empty() ? 
model_file.path : first_shard_file.path, - /* path_mmproj */ mmproj_file.path // can be empty - }; - if (!model.path.empty()) { - models.push_back(model); + + const auto & path = entry.path(); + if (!string_ends_with(path.filename().string(), ".gguf")) { + continue; } - }; - auto files = fs_list(dir, true); - for (const auto & file : files) { - if (file.is_dir) { - scan_subdir(file.path, file.name); - } else if (string_ends_with(file.name, ".gguf")) { - // single file model - std::string name = file.name; - string_replace_all(name, ".gguf", ""); - local_model model{ - /* name */ name, - /* path */ file.path, - /* path_mmproj */ "" - }; - models.push_back(model); + auto & files = model_directories[path.parent_path()]; + const auto filename = path.filename().string(); + if (filename.find("mmproj") != std::string::npos) { + files.mmproj_file = {path.string(), filename, 0, false}; + } else if (filename.find("-00001-of-") != std::string::npos) { + files.first_shard_file = {path.string(), filename, 0, false}; + } else { + files.model_file = {path.string(), filename, 0, false}; + } + } + + for (const auto & [parent_path, files] : model_directories) { + std::string model_path = files.first_shard_file.path.empty() ? 
files.model_file.path : files.first_shard_file.path; + if (model_path.empty()) { + continue; + } + + std::string name; + std::error_code ec; + auto rel_parent = std::filesystem::relative(parent_path, dir, ec); + if (!ec && !rel_parent.empty() && rel_parent.string() != ".") { + name = rel_parent.generic_string(); + } else { + std::filesystem::path model_file_path(model_path); + name = model_file_path.stem().string(); } + + local_model model{ + /* name */ name, + /* path */ model_path, + /* path_mmproj */ files.mmproj_file.path + }; + models.push_back(model); } + return models; } From 972369e81a4842f713ac7099e29cc23746474084 Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 11:06:02 +0100 Subject: [PATCH 02/19] server : router config POC (INI-based per-model settings) --- common/arg.cpp | 20 ++ common/arg.h | 4 + tools/server/CMakeLists.txt | 10 + tools/server/server-config.cpp | 339 +++++++++++++++++++++++++++++++++ tools/server/server-config.h | 40 ++++ tools/server/server-models.cpp | 162 +++++++++++++--- tools/server/server-models.h | 3 + 7 files changed, 547 insertions(+), 31 deletions(-) create mode 100644 tools/server/server-config.cpp create mode 100644 tools/server/server-config.h diff --git a/common/arg.cpp b/common/arg.cpp index 210ef8d6214..6e79225dc6d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -64,6 +64,26 @@ static std::string read_file(const std::string & fname) { return content; } +static const std::vector & get_common_arg_defs() { + static const std::vector options = [] { + common_params params; + auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr); + return ctx.options; + }(); + return options; +} + +std::string common_arg_get_env_name(const std::string & flag) { + for (const auto & arg : get_common_arg_defs()) { + for (const auto & arg_flag : arg.args) { + if (arg_flag == flag) { + return arg.env ? 
arg.env : ""; + } + } + } + return ""; +} + common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = examples; return *this; diff --git a/common/arg.h b/common/arg.h index 7ab7e2cea43..7ebe0cede07 100644 --- a/common/arg.h +++ b/common/arg.h @@ -79,6 +79,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e // function to be used by test-arg-parser common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +// Get environment variable name for a CLI flag (e.g. "--ctx-size" -> "LLAMA_ARG_CTX_SIZE") +// Returns empty string if flag not found +std::string common_arg_get_env_name(const std::string & flag); + struct common_remote_params { std::vector headers; long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index a39b4c5b35f..14459245a95 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -38,6 +38,16 @@ set(TARGET_SRCS server-http.h server-models.cpp server-models.h + server-config.cpp + server-config.h + server-task.cpp + server-task.h + server-queue.cpp + server-queue.h + server-common.cpp + server-common.h + server-context.cpp + server-context.h ) set(PUBLIC_ASSETS index.html.gz diff --git a/tools/server/server-config.cpp b/tools/server/server-config.cpp new file mode 100644 index 00000000000..52dccc6cb27 --- /dev/null +++ b/tools/server/server-config.cpp @@ -0,0 +1,339 @@ +#include "server-config.h" + +#include "peg-parser.h" +#include "arg.h" + +#include +#include +#include +#include +#include +#include + +namespace { + +bool is_option(const std::string & arg) { + return !arg.empty() && arg[0] == '-'; +} + +std::string trim(const std::string & value) { + const auto is_space = [](unsigned char c) { return std::isspace(c) != 0; }; + size_t start = 0; + while (start < value.size() && is_space(value[start])) { + 
++start; + } + size_t end = value.size(); + while (end > start && is_space(value[end - 1])) { + --end; + } + return value.substr(start, end - start); +} + +bool is_implicit_value(const std::vector & args, size_t index) { + return index + 1 < args.size() && !is_option(args[index + 1]); +} + +std::string relativize(const std::string & path, const std::string & base) { + if (path.empty()) { + return path; + } + + std::error_code ec; + const auto abs_path = std::filesystem::absolute(path, ec); + if (ec) { + return path; + } + const auto abs_base = std::filesystem::absolute(base, ec); + if (ec) { + return path; + } + + const auto rel = std::filesystem::relative(abs_path, abs_base, ec); + if (ec) { + return path; + } + + return rel.generic_string(); +} + +} // namespace + +server_config_manager::server_config_manager(const std::string & models_dir) + : models_dir(models_dir) { + if (!models_dir.empty()) { + path = (std::filesystem::path(models_dir) / "config.ini").string(); + } +} + +bool server_config_manager::enabled() const { + return !models_dir.empty(); +} + +void server_config_manager::ensure_loaded() { + if (!enabled()) { + return; + } + + namespace fs = std::filesystem; + + std::lock_guard lock(mutex); + + if (!fs::exists(path)) { + data.clear(); + last_write_time = {}; + return; + } + + const auto current_write_time = fs::last_write_time(path); + if (last_write_time == current_write_time) { + return; + } + + std::ifstream file(path); + if (!file.good()) { + throw std::runtime_error("failed to open server config file: " + path); + } + + std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + + static const auto & parser = *new common_peg_arena(build_peg_parser([](common_peg_parser_builder & p) { + const auto ws = p.space(); + const auto new_line = p.choice({p.literal("\r\n"), p.literal("\n"), p.literal("\r")}); + + const auto section_name = p.tag("section-name", p.until("]")); + const auto section_line = p.zero_or_more(ws) + "[" + 
section_name + "]" + p.optional(p.until_one_of({"\r", "\n"})); + + const auto key = p.tag("key", p.until("=")); + const auto value = p.tag("value", p.until_one_of({"\r", "\n"})); + const auto key_value_line = p.zero_or_more(ws) + key + p.zero_or_more(ws) + "=" + p.zero_or_more(ws) + p.optional(value); + + const auto comment = p.choice({p.literal(";"), p.literal("#")}) + p.optional(p.until_one_of({"\r", "\n"})); + const auto comment_line = p.zero_or_more(ws) + comment; + + const auto blank_line = p.zero_or_more(ws) + new_line; + + const auto line = p.choice({ + section_line << p.optional(new_line), + key_value_line << p.optional(new_line), + comment_line << p.optional(new_line), + blank_line, + }); + + return p.rule("ini", p.zero_or_more(line) << p.optional(p.zero_or_more(ws)) << p.end()); + })); + + common_peg_parse_context ctx(contents); + const auto result = parser.parse(ctx); + if (!result.success() || result.end != contents.size()) { + throw std::runtime_error("failed to parse server config file: " + path); + } + + std::map> parsed; + std::string current_section; + std::optional pending_key; + + const auto flush_pending = [&](const std::string & value) { + if (current_section.empty() || !pending_key) { + return; + } + + const auto & key = *pending_key; + if (key.rfind("LLAMA_ARG_", 0) != 0) { + return; + } + + parsed[current_section][key] = value; + }; + + ctx.ast.visit(result, [&](const common_peg_ast_node & node) { + if (node.tag == "section-name") { + if (pending_key) { + flush_pending(""); + pending_key.reset(); + } + + current_section = trim(std::string(node.text)); + return; + } + + if (node.tag == "key") { + if (pending_key) { + flush_pending(""); + } + + pending_key = trim(std::string(node.text)); + return; + } + + if (node.tag == "value") { + if (!pending_key) { + return; + } + + flush_pending(trim(std::string(node.text))); + pending_key.reset(); + return; + } + }); + + if (pending_key) { + flush_pending(""); + } + + data = std::move(parsed); + 
last_write_time = current_write_time; +} + +// write_locked expects the caller to hold `mutex`. +void server_config_manager::write_locked() { + if (!enabled()) { + return; + } + + namespace fs = std::filesystem; + + if (!path.empty()) { + auto parent = fs::path(path).parent_path(); + if (!parent.empty()) { + fs::create_directories(parent); + } + } + + std::ofstream file(path); + file << "LLAMA_CONFIG_VERSION=1\n\n"; + + bool first_section = true; + for (const auto & [section, args] : data) { + if (!first_section) { + file << "\n"; + } + first_section = false; + + file << "[" << section << "]\n"; + for (const auto & [key, value] : args) { + file << key << "="; + if (!value.empty()) { + file << value; + } + file << "\n"; + } + } + + file.flush(); + last_write_time = fs::last_write_time(path); +} + +bool is_router_control_arg(const std::string & arg) { + static const std::set blacklist = { + "--alias", // set per-child in server_models::load + "--models-dir", // router-side discovery only + "--models-max", // router capacity control + "--no-models-autoload", // router autoload policy + "--port", // router port differs from child port + "-m", "--model", // model path supplied per-child + "-hf", "--hf-file" // model source supplied per-child + }; + return blacklist.count(arg) != 0; +} + +void server_config_manager::sync(const std::vector & models, const std::vector & base_args) { + if (!enabled()) { + return; + } + + ensure_loaded(); + + std::map router_args; + + for (size_t i = 1; i < base_args.size(); ++i) { // skip argv[0] + const auto & arg = base_args[i]; + if (!is_option(arg)) { + continue; + } + + if (is_router_control_arg(arg)) { + if (is_implicit_value(base_args, i)) { + ++i; + } + continue; + } + + std::string value = "true"; + if (is_implicit_value(base_args, i)) { + value = base_args[i + 1]; + ++i; + } + + const auto env_name = common_arg_get_env_name(arg); + if (!env_name.empty()) { + router_args[env_name] = value; + } + } + + std::lock_guard lock(mutex); + 
+ bool changed = !std::filesystem::exists(path); + + const auto model_key = common_arg_get_env_name("--model"); + const auto model_alias = common_arg_get_env_name("-m"); + const auto mmproj_key = common_arg_get_env_name("--mmproj"); + + const std::vector model_keys = { + model_key, + model_alias, + "LLAMA_ARG_MODEL", + }; + + const std::vector mmproj_keys = { + mmproj_key, + "LLAMA_ARG_MMPROJ", + }; + + for (const auto & model : models) { + auto & section = data[model.name]; + + const auto has_any_key = [](const auto & section_map, const std::vector & keys) { + for (const auto & key : keys) { + if (!key.empty() && section_map.find(key) != section_map.end()) { + return true; + } + } + return false; + }; + + if (!model_key.empty() && !has_any_key(section, model_keys)) { + section[model_key] = relativize(model.path, models_dir); + changed = true; + } + + if (!model.path_mmproj.empty() && !mmproj_key.empty() && !has_any_key(section, mmproj_keys)) { + section[mmproj_key] = relativize(model.path_mmproj, models_dir); + changed = true; + } + + for (const auto & router_arg : router_args) { + if (section.find(router_arg.first) == section.end()) { + section[router_arg.first] = router_arg.second; + changed = true; + } + } + } + + if (changed) { + write_locked(); + } +} + +std::map server_config_manager::env_for(const std::string & name) { + if (!enabled()) { + return {}; + } + + ensure_loaded(); + + std::lock_guard lock(mutex); + + auto it = data.find(name); + return it != data.end() ? 
it->second : std::map{}; +} + diff --git a/tools/server/server-config.h b/tools/server/server-config.h new file mode 100644 index 00000000000..c8fc7efa83b --- /dev/null +++ b/tools/server/server-config.h @@ -0,0 +1,40 @@ +#pragma once + +#include "server-common.h" + +#include +#include +#include +#include +#include + +struct server_local_model { + std::string name; + std::string path; + std::string path_mmproj; +}; + +class server_config_manager { +public: + explicit server_config_manager(const std::string & models_dir); + + bool enabled() const; + + void sync(const std::vector & models, const std::vector & base_args); + + std::map env_for(const std::string & name); + +private: + void ensure_loaded(); + void write_locked(); + +private: + std::string path; + std::string models_dir; + std::filesystem::file_time_type last_write_time{}; + std::map> data; + std::mutex mutex; +}; + +bool is_router_control_arg(const std::string & arg); + diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 3cf4b412ff3..ae75dd79ebf 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -1,5 +1,6 @@ #include "server-common.h" #include "server-models.h" +#include "server-config.h" #include "download.h" @@ -11,7 +12,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -75,23 +78,53 @@ static std::filesystem::path get_server_exec_path() { #endif } -struct local_model { - std::string name; - std::string path; - std::string path_mmproj; -}; +static std::string to_upper_copy(std::string value) { + std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { return (char) std::toupper(c); }); + return value; +} + +static std::string pick_preferred_mmproj(const std::vector & paths) { + if (paths.empty()) { + return ""; + } + + auto score = [](const std::string & path) { + const auto upper = to_upper_copy(path); + if (upper.find("BF16") != std::string::npos) { + return 3; + } + if 
(upper.find("F16") != std::string::npos) { + return 2; + } + if (upper.find("F32") != std::string::npos) { + return 1; + } + return 0; + }; + + const auto * best = &paths.front(); + int best_score = score(best->string()); + for (const auto & candidate : paths) { + const int candidate_score = score(candidate.string()); + if (candidate_score > best_score) { + best = &candidate; + best_score = candidate_score; + } + } + + return best->string(); +} -static std::vector list_local_models(const std::string & dir) { +static std::vector list_local_models(const std::string & dir) { if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); } - std::vector models; + std::vector models; struct dir_model_files { - common_file_info model_file; - common_file_info first_shard_file; - common_file_info mmproj_file; + std::vector model_files; + std::vector mmproj_files; }; std::map model_directories; @@ -110,34 +143,57 @@ static std::vector list_local_models(const std::string & dir) { auto & files = model_directories[path.parent_path()]; const auto filename = path.filename().string(); if (filename.find("mmproj") != std::string::npos) { - files.mmproj_file = {path.string(), filename, 0, false}; - } else if (filename.find("-00001-of-") != std::string::npos) { - files.first_shard_file = {path.string(), filename, 0, false}; - } else { - files.model_file = {path.string(), filename, 0, false}; + files.mmproj_files.push_back(path); + continue; } + + if (filename.find("-00001-of-") != std::string::npos) { + files.model_files.push_back(path); + continue; + } + + // skip shards that aren't the first chunk + if (filename.find("-000") != std::string::npos && filename.find("-of-") != std::string::npos) { + continue; + } + + files.model_files.push_back(path); } for (const auto & [parent_path, files] : model_directories) { - std::string model_path = 
files.first_shard_file.path.empty() ? files.model_file.path : files.first_shard_file.path; - if (model_path.empty()) { + if (files.model_files.empty()) { continue; } - std::string name; + std::string preferred_mmproj = pick_preferred_mmproj(files.mmproj_files); + + const auto * best_model = &files.model_files.front(); + std::uintmax_t best_size = std::numeric_limits::max(); + for (const auto & candidate : files.model_files) { + std::error_code size_ec; + const auto size = std::filesystem::file_size(candidate, size_ec); + if (size_ec) { + continue; + } + if (best_size == std::numeric_limits::max() || size < best_size) { + best_model = &candidate; + best_size = size; + } + } + std::error_code ec; auto rel_parent = std::filesystem::relative(parent_path, dir, ec); + std::string name; if (!ec && !rel_parent.empty() && rel_parent.string() != ".") { name = rel_parent.generic_string(); } else { - std::filesystem::path model_file_path(model_path); - name = model_file_path.stem().string(); + name = parent_path.filename().generic_string(); } - local_model model{ + server_local_model model{ /* name */ name, - /* path */ model_path, - /* path_mmproj */ files.mmproj_file.path + /* path */ std::filesystem::absolute(*best_model).string(), + /* path_mmproj */ preferred_mmproj.empty() ? 
"" : std::filesystem::absolute(preferred_mmproj).string() }; models.push_back(model); } @@ -145,6 +201,29 @@ static std::vector list_local_models(const std::string & dir) { return models; } +static bool is_option(const std::string & arg) { + return !arg.empty() && arg[0] == '-'; +} + +static std::vector strip_router_control_args(const std::vector & args) { + std::vector filtered; + filtered.reserve(args.size()); + + for (size_t i = 0; i < args.size(); ++i) { + const auto & arg = args[i]; + if (is_router_control_arg(arg)) { + if (i + 1 < args.size() && !is_option(args[i + 1])) { + ++i; + } + continue; + } + + filtered.push_back(arg); + } + + return filtered; +} + // // server_models // @@ -153,10 +232,12 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params) { + char ** envp) : base_params(params), server_config(params.models_dir) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } + + base_args = strip_router_control_args(base_args); for (char ** env = envp; *env != nullptr; env++) { base_env.push_back(std::string(*env)); } @@ -192,6 +273,7 @@ server_models::server_models( // add local models specificed via --models-dir if (!params.models_dir.empty()) { auto local_models = list_local_models(params.models_dir); + server_config.sync(local_models, base_args); for (const auto & model : local_models) { if (mapping.find(model.name) != mapping.end()) { // already exists in cached models, skip @@ -349,15 +431,21 @@ void server_models::unload_lru() { } static void add_or_replace_arg(std::vector & args, const std::string & key, const std::string & value) { - for (size_t i = 0; i < args.size(); i++) { - if (args[i] == key && i + 1 < args.size()) { - args[i + 1] = value; - return; + for (size_t i = 0; i < args.size();) { + if (args[i] == key) { + args.erase(args.begin() + i); + if (i < args.size() && !is_option(args[i])) { + args.erase(args.begin() + i); + } + } else { + ++i; } } 
- // not found, append + args.push_back(key); - args.push_back(value); + if (!value.empty()) { + args.push_back(value); + } } void server_models::load(const std::string & name, bool auto_load) { @@ -391,7 +479,7 @@ void server_models::load(const std::string & name, bool auto_load) { std::vector child_args; if (auto_load && !meta.args.empty()) { - child_args = meta.args; // copy previous args + child_args = strip_router_control_args(meta.args); // copy previous args minus router-only flags } else { child_args = base_args; // copy if (inst.meta.in_cache) { @@ -409,6 +497,18 @@ void server_models::load(const std::string & name, bool auto_load) { add_or_replace_arg(child_args, "--alias", inst.meta.name); std::vector child_env = base_env; // copy + auto config_env = server_config.env_for(inst.meta.name); + for (const auto & [key, value] : config_env) { + if (value == "false") { + continue; + } + + if (value == "true" || value.empty()) { + child_env.push_back(key + "="); + } else { + child_env.push_back(key + "=" + value); + } + } child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); SRV_INF("%s", "spawning server instance with args:\n"); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 526e7488dc9..725e059b87c 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -2,6 +2,7 @@ #include "common.h" #include "server-http.h" +#include "server-config.h" #include #include @@ -85,6 +86,8 @@ struct server_models { std::vector base_args; std::vector base_env; + server_config_manager server_config; + void update_meta(const std::string & name, const server_model_meta & meta); // unload least recently used models if the limit is reached From d564ebf9c9a1e6ccfd196d688cc5ecbf3448062e Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 12:24:02 +0100 Subject: [PATCH 03/19] server: address review feedback from @aldehir and @ngxson PEG parser usage improvements: - Simplify parser instantiation 
(remove arena indirection) - Optimize grammar usage (ws instead of zero_or_more, remove optional wrapping) - Fix last line without newline bug (+ operator instead of <<) - Remove redundant end position check Feature scope: - Remove auto-reload feature (will be separate PR per @ngxson) - Keep config.ini auto-creation and template generation - Preserve per-model customization logic Co-authored-by: aldehir Co-authored-by: ngxson --- tools/server/server-config.cpp | 32 ++++++++++++-------------------- tools/server/server-config.h | 1 - 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/tools/server/server-config.cpp b/tools/server/server-config.cpp index 52dccc6cb27..955ff89af9a 100644 --- a/tools/server/server-config.cpp +++ b/tools/server/server-config.cpp @@ -80,12 +80,6 @@ void server_config_manager::ensure_loaded() { if (!fs::exists(path)) { data.clear(); - last_write_time = {}; - return; - } - - const auto current_write_time = fs::last_write_time(path); - if (last_write_time == current_write_time) { return; } @@ -96,35 +90,35 @@ void server_config_manager::ensure_loaded() { std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - static const auto & parser = *new common_peg_arena(build_peg_parser([](common_peg_parser_builder & p) { + static const auto parser = build_peg_parser([](auto & p) { const auto ws = p.space(); const auto new_line = p.choice({p.literal("\r\n"), p.literal("\n"), p.literal("\r")}); const auto section_name = p.tag("section-name", p.until("]")); - const auto section_line = p.zero_or_more(ws) + "[" + section_name + "]" + p.optional(p.until_one_of({"\r", "\n"})); + const auto section_line = ws + "[" + section_name + "]" + p.until_one_of({"\r", "\n"}); const auto key = p.tag("key", p.until("=")); const auto value = p.tag("value", p.until_one_of({"\r", "\n"})); - const auto key_value_line = p.zero_or_more(ws) + key + p.zero_or_more(ws) + "=" + p.zero_or_more(ws) + p.optional(value); + const auto 
key_value_line = ws + key + ws + "=" + ws + value; - const auto comment = p.choice({p.literal(";"), p.literal("#")}) + p.optional(p.until_one_of({"\r", "\n"})); - const auto comment_line = p.zero_or_more(ws) + comment; + const auto comment = p.choice({p.literal(";"), p.literal("#")}) + p.until_one_of({"\r", "\n"}); + const auto comment_line = ws + comment; - const auto blank_line = p.zero_or_more(ws) + new_line; + const auto blank_line = ws + new_line; const auto line = p.choice({ - section_line << p.optional(new_line), - key_value_line << p.optional(new_line), - comment_line << p.optional(new_line), + section_line + new_line, + key_value_line + new_line, + comment_line + new_line, blank_line, }); - return p.rule("ini", p.zero_or_more(line) << p.optional(p.zero_or_more(ws)) << p.end()); - })); + return p.rule("ini", p.zero_or_more(line) + p.optional(ws) + p.end()); + }); common_peg_parse_context ctx(contents); const auto result = parser.parse(ctx); - if (!result.success() || result.end != contents.size()) { + if (!result.success()) { throw std::runtime_error("failed to parse server config file: " + path); } @@ -181,7 +175,6 @@ void server_config_manager::ensure_loaded() { } data = std::move(parsed); - last_write_time = current_write_time; } // write_locked expects the caller to hold `mutex`. 
@@ -220,7 +213,6 @@ void server_config_manager::write_locked() { } file.flush(); - last_write_time = fs::last_write_time(path); } bool is_router_control_arg(const std::string & arg) { diff --git a/tools/server/server-config.h b/tools/server/server-config.h index c8fc7efa83b..53395890f5f 100644 --- a/tools/server/server-config.h +++ b/tools/server/server-config.h @@ -31,7 +31,6 @@ class server_config_manager { private: std::string path; std::string models_dir; - std::filesystem::file_time_type last_write_time{}; std::map> data; std::mutex mutex; }; From 193bead2684ae14b725a67c672a1ce39a352d69a Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 12:36:01 +0100 Subject: [PATCH 04/19] server: adopt aldehir's line-oriented PEG parser Complete rewrite of INI parser grammar and visitor: - Use p.chars(), p.negate(), p.any() instead of p.until() - Support end-of-line comments (key=value # comment) - Handle EOF without trailing newline correctly - Strict identifier validation ([a-zA-Z_][a-zA-Z0-9_.-]*) - Simplified visitor (no pending state, no trim needed) - Grammar handles whitespace natively via eol rule Business validation preserved: - Reject section names starting with LLAMA_ARG_* - Accept only keys starting with LLAMA_ARG_* - Require explicit section before key-value pairs Co-authored-by: aldehir --- tools/server/server-config.cpp | 123 ++++++++++++++------------------- 1 file changed, 50 insertions(+), 73 deletions(-) diff --git a/tools/server/server-config.cpp b/tools/server/server-config.cpp index 955ff89af9a..f6c8746c7ce 100644 --- a/tools/server/server-config.cpp +++ b/tools/server/server-config.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include namespace { @@ -16,19 +15,6 @@ bool is_option(const std::string & arg) { return !arg.empty() && arg[0] == '-'; } -std::string trim(const std::string & value) { - const auto is_space = [](unsigned char c) { return std::isspace(c) != 0; }; - size_t start = 0; - while (start < value.size() && 
is_space(value[start])) { - ++start; - } - size_t end = value.size(); - while (end > start && is_space(value[end - 1])) { - --end; - } - return value.substr(start, end - start); -} - bool is_implicit_value(const std::vector & args, size_t index) { return index + 1 < args.size() && !is_option(args[index + 1]); } @@ -91,29 +77,44 @@ void server_config_manager::ensure_loaded() { std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); static const auto parser = build_peg_parser([](auto & p) { - const auto ws = p.space(); - const auto new_line = p.choice({p.literal("\r\n"), p.literal("\n"), p.literal("\r")}); + // newline ::= "\r\n" / "\n" / "\r" + auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r")); + + // ws ::= [ \t]* + auto ws = p.rule("ws", p.chars("[ \t]", 0, -1)); + + // comment ::= [;#] (!newline .)* + auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any())); + + // eol ::= ws comment? 
(newline / EOF) + auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end())); - const auto section_name = p.tag("section-name", p.until("]")); - const auto section_line = ws + "[" + section_name + "]" + p.until_one_of({"\r", "\n"}); + // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]* + auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1)); - const auto key = p.tag("key", p.until("=")); - const auto value = p.tag("value", p.until_one_of({"\r", "\n"})); - const auto key_value_line = ws + key + ws + "=" + ws + value; + // value ::= (!eol-start .)* + auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end())); + auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any())); - const auto comment = p.choice({p.literal(";"), p.literal("#")}) + p.until_one_of({"\r", "\n"}); - const auto comment_line = ws + comment; + // header-line ::= "[" ws ident ws "]" eol + auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol); - const auto blank_line = ws + new_line; + // kv-line ::= ident ws "=" ws value eol + auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol); - const auto line = p.choice({ - section_line + new_line, - key_value_line + new_line, - comment_line + new_line, - blank_line, - }); + // comment-line ::= ws comment (newline / EOF) + auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end())); - return p.rule("ini", p.zero_or_more(line) + p.optional(ws) + p.end()); + // blank-line ::= ws (newline / EOF) + auto blank_line = p.rule("blank-line", ws + (newline | p.end())); + + // line ::= header-line / kv-line / comment-line / blank-line + auto line = p.rule("line", header_line | kv_line | comment_line | blank_line); + + // ini ::= line* EOF + auto ini = p.rule("ini", p.zero_or_more(line) + p.end()); + + return ini; }); common_peg_parse_context ctx(contents); @@ -123,57 
+124,33 @@ void server_config_manager::ensure_loaded() { } std::map> parsed; - std::string current_section; - std::optional pending_key; - - const auto flush_pending = [&](const std::string & value) { - if (current_section.empty() || !pending_key) { - return; - } - - const auto & key = *pending_key; - if (key.rfind("LLAMA_ARG_", 0) != 0) { - return; - } - parsed[current_section][key] = value; - }; + std::string current_section; + std::string current_key; - ctx.ast.visit(result, [&](const common_peg_ast_node & node) { + ctx.ast.visit(result, [&](const auto & node) { if (node.tag == "section-name") { - if (pending_key) { - flush_pending(""); - pending_key.reset(); - } - - current_section = trim(std::string(node.text)); - return; - } - - if (node.tag == "key") { - if (pending_key) { - flush_pending(""); - } - - pending_key = trim(std::string(node.text)); - return; - } - - if (node.tag == "value") { - if (!pending_key) { + const std::string section = std::string(node.text); + if (section.rfind("LLAMA_ARG_", 0) == 0) { + current_section.clear(); return; } - flush_pending(trim(std::string(node.text))); - pending_key.reset(); - return; + current_section = section; + parsed[current_section] = {}; + } else if (node.tag == "key") { + const std::string key = std::string(node.text); + if (key.rfind("LLAMA_ARG_", 0) == 0) { + current_key = key; + } else { + current_key.clear(); + } + } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) { + parsed[current_section][current_key] = std::string(node.text); + current_key.clear(); } }); - if (pending_key) { - flush_pending(""); - } - data = std::move(parsed); } From a17f501c40ede930f316f9246da9b9d39dbc14da Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 13:11:12 +0100 Subject: [PATCH 05/19] server: fix CLI/env duplication in child processes Children now receive minimal CLI args (executable, model, port, alias) instead of inheriting all router args. 
Global settings pass through LLAMA_ARG_* environment variables only, eliminating duplicate config warnings. Fixes: Router args like -ngl, -fa were passed both via CLI and env, causing 'will be overwritten' warnings on every child spawn --- tools/server/server-models.cpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index ae75dd79ebf..0a4358e76dc 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -481,20 +481,25 @@ void server_models::load(const std::string & name, bool auto_load) { if (auto_load && !meta.args.empty()) { child_args = strip_router_control_args(meta.args); // copy previous args minus router-only flags } else { - child_args = base_args; // copy + child_args.push_back(base_args[0]); if (inst.meta.in_cache) { - add_or_replace_arg(child_args, "-hf", inst.meta.name); + child_args.push_back("-hf"); + child_args.push_back(inst.meta.name); } else { - add_or_replace_arg(child_args, "-m", inst.meta.path); + child_args.push_back("-m"); + child_args.push_back(inst.meta.path); if (!inst.meta.path_mmproj.empty()) { - add_or_replace_arg(child_args, "--mmproj", inst.meta.path_mmproj); + child_args.push_back("--mmproj"); + child_args.push_back(inst.meta.path_mmproj); } } - } - // set model args - add_or_replace_arg(child_args, "--port", std::to_string(inst.meta.port)); - add_or_replace_arg(child_args, "--alias", inst.meta.name); + child_args.push_back("--port"); + child_args.push_back(std::to_string(inst.meta.port)); + + child_args.push_back("--alias"); + child_args.push_back(inst.meta.name); + } std::vector child_env = base_env; // copy auto config_env = server_config.env_for(inst.meta.name); From 31cb86a28fcac976f693070c0f5674aa910111b5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 18:52:40 +0100 Subject: [PATCH 06/19] add common/preset.cpp --- common/CMakeLists.txt | 2 + common/arg.cpp | 57 +++++- 
common/arg.h | 25 ++- common/preset.cpp | 186 ++++++++++++++++++ common/preset.h | 33 ++++ tools/server/CMakeLists.txt | 2 - tools/server/README.md | 2 - tools/server/server-config.cpp | 308 ------------------------------ tools/server/server-config.h | 39 ---- tools/server/server-models.cpp | 339 ++++++++++++--------------------- tools/server/server-models.h | 23 ++- 11 files changed, 438 insertions(+), 578 deletions(-) create mode 100644 common/preset.cpp create mode 100644 common/preset.h delete mode 100644 tools/server/server-config.cpp delete mode 100644 tools/server/server-config.h diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 377b26846b6..0182767c2b3 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -73,6 +73,8 @@ add_library(${TARGET} STATIC ngram-cache.h peg-parser.cpp peg-parser.h + preset.cpp + preset.h regex-partial.cpp regex-partial.h sampling.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 6e79225dc6d..5f181b343bc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -154,7 +154,7 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -std::string common_arg::to_string() { +std::string common_arg::to_string() const { // params for printing to console const static int n_leading_spaces = 40; const static int n_char_per_line_help = 70; // TODO: detect this based on current console @@ -667,6 +667,53 @@ static void add_rpc_devices(const std::string & servers) { } } +bool common_params_parse(int argc, char ** argv, llama_example ex, std::map & out_map) { + common_params dummy_params; + common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr); + + std::unordered_map arg_to_options; + for (auto & opt : ctx_arg.options) { + for (const auto & arg : opt.args) { + arg_to_options[arg] = &opt; + } + } + + // TODO @ngxson : find a way to deduplicate this code + + // handle command line arguments + auto check_arg = [&](int i) { + if (i+1 >= argc) { + throw 
std::invalid_argument("expected value for argument"); + } + }; + + for (int i = 1; i < argc; i++) { + const std::string arg_prefix = "--"; + + std::string arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); + } + auto opt = *arg_to_options[arg]; + std::string val; + if (opt.value_hint != nullptr) { + // arg with single value + check_arg(i); + val = argv[++i]; + } + if (opt.value_hint_2 != nullptr) { + // TODO: support arg with 2 values + throw std::invalid_argument("error: argument with 2 values is not yet supported\n"); + } + out_map[opt] = val; + } + + return true; +} + bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { auto ctx_arg = common_params_parser_init(params, ex, print_usage); const common_params params_org = ctx_arg.params; // the example can modify the default params @@ -3270,3 +3317,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex return ctx_arg; } + +static std::string rm_leading_dashes(const std::string & str) { + size_t pos = 0; + while (pos < str.size() && str[pos] == '-') { + ++pos; + } + return str.substr(pos); +} diff --git a/common/arg.h b/common/arg.h index 7ebe0cede07..a0fef251d27 100644 --- a/common/arg.h +++ b/common/arg.h @@ -3,6 +3,7 @@ #include "common.h" #include +#include #include #include @@ -24,6 +25,8 @@ struct common_arg { void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; void (*handler_int) (common_params & params, int) = nullptr; + common_arg() = default; + common_arg( const std::initializer_list & args, const char * value_hint, @@ -61,7 +64,21 @@ struct common_arg { bool is_exclude(enum llama_example ex); bool get_value_from_env(std::string & 
output) const; bool has_value_from_env() const; - std::string to_string(); + std::string to_string() const; + + // for using as key in std::map + bool operator<(const common_arg& other) const { + if (args.empty() || other.args.empty()) { + return false; + } + return strcmp(args[0], other.args[0]) < 0; + } + bool operator==(const common_arg& other) const { + if (args.empty() || other.args.empty()) { + return false; + } + return strcmp(args[0], other.args[0]) == 0; + } }; struct common_params_context { @@ -76,7 +93,11 @@ struct common_params_context { // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); -// function to be used by test-arg-parser +// parse input arguments from CLI into a map +// TODO: support repeated args in the future +bool common_params_parse(int argc, char ** argv, llama_example ex, std::map & out_map); + +// initialize argument parser context - used by test-arg-parser and preset common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); // Get environment variable name for a CLI flag (e.g. 
"--ctx-size" -> "LLAMA_ARG_CTX_SIZE") diff --git a/common/preset.cpp b/common/preset.cpp new file mode 100644 index 00000000000..7c050728f7b --- /dev/null +++ b/common/preset.cpp @@ -0,0 +1,186 @@ +#include "arg.h" +#include "preset.h" +#include "peg-parser.h" +#include "log.h" + +#include +#include +#include + +static std::string rm_leading_dashes(const std::string & str) { + size_t pos = 0; + while (pos < str.size() && str[pos] == '-') { + ++pos; + } + return str.substr(pos); +} + +std::vector common_preset::to_args() const { + std::vector args; + + for (const auto & [opt, value] : options) { + args.push_back(opt.args.back()); // use the last arg as the main arg + if (opt.value_hint != nullptr) { + // single value + args.push_back(value); + } + if (opt.value_hint_2 != nullptr) { + throw std::runtime_error(string_format( + "common_preset::to_args(): option '%s' has two values, which is not supported yet", + opt.args.back() + )); + } + } + + return args; +} + +std::string common_preset::to_ini() const { + std::ostringstream ss; + + ss << "[" << name << "]\n"; + for (const auto & [opt, value] : options) { + auto espaced_value = value; + string_replace_all(espaced_value, "\n", "\\\n"); + ss << rm_leading_dashes(opt.args.back()) << " = "; + ss << espaced_value << "\n"; + } + ss << "\n"; + + return ss.str(); +} + +static std::map> parse_ini_from_file(const std::string & path) { + std::map> parsed; + + if (!std::filesystem::exists(path)) { + return parsed; // return empty if file does not exist (expected behavior) + } + + std::ifstream file(path); + if (!file.good()) { + throw std::runtime_error("failed to open server config file: " + path); + } + + std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + + static const auto parser = build_peg_parser([](auto & p) { + // newline ::= "\r\n" / "\n" / "\r" + auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r")); + + // ws ::= [ \t]* + auto ws = p.rule("ws", 
p.chars("[ \t]", 0, -1)); + + // comment ::= [;#] (!newline .)* + auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any())); + + // eol ::= ws comment? (newline / EOF) + auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end())); + + // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]* + auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1)); + + // value ::= (!eol-start .)* + auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end())); + auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any())); + + // header-line ::= "[" ws ident ws "]" eol + auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol); + + // kv-line ::= ident ws "=" ws value eol + auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol); + + // comment-line ::= ws comment (newline / EOF) + auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end())); + + // blank-line ::= ws (newline / EOF) + auto blank_line = p.rule("blank-line", ws + (newline | p.end())); + + // line ::= header-line / kv-line / comment-line / blank-line + auto line = p.rule("line", header_line | kv_line | comment_line | blank_line); + + // ini ::= line* EOF + auto ini = p.rule("ini", p.zero_or_more(line) + p.end()); + + return ini; + }); + + common_peg_parse_context ctx(contents); + const auto result = parser.parse(ctx); + if (!result.success()) { + throw std::runtime_error("failed to parse server config file: " + path); + } + + std::string current_section = COMMON_PRESET_DEFAULT_NAME; + std::string current_key; + + ctx.ast.visit(result, [&](const auto & node) { + if (node.tag == "section-name") { + const std::string section = std::string(node.text); + current_section = section; + parsed[current_section] = {}; + } else if (node.tag == "key") { + const std::string key = std::string(node.text); + 
current_key = key; + } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) { + parsed[current_section][current_key] = std::string(node.text); + current_key.clear(); + } + }); + + return parsed; +} + +static std::map get_map_key_opt(common_params_context & ctx_params) { + std::map mapping; + for (const auto & opt : ctx_params.options) { + if (opt.env != nullptr) { + mapping[opt.env] = opt; + } + for (const auto & arg : opt.args) { + mapping[rm_leading_dashes(arg)] = opt; + } + } + return mapping; +} + +common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) { + common_presets out; + auto key_to_opt = get_map_key_opt(ctx_params); + auto ini_data = parse_ini_from_file(path); + + for (auto section : ini_data) { + common_preset preset; + if (section.first.empty()) { + preset.name = COMMON_PRESET_DEFAULT_NAME; + } else { + preset.name = section.first; + } + LOG_DBG("loading preset: %s\n", preset.name.c_str()); + for (const auto & [key, value] : section.second) { + LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str()); + if (key_to_opt.find(key) != key_to_opt.end()) { + preset.options[key_to_opt[key]] = value; + LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str()); + } else { + // TODO: maybe warn about unknown key? 
+ } + } + out[preset.name] = preset; + } + + return out; +} + +void common_presets_save(const std::string & path, const common_presets & presets) { + std::ofstream file(path); + if (!file.good()) { + throw std::runtime_error("failed to open preset file for writing: " + path); + } + + file << "version = 1\n\n"; + + for (const auto & it : presets) { + file << it.second.to_ini(); + } +} diff --git a/common/preset.h b/common/preset.h new file mode 100644 index 00000000000..d200bdce809 --- /dev/null +++ b/common/preset.h @@ -0,0 +1,33 @@ +#pragma once + +#include "common.h" +#include "arg.h" + +#include +#include +#include + +// +// INI preset parser and writer +// + +constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default"; + +struct common_preset { + std::string name; + // TODO: support repeated args in the future + std::map options; + + // convert preset to CLI argument list + std::vector to_args() const; + + // convert preset to INI format string + std::string to_ini() const; + + // TODO: maybe implement to_env() if needed +}; + +// interface for multiple presets in one file +using common_presets = std::map; +common_presets common_presets_load(const std::string & path, common_params_context & ctx_params); +void common_presets_save(const std::string & path, const common_presets & presets); diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 14459245a95..ae1a497be6d 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -38,8 +38,6 @@ set(TARGET_SRCS server-http.h server-models.cpp server-models.h - server-config.cpp - server-config.h server-task.cpp server-task.h server-queue.cpp diff --git a/tools/server/README.md b/tools/server/README.md index 089ef5a0cbf..f98fb44c7bc 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1383,8 +1383,6 @@ Alternatively, you can point the router to a local directory containing your GGU llama-server --models-dir ./models_directory ``` -The directory is scanned 
recursively, so nested vendor/model layouts such as `vendor_name/model_name/*.gguf` are supported. The model name in the router UI matches the relative path inside `--models-dir` (for example, `vendor_name/model_name`). - If the model contains multiple GGUF (for multimodal or multi-shard), files should be put into a subdirectory. The directory structure should look like this: ```sh diff --git a/tools/server/server-config.cpp b/tools/server/server-config.cpp deleted file mode 100644 index f6c8746c7ce..00000000000 --- a/tools/server/server-config.cpp +++ /dev/null @@ -1,308 +0,0 @@ -#include "server-config.h" - -#include "peg-parser.h" -#include "arg.h" - -#include -#include -#include -#include -#include - -namespace { - -bool is_option(const std::string & arg) { - return !arg.empty() && arg[0] == '-'; -} - -bool is_implicit_value(const std::vector & args, size_t index) { - return index + 1 < args.size() && !is_option(args[index + 1]); -} - -std::string relativize(const std::string & path, const std::string & base) { - if (path.empty()) { - return path; - } - - std::error_code ec; - const auto abs_path = std::filesystem::absolute(path, ec); - if (ec) { - return path; - } - const auto abs_base = std::filesystem::absolute(base, ec); - if (ec) { - return path; - } - - const auto rel = std::filesystem::relative(abs_path, abs_base, ec); - if (ec) { - return path; - } - - return rel.generic_string(); -} - -} // namespace - -server_config_manager::server_config_manager(const std::string & models_dir) - : models_dir(models_dir) { - if (!models_dir.empty()) { - path = (std::filesystem::path(models_dir) / "config.ini").string(); - } -} - -bool server_config_manager::enabled() const { - return !models_dir.empty(); -} - -void server_config_manager::ensure_loaded() { - if (!enabled()) { - return; - } - - namespace fs = std::filesystem; - - std::lock_guard lock(mutex); - - if (!fs::exists(path)) { - data.clear(); - return; - } - - std::ifstream file(path); - if (!file.good()) { - 
throw std::runtime_error("failed to open server config file: " + path); - } - - std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - - static const auto parser = build_peg_parser([](auto & p) { - // newline ::= "\r\n" / "\n" / "\r" - auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r")); - - // ws ::= [ \t]* - auto ws = p.rule("ws", p.chars("[ \t]", 0, -1)); - - // comment ::= [;#] (!newline .)* - auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any())); - - // eol ::= ws comment? (newline / EOF) - auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end())); - - // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]* - auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1)); - - // value ::= (!eol-start .)* - auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end())); - auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any())); - - // header-line ::= "[" ws ident ws "]" eol - auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol); - - // kv-line ::= ident ws "=" ws value eol - auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol); - - // comment-line ::= ws comment (newline / EOF) - auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end())); - - // blank-line ::= ws (newline / EOF) - auto blank_line = p.rule("blank-line", ws + (newline | p.end())); - - // line ::= header-line / kv-line / comment-line / blank-line - auto line = p.rule("line", header_line | kv_line | comment_line | blank_line); - - // ini ::= line* EOF - auto ini = p.rule("ini", p.zero_or_more(line) + p.end()); - - return ini; - }); - - common_peg_parse_context ctx(contents); - const auto result = parser.parse(ctx); - if (!result.success()) { - throw std::runtime_error("failed to parse server 
config file: " + path); - } - - std::map> parsed; - - std::string current_section; - std::string current_key; - - ctx.ast.visit(result, [&](const auto & node) { - if (node.tag == "section-name") { - const std::string section = std::string(node.text); - if (section.rfind("LLAMA_ARG_", 0) == 0) { - current_section.clear(); - return; - } - - current_section = section; - parsed[current_section] = {}; - } else if (node.tag == "key") { - const std::string key = std::string(node.text); - if (key.rfind("LLAMA_ARG_", 0) == 0) { - current_key = key; - } else { - current_key.clear(); - } - } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) { - parsed[current_section][current_key] = std::string(node.text); - current_key.clear(); - } - }); - - data = std::move(parsed); -} - -// write_locked expects the caller to hold `mutex`. -void server_config_manager::write_locked() { - if (!enabled()) { - return; - } - - namespace fs = std::filesystem; - - if (!path.empty()) { - auto parent = fs::path(path).parent_path(); - if (!parent.empty()) { - fs::create_directories(parent); - } - } - - std::ofstream file(path); - file << "LLAMA_CONFIG_VERSION=1\n\n"; - - bool first_section = true; - for (const auto & [section, args] : data) { - if (!first_section) { - file << "\n"; - } - first_section = false; - - file << "[" << section << "]\n"; - for (const auto & [key, value] : args) { - file << key << "="; - if (!value.empty()) { - file << value; - } - file << "\n"; - } - } - - file.flush(); -} - -bool is_router_control_arg(const std::string & arg) { - static const std::set blacklist = { - "--alias", // set per-child in server_models::load - "--models-dir", // router-side discovery only - "--models-max", // router capacity control - "--no-models-autoload", // router autoload policy - "--port", // router port differs from child port - "-m", "--model", // model path supplied per-child - "-hf", "--hf-file" // model source supplied per-child - }; - return 
blacklist.count(arg) != 0; -} - -void server_config_manager::sync(const std::vector & models, const std::vector & base_args) { - if (!enabled()) { - return; - } - - ensure_loaded(); - - std::map router_args; - - for (size_t i = 1; i < base_args.size(); ++i) { // skip argv[0] - const auto & arg = base_args[i]; - if (!is_option(arg)) { - continue; - } - - if (is_router_control_arg(arg)) { - if (is_implicit_value(base_args, i)) { - ++i; - } - continue; - } - - std::string value = "true"; - if (is_implicit_value(base_args, i)) { - value = base_args[i + 1]; - ++i; - } - - const auto env_name = common_arg_get_env_name(arg); - if (!env_name.empty()) { - router_args[env_name] = value; - } - } - - std::lock_guard lock(mutex); - - bool changed = !std::filesystem::exists(path); - - const auto model_key = common_arg_get_env_name("--model"); - const auto model_alias = common_arg_get_env_name("-m"); - const auto mmproj_key = common_arg_get_env_name("--mmproj"); - - const std::vector model_keys = { - model_key, - model_alias, - "LLAMA_ARG_MODEL", - }; - - const std::vector mmproj_keys = { - mmproj_key, - "LLAMA_ARG_MMPROJ", - }; - - for (const auto & model : models) { - auto & section = data[model.name]; - - const auto has_any_key = [](const auto & section_map, const std::vector & keys) { - for (const auto & key : keys) { - if (!key.empty() && section_map.find(key) != section_map.end()) { - return true; - } - } - return false; - }; - - if (!model_key.empty() && !has_any_key(section, model_keys)) { - section[model_key] = relativize(model.path, models_dir); - changed = true; - } - - if (!model.path_mmproj.empty() && !mmproj_key.empty() && !has_any_key(section, mmproj_keys)) { - section[mmproj_key] = relativize(model.path_mmproj, models_dir); - changed = true; - } - - for (const auto & router_arg : router_args) { - if (section.find(router_arg.first) == section.end()) { - section[router_arg.first] = router_arg.second; - changed = true; - } - } - } - - if (changed) { - write_locked(); 
- } -} - -std::map server_config_manager::env_for(const std::string & name) { - if (!enabled()) { - return {}; - } - - ensure_loaded(); - - std::lock_guard lock(mutex); - - auto it = data.find(name); - return it != data.end() ? it->second : std::map{}; -} - diff --git a/tools/server/server-config.h b/tools/server/server-config.h deleted file mode 100644 index 53395890f5f..00000000000 --- a/tools/server/server-config.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "server-common.h" - -#include -#include -#include -#include -#include - -struct server_local_model { - std::string name; - std::string path; - std::string path_mmproj; -}; - -class server_config_manager { -public: - explicit server_config_manager(const std::string & models_dir); - - bool enabled() const; - - void sync(const std::vector & models, const std::vector & base_args); - - std::map env_for(const std::string & name); - -private: - void ensure_loaded(); - void write_locked(); - -private: - std::string path; - std::string models_dir; - std::map> data; - std::mutex mutex; -}; - -bool is_router_control_arg(const std::string & arg); - diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 0a4358e76dc..014a1eee952 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -1,7 +1,7 @@ #include "server-common.h" #include "server-models.h" -#include "server-config.h" +#include "preset.h" #include "download.h" #include // TODO: remove this once we use HTTP client from download.h @@ -12,13 +12,10 @@ #include #include #include -#include #include -#include #include #include #include -#include #ifdef _WIN32 #include @@ -78,150 +75,130 @@ static std::filesystem::path get_server_exec_path() { #endif } -static std::string to_upper_copy(std::string value) { - std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { return (char) std::toupper(c); }); - return value; -} +struct local_model { + std::string name; + std::string path; + 
std::string path_mmproj; +}; -static std::string pick_preferred_mmproj(const std::vector & paths) { - if (paths.empty()) { - return ""; +static std::vector list_local_models(const std::string & dir) { + if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { + throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); } - auto score = [](const std::string & path) { - const auto upper = to_upper_copy(path); - if (upper.find("BF16") != std::string::npos) { - return 3; - } - if (upper.find("F16") != std::string::npos) { - return 2; + std::vector models; + auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { + auto files = fs_list(subdir_path, false); + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + for (const auto & file : files) { + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; + } + } } - if (upper.find("F32") != std::string::npos) { - return 1; + // single file model + local_model model{ + /* name */ name, + /* path */ first_shard_file.path.empty() ? 
model_file.path : first_shard_file.path, + /* path_mmproj */ mmproj_file.path // can be empty + }; + if (!model.path.empty()) { + models.push_back(model); } - return 0; }; - const auto * best = &paths.front(); - int best_score = score(best->string()); - for (const auto & candidate : paths) { - const int candidate_score = score(candidate.string()); - if (candidate_score > best_score) { - best = &candidate; - best_score = candidate_score; + auto files = fs_list(dir, true); + for (const auto & file : files) { + if (file.is_dir) { + scan_subdir(file.path, file.name); + } else if (string_ends_with(file.name, ".gguf")) { + // single file model + std::string name = file.name; + string_replace_all(name, ".gguf", ""); + local_model model{ + /* name */ name, + /* path */ file.path, + /* path_mmproj */ "" + }; + models.push_back(model); } } - - return best->string(); + return models; } -static std::vector list_local_models(const std::string & dir) { - if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { - throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); - } - - std::vector models; - - struct dir_model_files { - std::vector model_files; - std::vector mmproj_files; - }; - - std::map model_directories; - - for (const auto & entry : std::filesystem::recursive_directory_iterator( - dir, std::filesystem::directory_options::skip_permission_denied)) { - if (!entry.is_regular_file()) { - continue; - } - - const auto & path = entry.path(); - if (!string_ends_with(path.filename().string(), ".gguf")) { - continue; - } - - auto & files = model_directories[path.parent_path()]; - const auto filename = path.filename().string(); - if (filename.find("mmproj") != std::string::npos) { - files.mmproj_files.push_back(path); - continue; - } - - if (filename.find("-00001-of-") != std::string::npos) { - files.model_files.push_back(path); - continue; - } +// +// server_presets +// - // skip shards that aren't the first 
chunk - if (filename.find("-000") != std::string::npos && filename.find("-of-") != std::string::npos) { - continue; - } - files.model_files.push_back(path); +server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir) + : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) { + if (!models_dir.empty()) { + auto presets_path = models_dir + DIRECTORY_SEPARATOR + "presets.ini"; + presets = common_presets_load(presets_path, ctx_params); + SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); } - for (const auto & [parent_path, files] : model_directories) { - if (files.model_files.empty()) { - continue; - } + common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args); - std::string preferred_mmproj = pick_preferred_mmproj(files.mmproj_files); - - const auto * best_model = &files.model_files.front(); - std::uintmax_t best_size = std::numeric_limits::max(); - for (const auto & candidate : files.model_files) { - std::error_code size_ec; - const auto size = std::filesystem::file_size(candidate, size_ec); - if (size_ec) { - continue; - } - if (best_size == std::numeric_limits::max() || size < best_size) { - best_model = &candidate; - best_size = size; - } + // populate reserved args (will be appended by the router) + for (auto & opt : ctx_params.options) { + if (opt.env == nullptr) { + continue; } - - std::error_code ec; - auto rel_parent = std::filesystem::relative(parent_path, dir, ec); - std::string name; - if (!ec && !rel_parent.empty() && rel_parent.string() != ".") { - name = rel_parent.generic_string(); - } else { - name = parent_path.filename().generic_string(); + std::string env = opt.env; + if (env == "LLAMA_ARG_PORT" || + env == "LLAMA_ARG_HOST" || + env == "LLAMA_ARG_ALIAS" || + env == "LLAMA_ARG_API_KEY" || + env == "LLAMA_ARG_MODELS_DIR" || + env == "LLAMA_ARG_MODELS_MAX" || + env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" || + env == "LLAMA_ARG_MODEL" || + env == 
"LLAMA_ARG_MMPROJ" || + env == "LLAMA_ARG_HF_REPO") { + control_args[env] = opt; } - - server_local_model model{ - /* name */ name, - /* path */ std::filesystem::absolute(*best_model).string(), - /* path_mmproj */ preferred_mmproj.empty() ? "" : std::filesystem::absolute(preferred_mmproj).string() - }; - models.push_back(model); } - - return models; } -static bool is_option(const std::string & arg) { - return !arg.empty() && arg[0] == '-'; +common_preset server_presets::get_preset(const std::string & name) { + auto it = presets.find(name); + if (it != presets.end()) { + return it->second; + } + return common_preset(); } -static std::vector strip_router_control_args(const std::vector & args) { - std::vector filtered; - filtered.reserve(args.size()); - - for (size_t i = 0; i < args.size(); ++i) { - const auto & arg = args[i]; - if (is_router_control_arg(arg)) { - if (i + 1 < args.size() && !is_option(args[i + 1])) { - ++i; - } - continue; +void server_presets::render_args(server_model_meta & meta) { + common_preset preset = meta.preset; // copy + // force removing control args if any + for (auto & cargs : control_args) { + preset.options.erase(cargs.second); + } + // inherit from base args + for (const auto & [arg, value] : base_args) { + preset.options[arg] = value; + } + // set control values + preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port); + preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name; + if (meta.in_cache) { + preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name; + } else { + preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path; + if (!meta.path_mmproj.empty()) { + preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj; } - - filtered.push_back(arg); } - - return filtered; + meta.args = preset.to_args(); } // @@ -232,12 +209,10 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params), server_config(params.models_dir) { + 
char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_dir) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } - - base_args = strip_router_control_args(base_args); for (char ** env = envp; *env != nullptr; env++) { base_env.push_back(std::string(*env)); } @@ -254,6 +229,7 @@ server_models::server_models( auto cached_models = common_list_cached_models(); for (const auto & model : cached_models) { server_model_meta meta{ + /* preset */ presets.get_preset(model.to_string()), /* name */ model.to_string(), /* path */ model.manifest_path, /* path_mmproj */ "", // auto-detected when loading @@ -264,6 +240,7 @@ server_models::server_models( /* args */ std::vector(), /* exit_code */ 0 }; + presets.render_args(meta); // populate meta.args mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), /* th */ std::thread(), @@ -273,13 +250,13 @@ server_models::server_models( // add local models specificed via --models-dir if (!params.models_dir.empty()) { auto local_models = list_local_models(params.models_dir); - server_config.sync(local_models, base_args); for (const auto & model : local_models) { if (mapping.find(model.name) != mapping.end()) { // already exists in cached models, skip continue; } server_model_meta meta{ + /* preset */ presets.get_preset(model.name), /* name */ model.name, /* path */ model.path, /* path_mmproj */ model.path_mmproj, @@ -290,6 +267,7 @@ server_models::server_models( /* args */ std::vector(), /* exit_code */ 0 }; + presets.render_args(meta); // populate meta.args mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), /* th */ std::thread(), @@ -297,6 +275,11 @@ server_models::server_models( }; } } + // log available models + SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); + for (const auto & [name, inst] : mapping) { + SRV_INF(" %c %s\n", inst.meta.preset.name.empty() ? 
' ' : '*', name.c_str()); + } } void server_models::update_meta(const std::string & name, const server_model_meta & meta) { @@ -430,25 +413,7 @@ void server_models::unload_lru() { } } -static void add_or_replace_arg(std::vector & args, const std::string & key, const std::string & value) { - for (size_t i = 0; i < args.size();) { - if (args[i] == key) { - args.erase(args.begin() + i); - if (i < args.size() && !is_option(args[i])) { - args.erase(args.begin() + i); - } - } else { - ++i; - } - } - - args.push_back(key); - if (!value.empty()) { - args.push_back(value); - } -} - -void server_models::load(const std::string & name, bool auto_load) { +void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } @@ -477,43 +442,8 @@ void server_models::load(const std::string & name, bool auto_load) { { SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); - std::vector child_args; - if (auto_load && !meta.args.empty()) { - child_args = strip_router_control_args(meta.args); // copy previous args minus router-only flags - } else { - child_args.push_back(base_args[0]); - if (inst.meta.in_cache) { - child_args.push_back("-hf"); - child_args.push_back(inst.meta.name); - } else { - child_args.push_back("-m"); - child_args.push_back(inst.meta.path); - if (!inst.meta.path_mmproj.empty()) { - child_args.push_back("--mmproj"); - child_args.push_back(inst.meta.path_mmproj); - } - } - - child_args.push_back("--port"); - child_args.push_back(std::to_string(inst.meta.port)); - - child_args.push_back("--alias"); - child_args.push_back(inst.meta.name); - } - - std::vector child_env = base_env; // copy - auto config_env = server_config.env_for(inst.meta.name); - for (const auto & [key, value] : config_env) { - if (value == "false") { - continue; - } - - if (value == "true" || value.empty()) { - child_env.push_back(key + "="); - } else { - 
child_env.push_back(key + "=" + value); - } - } + std::vector child_args = inst.meta.args; // copy + std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); SRV_INF("%s", "spawning server instance with args:\n"); @@ -659,7 +589,7 @@ bool server_models::ensure_model_loaded(const std::string & name) { } if (meta->status == SERVER_MODEL_STATUS_UNLOADED) { SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); - load(name, true); + load(name); } SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str()); @@ -842,38 +772,6 @@ void server_models_routes::init_routes() { return models.proxy_request(req, method, name, true); // update last usage for POST request only }; - this->get_router_models = [this](const server_http_req &) { - auto res = std::make_unique(); - json models_json = json::array(); - auto all_models = models.get_all_meta(); - std::time_t t = std::time(0); - for (const auto & meta : all_models) { - json status { - {"value", server_model_status_to_string(meta.status)}, - {"args", meta.args}, - }; - if (meta.is_failed()) { - status["exit_code"] = meta.exit_code; - status["failed"] = true; - } - models_json.push_back(json { - {"id", meta.name}, - {"object", "model"}, // for OAI-compat - {"owned_by", "llamacpp"}, // for OAI-compat - {"created", t}, // for OAI-compat - {"in_cache", meta.in_cache}, - {"path", meta.path}, - {"status", status}, - // TODO: add other fields, may require reading GGUF metadata - }); - } - res_ok(res, { - {"data", models_json}, - {"object", "list"}, - }); - return res; - }; - this->post_router_models_load = [this](const server_http_req & req) { auto res = std::make_unique(); json body = json::parse(req.body); @@ -887,7 +785,7 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.load(name, false); + models.load(name); res_ok(res, 
{{"success", true}}); return res; }; @@ -911,9 +809,12 @@ void server_models_routes::init_routes() { std::time_t t = std::time(0); for (const auto & meta : all_models) { json status { - {"value", server_model_status_to_string(meta.status)}, - {"args", meta.args}, + {"value", server_model_status_to_string(meta.status)}, + {"args", meta.args}, }; + if (!meta.preset.name.empty()) { + status["preset"] = meta.preset.to_ini(); + } if (meta.is_failed()) { status["exit_code"] = meta.exit_code; status["failed"] = true; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 725e059b87c..ae0ef78ce3f 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -1,8 +1,8 @@ #pragma once #include "common.h" +#include "preset.h" #include "server-http.h" -#include "server-config.h" #include #include @@ -48,6 +48,7 @@ static std::string server_model_status_to_string(server_model_status status) { } struct server_model_meta { + common_preset preset; std::string name; std::string path; std::string path_mmproj; // only available if in_cache=false @@ -55,7 +56,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - std::vector args; // additional args passed to the model instance (used for debugging) + std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) bool is_active() const { @@ -67,6 +68,19 @@ struct server_model_meta { } }; +// the server_presets struct holds the presets read from presets.ini +// as well as base args from the router server +struct server_presets { + common_presets presets; + common_params_context ctx_params; + std::map base_args; + std::map control_args; // args reserved for server control + + server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir); + 
common_preset get_preset(const std::string & name); + void render_args(server_model_meta & meta); +}; + struct subprocess_s; struct server_models { @@ -86,7 +100,7 @@ struct server_models { std::vector base_args; std::vector base_env; - server_config_manager server_config; + server_presets presets; void update_meta(const std::string & name, const server_model_meta & meta); @@ -105,8 +119,7 @@ struct server_models { // return a copy of all model metadata std::vector get_all_meta(); - // if auto_load is true, load the model with previous args if any - void load(const std::string & name, bool auto_load); + void load(const std::string & name); void unload(const std::string & name); void unload_all(); From a7c7aca6e21cccf86e5ace74122e2371bdbe2345 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 18:59:09 +0100 Subject: [PATCH 07/19] fix compile --- common/arg.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common/arg.h b/common/arg.h index a0fef251d27..78961328cac 100644 --- a/common/arg.h +++ b/common/arg.h @@ -6,6 +6,7 @@ #include #include #include +#include // // CLI argument parsing From e5c3c4712fc272775641b83372332460a0df2fa6 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 20:47:57 +0100 Subject: [PATCH 08/19] cont --- common/arg.cpp | 7 +++++++ common/common.h | 7 ++++--- common/preset.cpp | 4 ++-- tools/server/server-models.cpp | 15 +++++++++++---- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5f181b343bc..318cdbb1289 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2610,6 +2610,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.models_dir = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR")); + add_opt(common_arg( + {"--models-preset"}, "PATH", + "path to INI file containing model presets for the router server (default: disabled)", + [](common_params & params, const std::string & value) { 
+ params.models_preset = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET")); add_opt(common_arg( {"--models-max"}, "N", string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max), diff --git a/common/common.h b/common/common.h index ad79f5b425c..6119adcc0f8 100644 --- a/common/common.h +++ b/common/common.h @@ -484,9 +484,10 @@ struct common_params { bool endpoint_metrics = false; // router server configs - std::string models_dir = ""; // directory containing models for the router server - int models_max = 4; // maximum number of models to load simultaneously - bool models_autoload = true; // automatically load models when requested via the router server + std::string models_dir = ""; // directory containing models for the router server + std::string models_preset = ""; // directory containing model presets for the router server + int models_max = 4; // maximum number of models to load simultaneously + bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/common/preset.cpp b/common/preset.cpp index 7c050728f7b..7000ff96c2c 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -54,12 +54,12 @@ static std::map> parse_ini_from_ std::map> parsed; if (!std::filesystem::exists(path)) { - return parsed; // return empty if file does not exist (expected behavior) + throw std::runtime_error("preset file does not exist: " + path); } std::ifstream file(path); if (!file.good()) { - throw std::runtime_error("failed to open server config file: " + path); + throw std::runtime_error("failed to open server preset file: " + path); } std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 014a1eee952..a1c7efccca7 100644 --- a/tools/server/server-models.cpp +++ 
b/tools/server/server-models.cpp @@ -138,10 +138,9 @@ static std::vector list_local_models(const std::string & dir) { // -server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir) +server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path) : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) { - if (!models_dir.empty()) { - auto presets_path = models_dir + DIRECTORY_SEPARATOR + "presets.ini"; + if (!presets_path.empty()) { presets = common_presets_load(presets_path, ctx_params); SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); } @@ -167,6 +166,14 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para control_args[env] = opt; } } + + // remove any router-controlled args from base_args + for (const auto & cargs : control_args) { + auto it = base_args.find(cargs.second); + if (it != base_args.end()) { + base_args.erase(it); + } + } } common_preset server_presets::get_preset(const std::string & name) { @@ -209,7 +216,7 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_dir) { + char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } From 7b96207166a7d45f7de979eec9ffcb443fe32741 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 21:42:16 +0100 Subject: [PATCH 09/19] allow custom-path models --- tools/server/README.md | 50 ++++++++++++++++ tools/server/server-models.cpp | 105 ++++++++++++++++++++++++++------- tools/server/server-models.h | 5 ++ 3 files changed, 138 insertions(+), 22 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index f98fb44c7bc..649f1e7ca2b 100644 --- a/tools/server/README.md +++ 
b/tools/server/README.md @@ -1369,6 +1369,11 @@ llama-server ### Model sources +There are 3 possible sources for model files: +1. Cached models (controlled by the `LLAMA_CACHE` environment variable) +2. Custom model directory (set via the `--models-dir` argument) +3. Custom preset (set via the `--models-preset` argument) + By default, the router looks for models in the cache. You can add Hugging Face models to the cache with: ```sh @@ -1413,6 +1418,51 @@ llama-server -ctx 8192 -n 1024 -np 2 Note: model instances inherit both command line arguments and environment variables from the router server. +Alternatively, you can also add a GGUF-based preset (see the next section) + +### Model presets + +Model presets allow advanced users to define custom configurations using an `.ini` file: + +```sh +llama-server --models-preset ./my-models.ini +``` + +Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`. + +Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys. 
+ +Example: + +```ini +version = 1 + +; If the key corresponds to an existing model on the server, +; this will be used as the default config for that model +[ggml-org/MY-MODEL-GGUF:Q8_0] +; string value +chat-template = chatml +; numeric value +n-gpu-layer = 123 +; boolean value +jinja = false +; shorthand argument (for example, context size) +c = 4096 +; environment variable name +LLAMA_ARG_CACHE_RAM = 0 +; file paths are relative to server's CWD +model-draft = ./my-models/draft.gguf +; but it's RECOMMENDED to use absolute path +model-draft = /Users/abc/my-models/draft.gguf + +; If the key does NOT correspond to an existing model, +; you need to specify at least the model path +[custom_model] +model = /Users/abc/my-awesome-model-Q4_K_M.gguf +``` + +Note: some arguments are controlled by router (e.g., host, port, API key, HF repo, model alias). They will be removed or overwritten upon loading. + +### Routing requests Requests are routed according to the requested model name. diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index a1c7efccca7..18e21c00d66 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -145,8 +145,6 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); } - common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args); - // populate reserved args (will be appended by the router) for (auto & opt : ctx_params.options) { if (opt.env == nullptr) { @@ -159,14 +157,17 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para env == "LLAMA_ARG_API_KEY" || env == "LLAMA_ARG_MODELS_DIR" || env == "LLAMA_ARG_MODELS_MAX" || - env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" || env == "LLAMA_ARG_MODEL" || env == "LLAMA_ARG_MMPROJ" || - env == "LLAMA_ARG_HF_REPO") { + env == "LLAMA_ARG_HF_REPO" || + env == "LLAMA_ARG_NO_MODELS_AUTOLOAD") { control_args[env] = opt; } } + // 
read base args from router's argv + common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args); + // remove any router-controlled args from base_args for (const auto & cargs : control_args) { auto it = base_args.find(cargs.second); @@ -186,14 +187,21 @@ common_preset server_presets::get_preset(const std::string & name) { void server_presets::render_args(server_model_meta & meta) { common_preset preset = meta.preset; // copy + // merging 3 kinds of args: + // 1. model-specific args (from preset) // force removing control args if any for (auto & cargs : control_args) { - preset.options.erase(cargs.second); + if (preset.options.find(cargs.second) != preset.options.end()) { + SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", preset.name.c_str(), cargs.second.args[0]); + preset.options.erase(cargs.second); + } } + // 2. base args (from router) // inherit from base args for (const auto & [arg, value] : base_args) { preset.options[arg] = value; } + // 3. control args (from router) // set control values preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port); preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name; @@ -231,8 +239,54 @@ server_models::server_models( LOG_WRN("failed to get server executable path: %s\n", e.what()); LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str()); } - // TODO: allow refreshing cached model list - // add cached models + load_models(); +} + +void server_models::add_model(server_model_meta && meta) { + if (mapping.find(meta.name) != mapping.end()) { + throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str())); + } + presets.render_args(meta); // populate meta.args + std::string name = meta.name; + mapping[name] = instance_t{ + /* subproc */ std::make_shared(), + /* th */ std::thread(), + /* meta */ std::move(meta) + }; +} + +static std::vector list_custom_path_models(server_presets & presets) { + // detect any custom-path models in presets + 
std::vector custom_models; + for (auto & [model_name, preset] : presets.presets) { + local_model model; + model.name = model_name; + std::vector to_erase; + for (auto & [arg, value] : preset.options) { + std::string env(arg.env ? arg.env : ""); + if (env == "LLAMA_ARG_MODEL") { + model.path = value; + to_erase.push_back(arg); + } + if (env == "LLAMA_ARG_MMPROJ") { + model.path_mmproj = value; + to_erase.push_back(arg); + } + } + for (auto & arg : to_erase) { + preset.options.erase(arg); + } + if (!model.name.empty() && !model.path.empty()) { + custom_models.push_back(model); + } + } + return custom_models; +} + +// TODO: allow refreshing cached model list +void server_models::load_models() { + // loading models from 3 sources: + // 1. cached models auto cached_models = common_list_cached_models(); for (const auto & model : cached_models) { server_model_meta meta{ @@ -247,16 +301,11 @@ server_models::server_models( /* args */ std::vector(), /* exit_code */ 0 }; - presets.render_args(meta); // populate meta.args - mapping[meta.name] = instance_t{ - /* subproc */ std::make_shared(), - /* th */ std::thread(), - /* meta */ meta - }; + add_model(std::move(meta)); } - // add local models specificed via --models-dir - if (!params.models_dir.empty()) { - auto local_models = list_local_models(params.models_dir); + // 2. local models specificed via --models-dir + if (!base_params.models_dir.empty()) { + auto local_models = list_local_models(base_params.models_dir); for (const auto & model : local_models) { if (mapping.find(model.name) != mapping.end()) { // already exists in cached models, skip @@ -274,14 +323,26 @@ server_models::server_models( /* args */ std::vector(), /* exit_code */ 0 }; - presets.render_args(meta); // populate meta.args - mapping[meta.name] = instance_t{ - /* subproc */ std::make_shared(), - /* th */ std::thread(), - /* meta */ meta - }; + add_model(std::move(meta)); } } + // 3. 
custom-path models specified in presets + auto custom_models = list_custom_path_models(presets); + for (const auto & model : custom_models) { + server_model_meta meta{ + /* preset */ presets.get_preset(model.name), + /* name */ model.name, + /* path */ model.path, + /* path_mmproj */ model.path_mmproj, + /* in_cache */ false, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 + }; + add_model(std::move(meta)); + } // log available models SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); for (const auto & [name, inst] : mapping) { diff --git a/tools/server/server-models.h b/tools/server/server-models.h index ae0ef78ce3f..9cdbbad9b6a 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -107,9 +107,14 @@ struct server_models { // unload least recently used models if the limit is reached void unload_lru(); + // not thread-safe, caller must hold mutex + void add_model(server_model_meta && meta); + public: server_models(const common_params & params, int argc, char ** argv, char ** envp); + void load_models(); + // check if a model instance exists bool has_model(const std::string & name); From b8d8ffee3c71a578de58cefc8bf4bcfd58831fcc Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 21:55:32 +0100 Subject: [PATCH 10/19] add falsey check --- common/arg.cpp | 7 ++++--- common/arg.h | 6 ++++++ common/preset.cpp | 9 ++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 318cdbb1289..f2b460f86dd 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -47,6 +47,7 @@ #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 using json = nlohmann::ordered_json; +using namespace common_arg_utils; static std::initializer_list mmproj_examples = { LLAMA_EXAMPLE_MTMD, @@ -759,15 +760,15 @@ static std::string list_builtin_chat_templates() { return msg.str(); } -static bool 
is_truthy(const std::string & value) { +bool common_arg_utils::is_truthy(const std::string & value) { return value == "on" || value == "enabled" || value == "1"; } -static bool is_falsey(const std::string & value) { +bool common_arg_utils::is_falsey(const std::string & value) { return value == "off" || value == "disabled" || value == "0"; } -static bool is_autoy(const std::string & value) { +bool common_arg_utils::is_autoy(const std::string & value) { return value == "auto" || value == "-1"; } diff --git a/common/arg.h b/common/arg.h index 78961328cac..161d0688cd0 100644 --- a/common/arg.h +++ b/common/arg.h @@ -82,6 +82,12 @@ struct common_arg { } }; +namespace common_arg_utils { + bool is_truthy(const std::string & value); + bool is_falsey(const std::string & value); + bool is_autoy(const std::string & value); +} + struct common_params_context { enum llama_example ex = LLAMA_EXAMPLE_COMMON; common_params & params; diff --git a/common/preset.cpp b/common/preset.cpp index 7000ff96c2c..c07e68b28be 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -20,11 +20,18 @@ std::vector common_preset::to_args() const { for (const auto & [opt, value] : options) { args.push_back(opt.args.back()); // use the last arg as the main arg + if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { + // flag option, no value + if (common_arg_utils::is_falsey(value)) { + // skip the flag + args.pop_back(); + } + } if (opt.value_hint != nullptr) { // single value args.push_back(value); } - if (opt.value_hint_2 != nullptr) { + if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) { throw std::runtime_error(string_format( "common_preset::to_args(): option '%s' has two values, which is not supported yet", opt.args.back() From 0734bbe4ea0fbb5d3ab9e6ff68e2ca04f479beed Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 21:55:24 +0100 Subject: [PATCH 11/19] server: fix router model discovery and child process spawning - Sanitize model names: replace / and \ with _ 
for display - Recursive directory scan with relative path storage - Convert relative paths to absolute when spawning children - Filter router control args from child processes - Refresh args after port assignment for correct port value - Fallback preset lookup for compatibility - Fix missing argv[0]: store server binary path before base_args parsing --- tools/server/server-models.cpp | 144 ++++++++++++++++++++++----------- tools/server/server-models.h | 4 +- 2 files changed, 100 insertions(+), 48 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 18e21c00d66..a60cf82b941 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -7,6 +7,7 @@ #include // TODO: remove this once we use HTTP client from download.h #include +#include #include #include #include @@ -77,57 +78,87 @@ static std::filesystem::path get_server_exec_path() { struct local_model { std::string name; + std::string display_name; std::string path; std::string path_mmproj; }; +static std::string sanitize_model_name(const std::string & name) { + std::string sanitized = name; + string_replace_all(sanitized, "/", "_"); + string_replace_all(sanitized, "\\", "_"); + return sanitized; +} + static std::vector list_local_models(const std::string & dir) { if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); } std::vector models; - auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { - auto files = fs_list(subdir_path, false); - common_file_info model_file; - common_file_info first_shard_file; - common_file_info mmproj_file; - for (const auto & file : files) { - if (string_ends_with(file.name, ".gguf")) { - if (file.name.find("mmproj") != std::string::npos) { - mmproj_file = file; - } else if (file.name.find("-00001-of-") != std::string::npos) { - first_shard_file = file; - } 
else { - model_file = file; + std::function scan_subdir = + [&](const std::string & subdir_path, const std::string & name) { + auto files = fs_list(subdir_path, true); // Need directories for recursion + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + + for (const auto & file : files) { + if (file.is_dir) { + const std::string child_name = name.empty() ? file.name : name + "/" + file.name; + scan_subdir(file.path, child_name); + continue; + } + + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; + } + } + } + + // Convert absolute paths to relative + std::string model_path = first_shard_file.path.empty() ? model_file.path : first_shard_file.path; + if (!model_path.empty()) { + std::error_code ec; + auto rel_path = std::filesystem::relative(model_path, dir, ec); + if (!ec) { + model_path = rel_path.generic_string(); + } + } + + std::string mmproj_path = mmproj_file.path; + if (!mmproj_path.empty()) { + std::error_code ec; + auto rel_path = std::filesystem::relative(mmproj_path, dir, ec); + if (!ec) { + mmproj_path = rel_path.generic_string(); } } - } - // single file model - local_model model{ - /* name */ name, - /* path */ first_shard_file.path.empty() ? 
model_file.path : first_shard_file.path, - /* path_mmproj */ mmproj_file.path // can be empty - }; - if (!model.path.empty()) { - models.push_back(model); - } - }; - auto files = fs_list(dir, true); - for (const auto & file : files) { - if (file.is_dir) { - scan_subdir(file.path, file.name); - } else if (string_ends_with(file.name, ".gguf")) { - // single file model - std::string name = file.name; - string_replace_all(name, ".gguf", ""); local_model model{ - /* name */ name, - /* path */ file.path, - /* path_mmproj */ "" + /* name */ name, + /* display_name */ sanitize_model_name(name), + /* path */ model_path, + /* path_mmproj */ mmproj_path // can be empty }; - models.push_back(model); + if (!model.path.empty()) { + models.push_back(model); + } + }; + + scan_subdir(dir, ""); + + // when scanning the root, the name is empty, so adjust names for models directly under models_dir + for (auto & model : models) { + if (model.name.empty() && !model.path.empty()) { + model.name = std::filesystem::path(model.path).filename().string(); + string_replace_all(model.name, ".gguf", ""); + model.display_name = sanitize_model_name(model.name); } } return models; @@ -138,8 +169,8 @@ static std::vector list_local_models(const std::string & dir) { // -server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path) - : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) { +server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path, const std::string & models_dir) + : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)), models_dir(models_dir) { if (!presets_path.empty()) { presets = common_presets_load(presets_path, ctx_params); SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); @@ -154,6 +185,7 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para if (env == "LLAMA_ARG_PORT" 
|| env == "LLAMA_ARG_HOST" || env == "LLAMA_ARG_ALIAS" || + env == "LLAMA_ARG_MODELS_PRESET" || env == "LLAMA_ARG_API_KEY" || env == "LLAMA_ARG_MODELS_DIR" || env == "LLAMA_ARG_MODELS_MAX" || @@ -208,9 +240,17 @@ void server_presets::render_args(server_model_meta & meta) { if (meta.in_cache) { preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name; } else { - preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path; + std::string model_path = meta.path; + if (!models_dir.empty() && !std::filesystem::path(model_path).is_absolute()) { + model_path = models_dir + "/" + model_path; + } + preset.options[control_args["LLAMA_ARG_MODEL"]] = model_path; if (!meta.path_mmproj.empty()) { - preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj; + std::string mmproj_path = meta.path_mmproj; + if (!models_dir.empty() && !std::filesystem::path(mmproj_path).is_absolute()) { + mmproj_path = models_dir + "/" + mmproj_path; + } + preset.options[control_args["LLAMA_ARG_MMPROJ"]] = mmproj_path; } } meta.args = preset.to_args(); @@ -224,7 +264,7 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) { + char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset, params.models_dir) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } @@ -232,12 +272,13 @@ server_models::server_models( base_env.push_back(std::string(*env)); } GGML_ASSERT(!base_args.empty()); - // set binary path + // Save binary path before base_args is modified by presets parsing try { - base_args[0] = get_server_exec_path().string(); + server_binary_path = get_server_exec_path().string(); } catch (const std::exception & e) { LOG_WRN("failed to get server executable path: %s\n", e.what()); - LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str()); + LOG_WRN("using original argv[0] as fallback: 
%s\n", argv[0]); + server_binary_path = std::string(argv[0]); } load_models(); } @@ -307,13 +348,18 @@ void server_models::load_models() { if (!base_params.models_dir.empty()) { auto local_models = list_local_models(base_params.models_dir); for (const auto & model : local_models) { - if (mapping.find(model.name) != mapping.end()) { + const std::string name = model.display_name; + if (mapping.find(name) != mapping.end()) { // already exists in cached models, skip continue; } + auto preset = presets.get_preset(name); + if (preset.name.empty() && name != model.name) { + preset = presets.get_preset(model.name); + } server_model_meta meta{ - /* preset */ presets.get_preset(model.name), - /* name */ model.name, + /* preset */ preset, + /* name */ name, /* path */ model.path, /* path_mmproj */ model.path_mmproj, /* in_cache */ false, @@ -506,11 +552,15 @@ void server_models::load(const std::string & name) { throw std::runtime_error("failed to get a port number"); } + presets.render_args(inst.meta); + inst.subproc = std::make_shared(); { SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); std::vector child_args = inst.meta.args; // copy + // Insert binary path as argv[0] + child_args.insert(child_args.begin(), server_binary_path); std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 9cdbbad9b6a..2aa93cab742 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -75,8 +75,9 @@ struct server_presets { common_params_context ctx_params; std::map base_args; std::map control_args; // args reserved for server control + std::string models_dir; - server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir); + server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path, const 
std::string & models_dir); common_preset get_preset(const std::string & name); void render_args(server_model_meta & meta); }; @@ -99,6 +100,7 @@ struct server_models { common_params base_params; std::vector base_args; std::vector base_env; + std::string server_binary_path; server_presets presets; From a7baeab410a7053ed0467809bc0eac2f539d242d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 22:14:31 +0100 Subject: [PATCH 12/19] Revert "server: fix router model discovery and child process spawning" This reverts commit e3832b42eeea7fcb108995966c7584479f745857. --- tools/server/server-models.cpp | 144 +++++++++++---------------------- tools/server/server-models.h | 4 +- 2 files changed, 48 insertions(+), 100 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index a60cf82b941..18e21c00d66 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -7,7 +7,6 @@ #include // TODO: remove this once we use HTTP client from download.h #include -#include #include #include #include @@ -78,87 +77,57 @@ static std::filesystem::path get_server_exec_path() { struct local_model { std::string name; - std::string display_name; std::string path; std::string path_mmproj; }; -static std::string sanitize_model_name(const std::string & name) { - std::string sanitized = name; - string_replace_all(sanitized, "/", "_"); - string_replace_all(sanitized, "\\", "_"); - return sanitized; -} - static std::vector list_local_models(const std::string & dir) { if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); } std::vector models; - std::function scan_subdir = - [&](const std::string & subdir_path, const std::string & name) { - auto files = fs_list(subdir_path, true); // Need directories for recursion - common_file_info model_file; - common_file_info first_shard_file; - common_file_info 
mmproj_file; - - for (const auto & file : files) { - if (file.is_dir) { - const std::string child_name = name.empty() ? file.name : name + "/" + file.name; - scan_subdir(file.path, child_name); - continue; - } - - if (string_ends_with(file.name, ".gguf")) { - if (file.name.find("mmproj") != std::string::npos) { - mmproj_file = file; - } else if (file.name.find("-00001-of-") != std::string::npos) { - first_shard_file = file; - } else { - model_file = file; - } - } - } - - // Convert absolute paths to relative - std::string model_path = first_shard_file.path.empty() ? model_file.path : first_shard_file.path; - if (!model_path.empty()) { - std::error_code ec; - auto rel_path = std::filesystem::relative(model_path, dir, ec); - if (!ec) { - model_path = rel_path.generic_string(); - } - } - - std::string mmproj_path = mmproj_file.path; - if (!mmproj_path.empty()) { - std::error_code ec; - auto rel_path = std::filesystem::relative(mmproj_path, dir, ec); - if (!ec) { - mmproj_path = rel_path.generic_string(); + auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { + auto files = fs_list(subdir_path, false); + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + for (const auto & file : files) { + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; } } + } + // single file model + local_model model{ + /* name */ name, + /* path */ first_shard_file.path.empty() ? 
model_file.path : first_shard_file.path, + /* path_mmproj */ mmproj_file.path // can be empty + }; + if (!model.path.empty()) { + models.push_back(model); + } + }; + auto files = fs_list(dir, true); + for (const auto & file : files) { + if (file.is_dir) { + scan_subdir(file.path, file.name); + } else if (string_ends_with(file.name, ".gguf")) { + // single file model + std::string name = file.name; + string_replace_all(name, ".gguf", ""); local_model model{ - /* name */ name, - /* display_name */ sanitize_model_name(name), - /* path */ model_path, - /* path_mmproj */ mmproj_path // can be empty + /* name */ name, + /* path */ file.path, + /* path_mmproj */ "" }; - if (!model.path.empty()) { - models.push_back(model); - } - }; - - scan_subdir(dir, ""); - - // when scanning the root, the name is empty, so adjust names for models directly under models_dir - for (auto & model : models) { - if (model.name.empty() && !model.path.empty()) { - model.name = std::filesystem::path(model.path).filename().string(); - string_replace_all(model.name, ".gguf", ""); - model.display_name = sanitize_model_name(model.name); + models.push_back(model); } } return models; @@ -169,8 +138,8 @@ static std::vector list_local_models(const std::string & dir) { // -server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path, const std::string & models_dir) - : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)), models_dir(models_dir) { +server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path) + : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) { if (!presets_path.empty()) { presets = common_presets_load(presets_path, ctx_params); SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); @@ -185,7 +154,6 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para if (env == "LLAMA_ARG_PORT" 
|| env == "LLAMA_ARG_HOST" || env == "LLAMA_ARG_ALIAS" || - env == "LLAMA_ARG_MODELS_PRESET" || env == "LLAMA_ARG_API_KEY" || env == "LLAMA_ARG_MODELS_DIR" || env == "LLAMA_ARG_MODELS_MAX" || @@ -240,17 +208,9 @@ void server_presets::render_args(server_model_meta & meta) { if (meta.in_cache) { preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name; } else { - std::string model_path = meta.path; - if (!models_dir.empty() && !std::filesystem::path(model_path).is_absolute()) { - model_path = models_dir + "/" + model_path; - } - preset.options[control_args["LLAMA_ARG_MODEL"]] = model_path; + preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path; if (!meta.path_mmproj.empty()) { - std::string mmproj_path = meta.path_mmproj; - if (!models_dir.empty() && !std::filesystem::path(mmproj_path).is_absolute()) { - mmproj_path = models_dir + "/" + mmproj_path; - } - preset.options[control_args["LLAMA_ARG_MMPROJ"]] = mmproj_path; + preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj; } } meta.args = preset.to_args(); @@ -264,7 +224,7 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset, params.models_dir) { + char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } @@ -272,13 +232,12 @@ server_models::server_models( base_env.push_back(std::string(*env)); } GGML_ASSERT(!base_args.empty()); - // Save binary path before base_args is modified by presets parsing + // set binary path try { - server_binary_path = get_server_exec_path().string(); + base_args[0] = get_server_exec_path().string(); } catch (const std::exception & e) { LOG_WRN("failed to get server executable path: %s\n", e.what()); - LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]); - server_binary_path = std::string(argv[0]); + LOG_WRN("using 
original argv[0] as fallback: %s\n", base_args[0].c_str()); } load_models(); } @@ -348,18 +307,13 @@ void server_models::load_models() { if (!base_params.models_dir.empty()) { auto local_models = list_local_models(base_params.models_dir); for (const auto & model : local_models) { - const std::string name = model.display_name; - if (mapping.find(name) != mapping.end()) { + if (mapping.find(model.name) != mapping.end()) { // already exists in cached models, skip continue; } - auto preset = presets.get_preset(name); - if (preset.name.empty() && name != model.name) { - preset = presets.get_preset(model.name); - } server_model_meta meta{ - /* preset */ preset, - /* name */ name, + /* preset */ presets.get_preset(model.name), + /* name */ model.name, /* path */ model.path, /* path_mmproj */ model.path_mmproj, /* in_cache */ false, @@ -552,15 +506,11 @@ void server_models::load(const std::string & name) { throw std::runtime_error("failed to get a port number"); } - presets.render_args(inst.meta); - inst.subproc = std::make_shared(); { SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); std::vector child_args = inst.meta.args; // copy - // Insert binary path as argv[0] - child_args.insert(child_args.begin(), server_binary_path); std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2aa93cab742..9cdbbad9b6a 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -75,9 +75,8 @@ struct server_presets { common_params_context ctx_params; std::map base_args; std::map control_args; // args reserved for server control - std::string models_dir; - server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path, const std::string & models_dir); + server_presets(int argc, char ** argv, common_params & base_params, const 
std::string & models_dir); common_preset get_preset(const std::string & name); void render_args(server_model_meta & meta); }; @@ -100,7 +99,6 @@ struct server_models { common_params base_params; std::vector base_args; std::vector base_env; - std::string server_binary_path; server_presets presets; From a70419c0115c2393b2ca59f8eb7a4d7a52d1fb2f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 22:16:07 +0100 Subject: [PATCH 13/19] clarify about "no-" prefix --- tools/server/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index 649f1e7ca2b..d6b9b87dcf7 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1444,8 +1444,8 @@ version = 1 chat-template = chatml ; numeric value n-gpu-layer = 123 -; boolean value -jinja = false +; flag value (for certain flags, you need to use the "no-" prefix for negation) +jinja = true ; shorthand argument (for example, context size) c = 4096 ; environment variable name From 97de3114eca2444b12190c32f816eb63156f9a03 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 22:22:28 +0100 Subject: [PATCH 14/19] correct render_args() to include binary path --- tools/server/server-models.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 18e21c00d66..a92c5e06fed 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -214,6 +214,8 @@ void server_presets::render_args(server_model_meta & meta) { } } meta.args = preset.to_args(); + // add back the binary path at the front + meta.args.insert(meta.args.begin(), get_server_exec_path().string()); } // @@ -510,6 +512,8 @@ void server_models::load(const std::string & name) { { SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); + presets.render_args(inst.meta); // update meta.args + std::vector child_args = inst.meta.args; // 
copy std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); From f645e887f23dcda2802abf2ad16a103eb2c4461b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 22:28:35 +0100 Subject: [PATCH 15/19] also remove arg LLAMA_ARG_MODELS_PRESET for child --- tools/server/server-models.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index a92c5e06fed..a823df4e1e3 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -157,6 +157,7 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para env == "LLAMA_ARG_API_KEY" || env == "LLAMA_ARG_MODELS_DIR" || env == "LLAMA_ARG_MODELS_MAX" || + env == "LLAMA_ARG_MODELS_PRESET" || env == "LLAMA_ARG_MODEL" || env == "LLAMA_ARG_MMPROJ" || env == "LLAMA_ARG_HF_REPO" || From 6bda0d47ede87e30074b3bc73c7c5b9d1308512f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 23:52:15 +0100 Subject: [PATCH 16/19] add co-author for ini parser code Co-authored-by: aldehir From 035f56adbc961cf65860c87364f3eb7b5ff8f5e2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 9 Dec 2025 14:43:47 +0100 Subject: [PATCH 17/19] also set LLAMA_ARG_HOST --- tools/server/server-models.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index a823df4e1e3..3c4b0015e7f 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -204,6 +204,7 @@ void server_presets::render_args(server_model_meta & meta) { } // 3. 
control args (from router) // set control values + preset.options[control_args["LLAMA_ARG_HOST"]] = "127.0.0.1"; preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port); preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name; if (meta.in_cache) { From f2ad7dc9db790d7db6744065d764bc2deb4485fa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 9 Dec 2025 17:51:42 +0100 Subject: [PATCH 18/19] add CHILD_ADDR --- tools/server/server-models.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 3c4b0015e7f..6c618a673c9 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -34,6 +34,10 @@ #define CMD_EXIT "exit" +// address for child process, this is needed because router may run on 0.0.0.0 +// ref: https://github.com/ggml-org/llama.cpp/issues/17862 +#define CHILD_ADDR "127.0.0.1" + static std::filesystem::path get_server_exec_path() { #if defined(_WIN32) wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths @@ -204,7 +208,7 @@ void server_presets::render_args(server_model_meta & meta) { } // 3. 
control args (from router) // set control values - preset.options[control_args["LLAMA_ARG_HOST"]] = "127.0.0.1"; + preset.options[control_args["LLAMA_ARG_HOST"]] = CHILD_ADDR; preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port); preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name; if (meta.in_cache) { @@ -693,7 +697,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port); auto proxy = std::make_unique( method, - base_params.hostname, + CHILD_ADDR, meta->port, req.path, req.headers, From b36b3fe1a4404348071d90166a5ce7fb93733dbc Mon Sep 17 00:00:00 2001 From: Pascal Date: Wed, 10 Dec 2025 14:30:04 +0100 Subject: [PATCH 19/19] Remove dead code --- common/arg.cpp | 19 ------------------- common/arg.h | 4 ---- common/preset.cpp | 13 ------------- common/preset.h | 1 - 4 files changed, 37 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index f2b460f86dd..b333f45c96a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -74,17 +74,6 @@ static const std::vector & get_common_arg_defs() { return options; } -std::string common_arg_get_env_name(const std::string & flag) { - for (const auto & arg : get_common_arg_defs()) { - for (const auto & arg_flag : arg.args) { - if (arg_flag == flag) { - return arg.env ? 
arg.env : ""; - } - } - } - return ""; -} - common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = examples; return *this; @@ -3325,11 +3314,3 @@ common_params_context common_params_parser_init(common_params & params, llama_ex return ctx_arg; } - -static std::string rm_leading_dashes(const std::string & str) { - size_t pos = 0; - while (pos < str.size() && str[pos] == '-') { - ++pos; - } - return str.substr(pos); -} diff --git a/common/arg.h b/common/arg.h index 161d0688cd0..219c115e635 100644 --- a/common/arg.h +++ b/common/arg.h @@ -107,10 +107,6 @@ bool common_params_parse(int argc, char ** argv, llama_example ex, std::map "LLAMA_ARG_CTX_SIZE") -// Returns empty string if flag not found -std::string common_arg_get_env_name(const std::string & flag); - struct common_remote_params { std::vector headers; long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout diff --git a/common/preset.cpp b/common/preset.cpp index c07e68b28be..09ac171b720 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -178,16 +178,3 @@ common_presets common_presets_load(const std::string & path, common_params_conte return out; } - -void common_presets_save(const std::string & path, const common_presets & presets) { - std::ofstream file(path); - if (!file.good()) { - throw std::runtime_error("failed to open preset file for writing: " + path); - } - - file << "version = 1\n\n"; - - for (const auto & it : presets) { - file << it.second.to_ini(); - } -} diff --git a/common/preset.h b/common/preset.h index d200bdce809..dceb849eb81 100644 --- a/common/preset.h +++ b/common/preset.h @@ -30,4 +30,3 @@ struct common_preset { // interface for multiple presets in one file using common_presets = std::map; common_presets common_presets_load(const std::string & path, common_params_context & ctx_params); -void common_presets_save(const std::string & path, const common_presets & presets);