From 51be1fae5a2de9ad3d501de812dd72f34b829cf0 Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 4 Dec 2025 20:33:33 +0100 Subject: [PATCH 01/19] llama-server: recursive GGUF loading Replace flat directory scan with recursive traversal using std::filesystem::recursive_directory_iterator. Support for nested vendor/model layouts (e.g. vendor/model/*.gguf). Model name now reflects the relative path within --models-dir instead of just the filename. Aggregate files by parent directory via std::map before constructing local_model --- tools/server/README.md | 2 + tools/server/server-models.cpp | 83 ++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 35 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index f98fb44c7bc..089ef5a0cbf 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1383,6 +1383,8 @@ Alternatively, you can point the router to a local directory containing your GGU llama-server --models-dir ./models_directory ``` +The directory is scanned recursively, so nested vendor/model layouts such as `vendor_name/model_name/*.gguf` are supported. The model name in the router UI matches the relative path inside `--models-dir` (for example, `vendor_name/model_name`). + If the model contains multiple GGUF (for multimodal or multi-shard), files should be put into a subdirectory. 
The directory structure should look like this: ```sh diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 6f88e93c4bb..3cf4b412ff3 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -86,49 +87,61 @@ static std::vector list_local_models(const std::string & dir) { } std::vector models; - auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { - auto files = fs_list(subdir_path, false); + + struct dir_model_files { common_file_info model_file; common_file_info first_shard_file; common_file_info mmproj_file; - for (const auto & file : files) { - if (string_ends_with(file.name, ".gguf")) { - if (file.name.find("mmproj") != std::string::npos) { - mmproj_file = file; - } else if (file.name.find("-00001-of-") != std::string::npos) { - first_shard_file = file; - } else { - model_file = file; - } - } + }; + + std::map model_directories; + + for (const auto & entry : std::filesystem::recursive_directory_iterator( + dir, std::filesystem::directory_options::skip_permission_denied)) { + if (!entry.is_regular_file()) { + continue; } - // single file model - local_model model{ - /* name */ name, - /* path */ first_shard_file.path.empty() ? 
model_file.path : first_shard_file.path, - /* path_mmproj */ mmproj_file.path // can be empty - }; - if (!model.path.empty()) { - models.push_back(model); + + const auto & path = entry.path(); + if (!string_ends_with(path.filename().string(), ".gguf")) { + continue; } - }; - auto files = fs_list(dir, true); - for (const auto & file : files) { - if (file.is_dir) { - scan_subdir(file.path, file.name); - } else if (string_ends_with(file.name, ".gguf")) { - // single file model - std::string name = file.name; - string_replace_all(name, ".gguf", ""); - local_model model{ - /* name */ name, - /* path */ file.path, - /* path_mmproj */ "" - }; - models.push_back(model); + auto & files = model_directories[path.parent_path()]; + const auto filename = path.filename().string(); + if (filename.find("mmproj") != std::string::npos) { + files.mmproj_file = {path.string(), filename, 0, false}; + } else if (filename.find("-00001-of-") != std::string::npos) { + files.first_shard_file = {path.string(), filename, 0, false}; + } else { + files.model_file = {path.string(), filename, 0, false}; + } + } + + for (const auto & [parent_path, files] : model_directories) { + std::string model_path = files.first_shard_file.path.empty() ? 
files.model_file.path : files.first_shard_file.path; + if (model_path.empty()) { + continue; + } + + std::string name; + std::error_code ec; + auto rel_parent = std::filesystem::relative(parent_path, dir, ec); + if (!ec && !rel_parent.empty() && rel_parent.string() != ".") { + name = rel_parent.generic_string(); + } else { + std::filesystem::path model_file_path(model_path); + name = model_file_path.stem().string(); } + + local_model model{ + /* name */ name, + /* path */ model_path, + /* path_mmproj */ files.mmproj_file.path + }; + models.push_back(model); } + return models; } From 972369e81a4842f713ac7099e29cc23746474084 Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 11:06:02 +0100 Subject: [PATCH 02/19] server : router config POC (INI-based per-model settings) --- common/arg.cpp | 20 ++ common/arg.h | 4 + tools/server/CMakeLists.txt | 10 + tools/server/server-config.cpp | 339 +++++++++++++++++++++++++++++++++ tools/server/server-config.h | 40 ++++ tools/server/server-models.cpp | 162 +++++++++++++--- tools/server/server-models.h | 3 + 7 files changed, 547 insertions(+), 31 deletions(-) create mode 100644 tools/server/server-config.cpp create mode 100644 tools/server/server-config.h diff --git a/common/arg.cpp b/common/arg.cpp index 210ef8d6214..6e79225dc6d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -64,6 +64,26 @@ static std::string read_file(const std::string & fname) { return content; } +static const std::vector & get_common_arg_defs() { + static const std::vector options = [] { + common_params params; + auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr); + return ctx.options; + }(); + return options; +} + +std::string common_arg_get_env_name(const std::string & flag) { + for (const auto & arg : get_common_arg_defs()) { + for (const auto & arg_flag : arg.args) { + if (arg_flag == flag) { + return arg.env ? 
arg.env : ""; + } + } + } + return ""; +} + common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = examples; return *this; diff --git a/common/arg.h b/common/arg.h index 7ab7e2cea43..7ebe0cede07 100644 --- a/common/arg.h +++ b/common/arg.h @@ -79,6 +79,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e // function to be used by test-arg-parser common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +// Get environment variable name for a CLI flag (e.g. "--ctx-size" -> "LLAMA_ARG_CTX_SIZE") +// Returns empty string if flag not found +std::string common_arg_get_env_name(const std::string & flag); + struct common_remote_params { std::vector headers; long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index a39b4c5b35f..14459245a95 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -38,6 +38,16 @@ set(TARGET_SRCS server-http.h server-models.cpp server-models.h + server-config.cpp + server-config.h + server-task.cpp + server-task.h + server-queue.cpp + server-queue.h + server-common.cpp + server-common.h + server-context.cpp + server-context.h ) set(PUBLIC_ASSETS index.html.gz diff --git a/tools/server/server-config.cpp b/tools/server/server-config.cpp new file mode 100644 index 00000000000..52dccc6cb27 --- /dev/null +++ b/tools/server/server-config.cpp @@ -0,0 +1,339 @@ +#include "server-config.h" + +#include "peg-parser.h" +#include "arg.h" + +#include +#include +#include +#include +#include +#include + +namespace { + +bool is_option(const std::string & arg) { + return !arg.empty() && arg[0] == '-'; +} + +std::string trim(const std::string & value) { + const auto is_space = [](unsigned char c) { return std::isspace(c) != 0; }; + size_t start = 0; + while (start < value.size() && is_space(value[start])) { + 
++start; + } + size_t end = value.size(); + while (end > start && is_space(value[end - 1])) { + --end; + } + return value.substr(start, end - start); +} + +bool is_implicit_value(const std::vector & args, size_t index) { + return index + 1 < args.size() && !is_option(args[index + 1]); +} + +std::string relativize(const std::string & path, const std::string & base) { + if (path.empty()) { + return path; + } + + std::error_code ec; + const auto abs_path = std::filesystem::absolute(path, ec); + if (ec) { + return path; + } + const auto abs_base = std::filesystem::absolute(base, ec); + if (ec) { + return path; + } + + const auto rel = std::filesystem::relative(abs_path, abs_base, ec); + if (ec) { + return path; + } + + return rel.generic_string(); +} + +} // namespace + +server_config_manager::server_config_manager(const std::string & models_dir) + : models_dir(models_dir) { + if (!models_dir.empty()) { + path = (std::filesystem::path(models_dir) / "config.ini").string(); + } +} + +bool server_config_manager::enabled() const { + return !models_dir.empty(); +} + +void server_config_manager::ensure_loaded() { + if (!enabled()) { + return; + } + + namespace fs = std::filesystem; + + std::lock_guard lock(mutex); + + if (!fs::exists(path)) { + data.clear(); + last_write_time = {}; + return; + } + + const auto current_write_time = fs::last_write_time(path); + if (last_write_time == current_write_time) { + return; + } + + std::ifstream file(path); + if (!file.good()) { + throw std::runtime_error("failed to open server config file: " + path); + } + + std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + + static const auto & parser = *new common_peg_arena(build_peg_parser([](common_peg_parser_builder & p) { + const auto ws = p.space(); + const auto new_line = p.choice({p.literal("\r\n"), p.literal("\n"), p.literal("\r")}); + + const auto section_name = p.tag("section-name", p.until("]")); + const auto section_line = p.zero_or_more(ws) + "[" + 
section_name + "]" + p.optional(p.until_one_of({"\r", "\n"})); + + const auto key = p.tag("key", p.until("=")); + const auto value = p.tag("value", p.until_one_of({"\r", "\n"})); + const auto key_value_line = p.zero_or_more(ws) + key + p.zero_or_more(ws) + "=" + p.zero_or_more(ws) + p.optional(value); + + const auto comment = p.choice({p.literal(";"), p.literal("#")}) + p.optional(p.until_one_of({"\r", "\n"})); + const auto comment_line = p.zero_or_more(ws) + comment; + + const auto blank_line = p.zero_or_more(ws) + new_line; + + const auto line = p.choice({ + section_line << p.optional(new_line), + key_value_line << p.optional(new_line), + comment_line << p.optional(new_line), + blank_line, + }); + + return p.rule("ini", p.zero_or_more(line) << p.optional(p.zero_or_more(ws)) << p.end()); + })); + + common_peg_parse_context ctx(contents); + const auto result = parser.parse(ctx); + if (!result.success() || result.end != contents.size()) { + throw std::runtime_error("failed to parse server config file: " + path); + } + + std::map> parsed; + std::string current_section; + std::optional pending_key; + + const auto flush_pending = [&](const std::string & value) { + if (current_section.empty() || !pending_key) { + return; + } + + const auto & key = *pending_key; + if (key.rfind("LLAMA_ARG_", 0) != 0) { + return; + } + + parsed[current_section][key] = value; + }; + + ctx.ast.visit(result, [&](const common_peg_ast_node & node) { + if (node.tag == "section-name") { + if (pending_key) { + flush_pending(""); + pending_key.reset(); + } + + current_section = trim(std::string(node.text)); + return; + } + + if (node.tag == "key") { + if (pending_key) { + flush_pending(""); + } + + pending_key = trim(std::string(node.text)); + return; + } + + if (node.tag == "value") { + if (!pending_key) { + return; + } + + flush_pending(trim(std::string(node.text))); + pending_key.reset(); + return; + } + }); + + if (pending_key) { + flush_pending(""); + } + + data = std::move(parsed); + 
last_write_time = current_write_time; +} + +// write_locked expects the caller to hold `mutex`. +void server_config_manager::write_locked() { + if (!enabled()) { + return; + } + + namespace fs = std::filesystem; + + if (!path.empty()) { + auto parent = fs::path(path).parent_path(); + if (!parent.empty()) { + fs::create_directories(parent); + } + } + + std::ofstream file(path); + file << "LLAMA_CONFIG_VERSION=1\n\n"; + + bool first_section = true; + for (const auto & [section, args] : data) { + if (!first_section) { + file << "\n"; + } + first_section = false; + + file << "[" << section << "]\n"; + for (const auto & [key, value] : args) { + file << key << "="; + if (!value.empty()) { + file << value; + } + file << "\n"; + } + } + + file.flush(); + last_write_time = fs::last_write_time(path); +} + +bool is_router_control_arg(const std::string & arg) { + static const std::set blacklist = { + "--alias", // set per-child in server_models::load + "--models-dir", // router-side discovery only + "--models-max", // router capacity control + "--no-models-autoload", // router autoload policy + "--port", // router port differs from child port + "-m", "--model", // model path supplied per-child + "-hf", "--hf-file" // model source supplied per-child + }; + return blacklist.count(arg) != 0; +} + +void server_config_manager::sync(const std::vector & models, const std::vector & base_args) { + if (!enabled()) { + return; + } + + ensure_loaded(); + + std::map router_args; + + for (size_t i = 1; i < base_args.size(); ++i) { // skip argv[0] + const auto & arg = base_args[i]; + if (!is_option(arg)) { + continue; + } + + if (is_router_control_arg(arg)) { + if (is_implicit_value(base_args, i)) { + ++i; + } + continue; + } + + std::string value = "true"; + if (is_implicit_value(base_args, i)) { + value = base_args[i + 1]; + ++i; + } + + const auto env_name = common_arg_get_env_name(arg); + if (!env_name.empty()) { + router_args[env_name] = value; + } + } + + std::lock_guard lock(mutex); + 
+ bool changed = !std::filesystem::exists(path); + + const auto model_key = common_arg_get_env_name("--model"); + const auto model_alias = common_arg_get_env_name("-m"); + const auto mmproj_key = common_arg_get_env_name("--mmproj"); + + const std::vector model_keys = { + model_key, + model_alias, + "LLAMA_ARG_MODEL", + }; + + const std::vector mmproj_keys = { + mmproj_key, + "LLAMA_ARG_MMPROJ", + }; + + for (const auto & model : models) { + auto & section = data[model.name]; + + const auto has_any_key = [](const auto & section_map, const std::vector & keys) { + for (const auto & key : keys) { + if (!key.empty() && section_map.find(key) != section_map.end()) { + return true; + } + } + return false; + }; + + if (!model_key.empty() && !has_any_key(section, model_keys)) { + section[model_key] = relativize(model.path, models_dir); + changed = true; + } + + if (!model.path_mmproj.empty() && !mmproj_key.empty() && !has_any_key(section, mmproj_keys)) { + section[mmproj_key] = relativize(model.path_mmproj, models_dir); + changed = true; + } + + for (const auto & router_arg : router_args) { + if (section.find(router_arg.first) == section.end()) { + section[router_arg.first] = router_arg.second; + changed = true; + } + } + } + + if (changed) { + write_locked(); + } +} + +std::map server_config_manager::env_for(const std::string & name) { + if (!enabled()) { + return {}; + } + + ensure_loaded(); + + std::lock_guard lock(mutex); + + auto it = data.find(name); + return it != data.end() ? 
it->second : std::map{}; +} + diff --git a/tools/server/server-config.h b/tools/server/server-config.h new file mode 100644 index 00000000000..c8fc7efa83b --- /dev/null +++ b/tools/server/server-config.h @@ -0,0 +1,40 @@ +#pragma once + +#include "server-common.h" + +#include +#include +#include +#include +#include + +struct server_local_model { + std::string name; + std::string path; + std::string path_mmproj; +}; + +class server_config_manager { +public: + explicit server_config_manager(const std::string & models_dir); + + bool enabled() const; + + void sync(const std::vector & models, const std::vector & base_args); + + std::map env_for(const std::string & name); + +private: + void ensure_loaded(); + void write_locked(); + +private: + std::string path; + std::string models_dir; + std::filesystem::file_time_type last_write_time{}; + std::map> data; + std::mutex mutex; +}; + +bool is_router_control_arg(const std::string & arg); + diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 3cf4b412ff3..ae75dd79ebf 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -1,5 +1,6 @@ #include "server-common.h" #include "server-models.h" +#include "server-config.h" #include "download.h" @@ -11,7 +12,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -75,23 +78,53 @@ static std::filesystem::path get_server_exec_path() { #endif } -struct local_model { - std::string name; - std::string path; - std::string path_mmproj; -}; +static std::string to_upper_copy(std::string value) { + std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { return (char) std::toupper(c); }); + return value; +} + +static std::string pick_preferred_mmproj(const std::vector & paths) { + if (paths.empty()) { + return ""; + } + + auto score = [](const std::string & path) { + const auto upper = to_upper_copy(path); + if (upper.find("BF16") != std::string::npos) { + return 3; + } + if 
(upper.find("F16") != std::string::npos) { + return 2; + } + if (upper.find("F32") != std::string::npos) { + return 1; + } + return 0; + }; + + const auto * best = &paths.front(); + int best_score = score(best->string()); + for (const auto & candidate : paths) { + const int candidate_score = score(candidate.string()); + if (candidate_score > best_score) { + best = &candidate; + best_score = candidate_score; + } + } + + return best->string(); +} -static std::vector list_local_models(const std::string & dir) { +static std::vector list_local_models(const std::string & dir) { if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); } - std::vector models; + std::vector models; struct dir_model_files { - common_file_info model_file; - common_file_info first_shard_file; - common_file_info mmproj_file; + std::vector model_files; + std::vector mmproj_files; }; std::map model_directories; @@ -110,34 +143,57 @@ static std::vector list_local_models(const std::string & dir) { auto & files = model_directories[path.parent_path()]; const auto filename = path.filename().string(); if (filename.find("mmproj") != std::string::npos) { - files.mmproj_file = {path.string(), filename, 0, false}; - } else if (filename.find("-00001-of-") != std::string::npos) { - files.first_shard_file = {path.string(), filename, 0, false}; - } else { - files.model_file = {path.string(), filename, 0, false}; + files.mmproj_files.push_back(path); + continue; } + + if (filename.find("-00001-of-") != std::string::npos) { + files.model_files.push_back(path); + continue; + } + + // skip shards that aren't the first chunk + if (filename.find("-000") != std::string::npos && filename.find("-of-") != std::string::npos) { + continue; + } + + files.model_files.push_back(path); } for (const auto & [parent_path, files] : model_directories) { - std::string model_path = 
files.first_shard_file.path.empty() ? files.model_file.path : files.first_shard_file.path; - if (model_path.empty()) { + if (files.model_files.empty()) { continue; } - std::string name; + std::string preferred_mmproj = pick_preferred_mmproj(files.mmproj_files); + + const auto * best_model = &files.model_files.front(); + std::uintmax_t best_size = std::numeric_limits::max(); + for (const auto & candidate : files.model_files) { + std::error_code size_ec; + const auto size = std::filesystem::file_size(candidate, size_ec); + if (size_ec) { + continue; + } + if (best_size == std::numeric_limits::max() || size < best_size) { + best_model = &candidate; + best_size = size; + } + } + std::error_code ec; auto rel_parent = std::filesystem::relative(parent_path, dir, ec); + std::string name; if (!ec && !rel_parent.empty() && rel_parent.string() != ".") { name = rel_parent.generic_string(); } else { - std::filesystem::path model_file_path(model_path); - name = model_file_path.stem().string(); + name = parent_path.filename().generic_string(); } - local_model model{ + server_local_model model{ /* name */ name, - /* path */ model_path, - /* path_mmproj */ files.mmproj_file.path + /* path */ std::filesystem::absolute(*best_model).string(), + /* path_mmproj */ preferred_mmproj.empty() ? 
"" : std::filesystem::absolute(preferred_mmproj).string() }; models.push_back(model); } @@ -145,6 +201,29 @@ static std::vector list_local_models(const std::string & dir) { return models; } +static bool is_option(const std::string & arg) { + return !arg.empty() && arg[0] == '-'; +} + +static std::vector strip_router_control_args(const std::vector & args) { + std::vector filtered; + filtered.reserve(args.size()); + + for (size_t i = 0; i < args.size(); ++i) { + const auto & arg = args[i]; + if (is_router_control_arg(arg)) { + if (i + 1 < args.size() && !is_option(args[i + 1])) { + ++i; + } + continue; + } + + filtered.push_back(arg); + } + + return filtered; +} + // // server_models // @@ -153,10 +232,12 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params) { + char ** envp) : base_params(params), server_config(params.models_dir) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } + + base_args = strip_router_control_args(base_args); for (char ** env = envp; *env != nullptr; env++) { base_env.push_back(std::string(*env)); } @@ -192,6 +273,7 @@ server_models::server_models( // add local models specificed via --models-dir if (!params.models_dir.empty()) { auto local_models = list_local_models(params.models_dir); + server_config.sync(local_models, base_args); for (const auto & model : local_models) { if (mapping.find(model.name) != mapping.end()) { // already exists in cached models, skip @@ -349,15 +431,21 @@ void server_models::unload_lru() { } static void add_or_replace_arg(std::vector & args, const std::string & key, const std::string & value) { - for (size_t i = 0; i < args.size(); i++) { - if (args[i] == key && i + 1 < args.size()) { - args[i + 1] = value; - return; + for (size_t i = 0; i < args.size();) { + if (args[i] == key) { + args.erase(args.begin() + i); + if (i < args.size() && !is_option(args[i])) { + args.erase(args.begin() + i); + } + } else { + ++i; } } 
- // not found, append + args.push_back(key); - args.push_back(value); + if (!value.empty()) { + args.push_back(value); + } } void server_models::load(const std::string & name, bool auto_load) { @@ -391,7 +479,7 @@ void server_models::load(const std::string & name, bool auto_load) { std::vector child_args; if (auto_load && !meta.args.empty()) { - child_args = meta.args; // copy previous args + child_args = strip_router_control_args(meta.args); // copy previous args minus router-only flags } else { child_args = base_args; // copy if (inst.meta.in_cache) { @@ -409,6 +497,18 @@ void server_models::load(const std::string & name, bool auto_load) { add_or_replace_arg(child_args, "--alias", inst.meta.name); std::vector child_env = base_env; // copy + auto config_env = server_config.env_for(inst.meta.name); + for (const auto & [key, value] : config_env) { + if (value == "false") { + continue; + } + + if (value == "true" || value.empty()) { + child_env.push_back(key + "="); + } else { + child_env.push_back(key + "=" + value); + } + } child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); SRV_INF("%s", "spawning server instance with args:\n"); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 526e7488dc9..725e059b87c 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -2,6 +2,7 @@ #include "common.h" #include "server-http.h" +#include "server-config.h" #include #include @@ -85,6 +86,8 @@ struct server_models { std::vector base_args; std::vector base_env; + server_config_manager server_config; + void update_meta(const std::string & name, const server_model_meta & meta); // unload least recently used models if the limit is reached From d564ebf9c9a1e6ccfd196d688cc5ecbf3448062e Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 12:24:02 +0100 Subject: [PATCH 03/19] server: address review feedback from @aldehir and @ngxson PEG parser usage improvements: - Simplify parser instantiation 
(remove arena indirection) - Optimize grammar usage (ws instead of zero_or_more, remove optional wrapping) - Fix last line without newline bug (+ operator instead of <<) - Remove redundant end position check Feature scope: - Remove auto-reload feature (will be separate PR per @ngxson) - Keep config.ini auto-creation and template generation - Preserve per-model customization logic Co-authored-by: aldehir Co-authored-by: ngxson --- tools/server/server-config.cpp | 32 ++++++++++++-------------------- tools/server/server-config.h | 1 - 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/tools/server/server-config.cpp b/tools/server/server-config.cpp index 52dccc6cb27..955ff89af9a 100644 --- a/tools/server/server-config.cpp +++ b/tools/server/server-config.cpp @@ -80,12 +80,6 @@ void server_config_manager::ensure_loaded() { if (!fs::exists(path)) { data.clear(); - last_write_time = {}; - return; - } - - const auto current_write_time = fs::last_write_time(path); - if (last_write_time == current_write_time) { return; } @@ -96,35 +90,35 @@ void server_config_manager::ensure_loaded() { std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - static const auto & parser = *new common_peg_arena(build_peg_parser([](common_peg_parser_builder & p) { + static const auto parser = build_peg_parser([](auto & p) { const auto ws = p.space(); const auto new_line = p.choice({p.literal("\r\n"), p.literal("\n"), p.literal("\r")}); const auto section_name = p.tag("section-name", p.until("]")); - const auto section_line = p.zero_or_more(ws) + "[" + section_name + "]" + p.optional(p.until_one_of({"\r", "\n"})); + const auto section_line = ws + "[" + section_name + "]" + p.until_one_of({"\r", "\n"}); const auto key = p.tag("key", p.until("=")); const auto value = p.tag("value", p.until_one_of({"\r", "\n"})); - const auto key_value_line = p.zero_or_more(ws) + key + p.zero_or_more(ws) + "=" + p.zero_or_more(ws) + p.optional(value); + const auto 
key_value_line = ws + key + ws + "=" + ws + value; - const auto comment = p.choice({p.literal(";"), p.literal("#")}) + p.optional(p.until_one_of({"\r", "\n"})); - const auto comment_line = p.zero_or_more(ws) + comment; + const auto comment = p.choice({p.literal(";"), p.literal("#")}) + p.until_one_of({"\r", "\n"}); + const auto comment_line = ws + comment; - const auto blank_line = p.zero_or_more(ws) + new_line; + const auto blank_line = ws + new_line; const auto line = p.choice({ - section_line << p.optional(new_line), - key_value_line << p.optional(new_line), - comment_line << p.optional(new_line), + section_line + new_line, + key_value_line + new_line, + comment_line + new_line, blank_line, }); - return p.rule("ini", p.zero_or_more(line) << p.optional(p.zero_or_more(ws)) << p.end()); - })); + return p.rule("ini", p.zero_or_more(line) + p.optional(ws) + p.end()); + }); common_peg_parse_context ctx(contents); const auto result = parser.parse(ctx); - if (!result.success() || result.end != contents.size()) { + if (!result.success()) { throw std::runtime_error("failed to parse server config file: " + path); } @@ -181,7 +175,6 @@ void server_config_manager::ensure_loaded() { } data = std::move(parsed); - last_write_time = current_write_time; } // write_locked expects the caller to hold `mutex`. 
@@ -220,7 +213,6 @@ void server_config_manager::write_locked() { } file.flush(); - last_write_time = fs::last_write_time(path); } bool is_router_control_arg(const std::string & arg) { diff --git a/tools/server/server-config.h b/tools/server/server-config.h index c8fc7efa83b..53395890f5f 100644 --- a/tools/server/server-config.h +++ b/tools/server/server-config.h @@ -31,7 +31,6 @@ class server_config_manager { private: std::string path; std::string models_dir; - std::filesystem::file_time_type last_write_time{}; std::map> data; std::mutex mutex; }; From 193bead2684ae14b725a67c672a1ce39a352d69a Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 12:36:01 +0100 Subject: [PATCH 04/19] server: adopt aldehir's line-oriented PEG parser Complete rewrite of INI parser grammar and visitor: - Use p.chars(), p.negate(), p.any() instead of p.until() - Support end-of-line comments (key=value # comment) - Handle EOF without trailing newline correctly - Strict identifier validation ([a-zA-Z_][a-zA-Z0-9_.-]*) - Simplified visitor (no pending state, no trim needed) - Grammar handles whitespace natively via eol rule Business validation preserved: - Reject section names starting with LLAMA_ARG_* - Accept only keys starting with LLAMA_ARG_* - Require explicit section before key-value pairs Co-authored-by: aldehir --- tools/server/server-config.cpp | 123 ++++++++++++++------------------- 1 file changed, 50 insertions(+), 73 deletions(-) diff --git a/tools/server/server-config.cpp b/tools/server/server-config.cpp index 955ff89af9a..f6c8746c7ce 100644 --- a/tools/server/server-config.cpp +++ b/tools/server/server-config.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include namespace { @@ -16,19 +15,6 @@ bool is_option(const std::string & arg) { return !arg.empty() && arg[0] == '-'; } -std::string trim(const std::string & value) { - const auto is_space = [](unsigned char c) { return std::isspace(c) != 0; }; - size_t start = 0; - while (start < value.size() && 
is_space(value[start])) { - ++start; - } - size_t end = value.size(); - while (end > start && is_space(value[end - 1])) { - --end; - } - return value.substr(start, end - start); -} - bool is_implicit_value(const std::vector & args, size_t index) { return index + 1 < args.size() && !is_option(args[index + 1]); } @@ -91,29 +77,44 @@ void server_config_manager::ensure_loaded() { std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); static const auto parser = build_peg_parser([](auto & p) { - const auto ws = p.space(); - const auto new_line = p.choice({p.literal("\r\n"), p.literal("\n"), p.literal("\r")}); + // newline ::= "\r\n" / "\n" / "\r" + auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r")); + + // ws ::= [ \t]* + auto ws = p.rule("ws", p.chars("[ \t]", 0, -1)); + + // comment ::= [;#] (!newline .)* + auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any())); + + // eol ::= ws comment? 
(newline / EOF) + auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end())); - const auto section_name = p.tag("section-name", p.until("]")); - const auto section_line = ws + "[" + section_name + "]" + p.until_one_of({"\r", "\n"}); + // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]* + auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1)); - const auto key = p.tag("key", p.until("=")); - const auto value = p.tag("value", p.until_one_of({"\r", "\n"})); - const auto key_value_line = ws + key + ws + "=" + ws + value; + // value ::= (!eol-start .)* + auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end())); + auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any())); - const auto comment = p.choice({p.literal(";"), p.literal("#")}) + p.until_one_of({"\r", "\n"}); - const auto comment_line = ws + comment; + // header-line ::= "[" ws ident ws "]" eol + auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol); - const auto blank_line = ws + new_line; + // kv-line ::= ident ws "=" ws value eol + auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol); - const auto line = p.choice({ - section_line + new_line, - key_value_line + new_line, - comment_line + new_line, - blank_line, - }); + // comment-line ::= ws comment (newline / EOF) + auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end())); - return p.rule("ini", p.zero_or_more(line) + p.optional(ws) + p.end()); + // blank-line ::= ws (newline / EOF) + auto blank_line = p.rule("blank-line", ws + (newline | p.end())); + + // line ::= header-line / kv-line / comment-line / blank-line + auto line = p.rule("line", header_line | kv_line | comment_line | blank_line); + + // ini ::= line* EOF + auto ini = p.rule("ini", p.zero_or_more(line) + p.end()); + + return ini; }); common_peg_parse_context ctx(contents); @@ -123,57 
+124,33 @@ void server_config_manager::ensure_loaded() { } std::map> parsed; - std::string current_section; - std::optional pending_key; - - const auto flush_pending = [&](const std::string & value) { - if (current_section.empty() || !pending_key) { - return; - } - - const auto & key = *pending_key; - if (key.rfind("LLAMA_ARG_", 0) != 0) { - return; - } - parsed[current_section][key] = value; - }; + std::string current_section; + std::string current_key; - ctx.ast.visit(result, [&](const common_peg_ast_node & node) { + ctx.ast.visit(result, [&](const auto & node) { if (node.tag == "section-name") { - if (pending_key) { - flush_pending(""); - pending_key.reset(); - } - - current_section = trim(std::string(node.text)); - return; - } - - if (node.tag == "key") { - if (pending_key) { - flush_pending(""); - } - - pending_key = trim(std::string(node.text)); - return; - } - - if (node.tag == "value") { - if (!pending_key) { + const std::string section = std::string(node.text); + if (section.rfind("LLAMA_ARG_", 0) == 0) { + current_section.clear(); return; } - flush_pending(trim(std::string(node.text))); - pending_key.reset(); - return; + current_section = section; + parsed[current_section] = {}; + } else if (node.tag == "key") { + const std::string key = std::string(node.text); + if (key.rfind("LLAMA_ARG_", 0) == 0) { + current_key = key; + } else { + current_key.clear(); + } + } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) { + parsed[current_section][current_key] = std::string(node.text); + current_key.clear(); } }); - if (pending_key) { - flush_pending(""); - } - data = std::move(parsed); } From a17f501c40ede930f316f9246da9b9d39dbc14da Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 13:11:12 +0100 Subject: [PATCH 05/19] server: fix CLI/env duplication in child processes Children now receive minimal CLI args (executable, model, port, alias) instead of inheriting all router args. 
Global settings pass through LLAMA_ARG_* environment variables only, eliminating duplicate config warnings. Fixes: Router args like -ngl, -fa were passed both via CLI and env, causing 'will be overwritten' warnings on every child spawn --- tools/server/server-models.cpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index ae75dd79ebf..0a4358e76dc 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -481,20 +481,25 @@ void server_models::load(const std::string & name, bool auto_load) { if (auto_load && !meta.args.empty()) { child_args = strip_router_control_args(meta.args); // copy previous args minus router-only flags } else { - child_args = base_args; // copy + child_args.push_back(base_args[0]); if (inst.meta.in_cache) { - add_or_replace_arg(child_args, "-hf", inst.meta.name); + child_args.push_back("-hf"); + child_args.push_back(inst.meta.name); } else { - add_or_replace_arg(child_args, "-m", inst.meta.path); + child_args.push_back("-m"); + child_args.push_back(inst.meta.path); if (!inst.meta.path_mmproj.empty()) { - add_or_replace_arg(child_args, "--mmproj", inst.meta.path_mmproj); + child_args.push_back("--mmproj"); + child_args.push_back(inst.meta.path_mmproj); } } - } - // set model args - add_or_replace_arg(child_args, "--port", std::to_string(inst.meta.port)); - add_or_replace_arg(child_args, "--alias", inst.meta.name); + child_args.push_back("--port"); + child_args.push_back(std::to_string(inst.meta.port)); + + child_args.push_back("--alias"); + child_args.push_back(inst.meta.name); + } std::vector child_env = base_env; // copy auto config_env = server_config.env_for(inst.meta.name); From 31cb86a28fcac976f693070c0f5674aa910111b5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 18:52:40 +0100 Subject: [PATCH 06/19] add common/preset.cpp --- common/CMakeLists.txt | 2 + common/arg.cpp | 57 +++++- 
common/arg.h | 25 ++- common/preset.cpp | 186 ++++++++++++++++++ common/preset.h | 33 ++++ tools/server/CMakeLists.txt | 2 - tools/server/README.md | 2 - tools/server/server-config.cpp | 308 ------------------------------ tools/server/server-config.h | 39 ---- tools/server/server-models.cpp | 339 ++++++++++++--------------------- tools/server/server-models.h | 23 ++- 11 files changed, 438 insertions(+), 578 deletions(-) create mode 100644 common/preset.cpp create mode 100644 common/preset.h delete mode 100644 tools/server/server-config.cpp delete mode 100644 tools/server/server-config.h diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 377b26846b6..0182767c2b3 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -73,6 +73,8 @@ add_library(${TARGET} STATIC ngram-cache.h peg-parser.cpp peg-parser.h + preset.cpp + preset.h regex-partial.cpp regex-partial.h sampling.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 6e79225dc6d..5f181b343bc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -154,7 +154,7 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -std::string common_arg::to_string() { +std::string common_arg::to_string() const { // params for printing to console const static int n_leading_spaces = 40; const static int n_char_per_line_help = 70; // TODO: detect this based on current console @@ -667,6 +667,53 @@ static void add_rpc_devices(const std::string & servers) { } } +bool common_params_parse(int argc, char ** argv, llama_example ex, std::map & out_map) { + common_params dummy_params; + common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr); + + std::unordered_map arg_to_options; + for (auto & opt : ctx_arg.options) { + for (const auto & arg : opt.args) { + arg_to_options[arg] = &opt; + } + } + + // TODO @ngxson : find a way to deduplicate this code + + // handle command line arguments + auto check_arg = [&](int i) { + if (i+1 >= argc) { + throw 
std::invalid_argument("expected value for argument"); + } + }; + + for (int i = 1; i < argc; i++) { + const std::string arg_prefix = "--"; + + std::string arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); + } + auto opt = *arg_to_options[arg]; + std::string val; + if (opt.value_hint != nullptr) { + // arg with single value + check_arg(i); + val = argv[++i]; + } + if (opt.value_hint_2 != nullptr) { + // TODO: support arg with 2 values + throw std::invalid_argument("error: argument with 2 values is not yet supported\n"); + } + out_map[opt] = val; + } + + return true; +} + bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { auto ctx_arg = common_params_parser_init(params, ex, print_usage); const common_params params_org = ctx_arg.params; // the example can modify the default params @@ -3270,3 +3317,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex return ctx_arg; } + +static std::string rm_leading_dashes(const std::string & str) { + size_t pos = 0; + while (pos < str.size() && str[pos] == '-') { + ++pos; + } + return str.substr(pos); +} diff --git a/common/arg.h b/common/arg.h index 7ebe0cede07..a0fef251d27 100644 --- a/common/arg.h +++ b/common/arg.h @@ -3,6 +3,7 @@ #include "common.h" #include +#include #include #include @@ -24,6 +25,8 @@ struct common_arg { void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; void (*handler_int) (common_params & params, int) = nullptr; + common_arg() = default; + common_arg( const std::initializer_list & args, const char * value_hint, @@ -61,7 +64,21 @@ struct common_arg { bool is_exclude(enum llama_example ex); bool get_value_from_env(std::string & 
output) const; bool has_value_from_env() const; - std::string to_string(); + std::string to_string() const; + + // for using as key in std::map + bool operator<(const common_arg& other) const { + if (args.empty() || other.args.empty()) { + return false; + } + return strcmp(args[0], other.args[0]) < 0; + } + bool operator==(const common_arg& other) const { + if (args.empty() || other.args.empty()) { + return false; + } + return strcmp(args[0], other.args[0]) == 0; + } }; struct common_params_context { @@ -76,7 +93,11 @@ struct common_params_context { // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); -// function to be used by test-arg-parser +// parse input arguments from CLI into a map +// TODO: support repeated args in the future +bool common_params_parse(int argc, char ** argv, llama_example ex, std::map & out_map); + +// initialize argument parser context - used by test-arg-parser and preset common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); // Get environment variable name for a CLI flag (e.g. 
"--ctx-size" -> "LLAMA_ARG_CTX_SIZE") diff --git a/common/preset.cpp b/common/preset.cpp new file mode 100644 index 00000000000..7c050728f7b --- /dev/null +++ b/common/preset.cpp @@ -0,0 +1,186 @@ +#include "arg.h" +#include "preset.h" +#include "peg-parser.h" +#include "log.h" + +#include +#include +#include + +static std::string rm_leading_dashes(const std::string & str) { + size_t pos = 0; + while (pos < str.size() && str[pos] == '-') { + ++pos; + } + return str.substr(pos); +} + +std::vector common_preset::to_args() const { + std::vector args; + + for (const auto & [opt, value] : options) { + args.push_back(opt.args.back()); // use the last arg as the main arg + if (opt.value_hint != nullptr) { + // single value + args.push_back(value); + } + if (opt.value_hint_2 != nullptr) { + throw std::runtime_error(string_format( + "common_preset::to_args(): option '%s' has two values, which is not supported yet", + opt.args.back() + )); + } + } + + return args; +} + +std::string common_preset::to_ini() const { + std::ostringstream ss; + + ss << "[" << name << "]\n"; + for (const auto & [opt, value] : options) { + auto espaced_value = value; + string_replace_all(espaced_value, "\n", "\\\n"); + ss << rm_leading_dashes(opt.args.back()) << " = "; + ss << espaced_value << "\n"; + } + ss << "\n"; + + return ss.str(); +} + +static std::map> parse_ini_from_file(const std::string & path) { + std::map> parsed; + + if (!std::filesystem::exists(path)) { + return parsed; // return empty if file does not exist (expected behavior) + } + + std::ifstream file(path); + if (!file.good()) { + throw std::runtime_error("failed to open server config file: " + path); + } + + std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + + static const auto parser = build_peg_parser([](auto & p) { + // newline ::= "\r\n" / "\n" / "\r" + auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r")); + + // ws ::= [ \t]* + auto ws = p.rule("ws", 
p.chars("[ \t]", 0, -1)); + + // comment ::= [;#] (!newline .)* + auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any())); + + // eol ::= ws comment? (newline / EOF) + auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end())); + + // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]* + auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1)); + + // value ::= (!eol-start .)* + auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end())); + auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any())); + + // header-line ::= "[" ws ident ws "]" eol + auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol); + + // kv-line ::= ident ws "=" ws value eol + auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol); + + // comment-line ::= ws comment (newline / EOF) + auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end())); + + // blank-line ::= ws (newline / EOF) + auto blank_line = p.rule("blank-line", ws + (newline | p.end())); + + // line ::= header-line / kv-line / comment-line / blank-line + auto line = p.rule("line", header_line | kv_line | comment_line | blank_line); + + // ini ::= line* EOF + auto ini = p.rule("ini", p.zero_or_more(line) + p.end()); + + return ini; + }); + + common_peg_parse_context ctx(contents); + const auto result = parser.parse(ctx); + if (!result.success()) { + throw std::runtime_error("failed to parse server config file: " + path); + } + + std::string current_section = COMMON_PRESET_DEFAULT_NAME; + std::string current_key; + + ctx.ast.visit(result, [&](const auto & node) { + if (node.tag == "section-name") { + const std::string section = std::string(node.text); + current_section = section; + parsed[current_section] = {}; + } else if (node.tag == "key") { + const std::string key = std::string(node.text); + 
current_key = key; + } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) { + parsed[current_section][current_key] = std::string(node.text); + current_key.clear(); + } + }); + + return parsed; +} + +static std::map get_map_key_opt(common_params_context & ctx_params) { + std::map mapping; + for (const auto & opt : ctx_params.options) { + if (opt.env != nullptr) { + mapping[opt.env] = opt; + } + for (const auto & arg : opt.args) { + mapping[rm_leading_dashes(arg)] = opt; + } + } + return mapping; +} + +common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) { + common_presets out; + auto key_to_opt = get_map_key_opt(ctx_params); + auto ini_data = parse_ini_from_file(path); + + for (auto section : ini_data) { + common_preset preset; + if (section.first.empty()) { + preset.name = COMMON_PRESET_DEFAULT_NAME; + } else { + preset.name = section.first; + } + LOG_DBG("loading preset: %s\n", preset.name.c_str()); + for (const auto & [key, value] : section.second) { + LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str()); + if (key_to_opt.find(key) != key_to_opt.end()) { + preset.options[key_to_opt[key]] = value; + LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str()); + } else { + // TODO: maybe warn about unknown key? 
+ } + } + out[preset.name] = preset; + } + + return out; +} + +void common_presets_save(const std::string & path, const common_presets & presets) { + std::ofstream file(path); + if (!file.good()) { + throw std::runtime_error("failed to open preset file for writing: " + path); + } + + file << "version = 1\n\n"; + + for (const auto & it : presets) { + file << it.second.to_ini(); + } +} diff --git a/common/preset.h b/common/preset.h new file mode 100644 index 00000000000..d200bdce809 --- /dev/null +++ b/common/preset.h @@ -0,0 +1,33 @@ +#pragma once + +#include "common.h" +#include "arg.h" + +#include +#include +#include + +// +// INI preset parser and writer +// + +constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default"; + +struct common_preset { + std::string name; + // TODO: support repeated args in the future + std::map options; + + // convert preset to CLI argument list + std::vector to_args() const; + + // convert preset to INI format string + std::string to_ini() const; + + // TODO: maybe implement to_env() if needed +}; + +// interface for multiple presets in one file +using common_presets = std::map; +common_presets common_presets_load(const std::string & path, common_params_context & ctx_params); +void common_presets_save(const std::string & path, const common_presets & presets); diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 14459245a95..ae1a497be6d 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -38,8 +38,6 @@ set(TARGET_SRCS server-http.h server-models.cpp server-models.h - server-config.cpp - server-config.h server-task.cpp server-task.h server-queue.cpp diff --git a/tools/server/README.md b/tools/server/README.md index 089ef5a0cbf..f98fb44c7bc 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1383,8 +1383,6 @@ Alternatively, you can point the router to a local directory containing your GGU llama-server --models-dir ./models_directory ``` -The directory is scanned 
recursively, so nested vendor/model layouts such as `vendor_name/model_name/*.gguf` are supported. The model name in the router UI matches the relative path inside `--models-dir` (for example, `vendor_name/model_name`). - If the model contains multiple GGUF (for multimodal or multi-shard), files should be put into a subdirectory. The directory structure should look like this: ```sh diff --git a/tools/server/server-config.cpp b/tools/server/server-config.cpp deleted file mode 100644 index f6c8746c7ce..00000000000 --- a/tools/server/server-config.cpp +++ /dev/null @@ -1,308 +0,0 @@ -#include "server-config.h" - -#include "peg-parser.h" -#include "arg.h" - -#include -#include -#include -#include -#include - -namespace { - -bool is_option(const std::string & arg) { - return !arg.empty() && arg[0] == '-'; -} - -bool is_implicit_value(const std::vector & args, size_t index) { - return index + 1 < args.size() && !is_option(args[index + 1]); -} - -std::string relativize(const std::string & path, const std::string & base) { - if (path.empty()) { - return path; - } - - std::error_code ec; - const auto abs_path = std::filesystem::absolute(path, ec); - if (ec) { - return path; - } - const auto abs_base = std::filesystem::absolute(base, ec); - if (ec) { - return path; - } - - const auto rel = std::filesystem::relative(abs_path, abs_base, ec); - if (ec) { - return path; - } - - return rel.generic_string(); -} - -} // namespace - -server_config_manager::server_config_manager(const std::string & models_dir) - : models_dir(models_dir) { - if (!models_dir.empty()) { - path = (std::filesystem::path(models_dir) / "config.ini").string(); - } -} - -bool server_config_manager::enabled() const { - return !models_dir.empty(); -} - -void server_config_manager::ensure_loaded() { - if (!enabled()) { - return; - } - - namespace fs = std::filesystem; - - std::lock_guard lock(mutex); - - if (!fs::exists(path)) { - data.clear(); - return; - } - - std::ifstream file(path); - if (!file.good()) { - 
throw std::runtime_error("failed to open server config file: " + path); - } - - std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - - static const auto parser = build_peg_parser([](auto & p) { - // newline ::= "\r\n" / "\n" / "\r" - auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r")); - - // ws ::= [ \t]* - auto ws = p.rule("ws", p.chars("[ \t]", 0, -1)); - - // comment ::= [;#] (!newline .)* - auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any())); - - // eol ::= ws comment? (newline / EOF) - auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end())); - - // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]* - auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1)); - - // value ::= (!eol-start .)* - auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end())); - auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any())); - - // header-line ::= "[" ws ident ws "]" eol - auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol); - - // kv-line ::= ident ws "=" ws value eol - auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol); - - // comment-line ::= ws comment (newline / EOF) - auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end())); - - // blank-line ::= ws (newline / EOF) - auto blank_line = p.rule("blank-line", ws + (newline | p.end())); - - // line ::= header-line / kv-line / comment-line / blank-line - auto line = p.rule("line", header_line | kv_line | comment_line | blank_line); - - // ini ::= line* EOF - auto ini = p.rule("ini", p.zero_or_more(line) + p.end()); - - return ini; - }); - - common_peg_parse_context ctx(contents); - const auto result = parser.parse(ctx); - if (!result.success()) { - throw std::runtime_error("failed to parse server 
config file: " + path); - } - - std::map> parsed; - - std::string current_section; - std::string current_key; - - ctx.ast.visit(result, [&](const auto & node) { - if (node.tag == "section-name") { - const std::string section = std::string(node.text); - if (section.rfind("LLAMA_ARG_", 0) == 0) { - current_section.clear(); - return; - } - - current_section = section; - parsed[current_section] = {}; - } else if (node.tag == "key") { - const std::string key = std::string(node.text); - if (key.rfind("LLAMA_ARG_", 0) == 0) { - current_key = key; - } else { - current_key.clear(); - } - } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) { - parsed[current_section][current_key] = std::string(node.text); - current_key.clear(); - } - }); - - data = std::move(parsed); -} - -// write_locked expects the caller to hold `mutex`. -void server_config_manager::write_locked() { - if (!enabled()) { - return; - } - - namespace fs = std::filesystem; - - if (!path.empty()) { - auto parent = fs::path(path).parent_path(); - if (!parent.empty()) { - fs::create_directories(parent); - } - } - - std::ofstream file(path); - file << "LLAMA_CONFIG_VERSION=1\n\n"; - - bool first_section = true; - for (const auto & [section, args] : data) { - if (!first_section) { - file << "\n"; - } - first_section = false; - - file << "[" << section << "]\n"; - for (const auto & [key, value] : args) { - file << key << "="; - if (!value.empty()) { - file << value; - } - file << "\n"; - } - } - - file.flush(); -} - -bool is_router_control_arg(const std::string & arg) { - static const std::set blacklist = { - "--alias", // set per-child in server_models::load - "--models-dir", // router-side discovery only - "--models-max", // router capacity control - "--no-models-autoload", // router autoload policy - "--port", // router port differs from child port - "-m", "--model", // model path supplied per-child - "-hf", "--hf-file" // model source supplied per-child - }; - return 
blacklist.count(arg) != 0; -} - -void server_config_manager::sync(const std::vector & models, const std::vector & base_args) { - if (!enabled()) { - return; - } - - ensure_loaded(); - - std::map router_args; - - for (size_t i = 1; i < base_args.size(); ++i) { // skip argv[0] - const auto & arg = base_args[i]; - if (!is_option(arg)) { - continue; - } - - if (is_router_control_arg(arg)) { - if (is_implicit_value(base_args, i)) { - ++i; - } - continue; - } - - std::string value = "true"; - if (is_implicit_value(base_args, i)) { - value = base_args[i + 1]; - ++i; - } - - const auto env_name = common_arg_get_env_name(arg); - if (!env_name.empty()) { - router_args[env_name] = value; - } - } - - std::lock_guard lock(mutex); - - bool changed = !std::filesystem::exists(path); - - const auto model_key = common_arg_get_env_name("--model"); - const auto model_alias = common_arg_get_env_name("-m"); - const auto mmproj_key = common_arg_get_env_name("--mmproj"); - - const std::vector model_keys = { - model_key, - model_alias, - "LLAMA_ARG_MODEL", - }; - - const std::vector mmproj_keys = { - mmproj_key, - "LLAMA_ARG_MMPROJ", - }; - - for (const auto & model : models) { - auto & section = data[model.name]; - - const auto has_any_key = [](const auto & section_map, const std::vector & keys) { - for (const auto & key : keys) { - if (!key.empty() && section_map.find(key) != section_map.end()) { - return true; - } - } - return false; - }; - - if (!model_key.empty() && !has_any_key(section, model_keys)) { - section[model_key] = relativize(model.path, models_dir); - changed = true; - } - - if (!model.path_mmproj.empty() && !mmproj_key.empty() && !has_any_key(section, mmproj_keys)) { - section[mmproj_key] = relativize(model.path_mmproj, models_dir); - changed = true; - } - - for (const auto & router_arg : router_args) { - if (section.find(router_arg.first) == section.end()) { - section[router_arg.first] = router_arg.second; - changed = true; - } - } - } - - if (changed) { - write_locked(); 
- } -} - -std::map server_config_manager::env_for(const std::string & name) { - if (!enabled()) { - return {}; - } - - ensure_loaded(); - - std::lock_guard lock(mutex); - - auto it = data.find(name); - return it != data.end() ? it->second : std::map{}; -} - diff --git a/tools/server/server-config.h b/tools/server/server-config.h deleted file mode 100644 index 53395890f5f..00000000000 --- a/tools/server/server-config.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "server-common.h" - -#include -#include -#include -#include -#include - -struct server_local_model { - std::string name; - std::string path; - std::string path_mmproj; -}; - -class server_config_manager { -public: - explicit server_config_manager(const std::string & models_dir); - - bool enabled() const; - - void sync(const std::vector & models, const std::vector & base_args); - - std::map env_for(const std::string & name); - -private: - void ensure_loaded(); - void write_locked(); - -private: - std::string path; - std::string models_dir; - std::map> data; - std::mutex mutex; -}; - -bool is_router_control_arg(const std::string & arg); - diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 0a4358e76dc..014a1eee952 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -1,7 +1,7 @@ #include "server-common.h" #include "server-models.h" -#include "server-config.h" +#include "preset.h" #include "download.h" #include // TODO: remove this once we use HTTP client from download.h @@ -12,13 +12,10 @@ #include #include #include -#include #include -#include #include #include #include -#include #ifdef _WIN32 #include @@ -78,150 +75,130 @@ static std::filesystem::path get_server_exec_path() { #endif } -static std::string to_upper_copy(std::string value) { - std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { return (char) std::toupper(c); }); - return value; -} +struct local_model { + std::string name; + std::string path; + 
std::string path_mmproj; +}; -static std::string pick_preferred_mmproj(const std::vector & paths) { - if (paths.empty()) { - return ""; +static std::vector list_local_models(const std::string & dir) { + if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { + throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); } - auto score = [](const std::string & path) { - const auto upper = to_upper_copy(path); - if (upper.find("BF16") != std::string::npos) { - return 3; - } - if (upper.find("F16") != std::string::npos) { - return 2; + std::vector models; + auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { + auto files = fs_list(subdir_path, false); + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + for (const auto & file : files) { + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; + } + } } - if (upper.find("F32") != std::string::npos) { - return 1; + // single file model + local_model model{ + /* name */ name, + /* path */ first_shard_file.path.empty() ? 
model_file.path : first_shard_file.path, + /* path_mmproj */ mmproj_file.path // can be empty + }; + if (!model.path.empty()) { + models.push_back(model); } - return 0; }; - const auto * best = &paths.front(); - int best_score = score(best->string()); - for (const auto & candidate : paths) { - const int candidate_score = score(candidate.string()); - if (candidate_score > best_score) { - best = &candidate; - best_score = candidate_score; + auto files = fs_list(dir, true); + for (const auto & file : files) { + if (file.is_dir) { + scan_subdir(file.path, file.name); + } else if (string_ends_with(file.name, ".gguf")) { + // single file model + std::string name = file.name; + string_replace_all(name, ".gguf", ""); + local_model model{ + /* name */ name, + /* path */ file.path, + /* path_mmproj */ "" + }; + models.push_back(model); } } - - return best->string(); + return models; } -static std::vector list_local_models(const std::string & dir) { - if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { - throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); - } - - std::vector models; - - struct dir_model_files { - std::vector model_files; - std::vector mmproj_files; - }; - - std::map model_directories; - - for (const auto & entry : std::filesystem::recursive_directory_iterator( - dir, std::filesystem::directory_options::skip_permission_denied)) { - if (!entry.is_regular_file()) { - continue; - } - - const auto & path = entry.path(); - if (!string_ends_with(path.filename().string(), ".gguf")) { - continue; - } - - auto & files = model_directories[path.parent_path()]; - const auto filename = path.filename().string(); - if (filename.find("mmproj") != std::string::npos) { - files.mmproj_files.push_back(path); - continue; - } - - if (filename.find("-00001-of-") != std::string::npos) { - files.model_files.push_back(path); - continue; - } +// +// server_presets +// - // skip shards that aren't the first 
chunk - if (filename.find("-000") != std::string::npos && filename.find("-of-") != std::string::npos) { - continue; - } - files.model_files.push_back(path); +server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir) + : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) { + if (!models_dir.empty()) { + auto presets_path = models_dir + DIRECTORY_SEPARATOR + "presets.ini"; + presets = common_presets_load(presets_path, ctx_params); + SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); } - for (const auto & [parent_path, files] : model_directories) { - if (files.model_files.empty()) { - continue; - } + common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args); - std::string preferred_mmproj = pick_preferred_mmproj(files.mmproj_files); - - const auto * best_model = &files.model_files.front(); - std::uintmax_t best_size = std::numeric_limits::max(); - for (const auto & candidate : files.model_files) { - std::error_code size_ec; - const auto size = std::filesystem::file_size(candidate, size_ec); - if (size_ec) { - continue; - } - if (best_size == std::numeric_limits::max() || size < best_size) { - best_model = &candidate; - best_size = size; - } + // populate reserved args (will be appended by the router) + for (auto & opt : ctx_params.options) { + if (opt.env == nullptr) { + continue; } - - std::error_code ec; - auto rel_parent = std::filesystem::relative(parent_path, dir, ec); - std::string name; - if (!ec && !rel_parent.empty() && rel_parent.string() != ".") { - name = rel_parent.generic_string(); - } else { - name = parent_path.filename().generic_string(); + std::string env = opt.env; + if (env == "LLAMA_ARG_PORT" || + env == "LLAMA_ARG_HOST" || + env == "LLAMA_ARG_ALIAS" || + env == "LLAMA_ARG_API_KEY" || + env == "LLAMA_ARG_MODELS_DIR" || + env == "LLAMA_ARG_MODELS_MAX" || + env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" || + env == "LLAMA_ARG_MODEL" || + env == 
"LLAMA_ARG_MMPROJ" || + env == "LLAMA_ARG_HF_REPO") { + control_args[env] = opt; } - - server_local_model model{ - /* name */ name, - /* path */ std::filesystem::absolute(*best_model).string(), - /* path_mmproj */ preferred_mmproj.empty() ? "" : std::filesystem::absolute(preferred_mmproj).string() - }; - models.push_back(model); } - - return models; } -static bool is_option(const std::string & arg) { - return !arg.empty() && arg[0] == '-'; +common_preset server_presets::get_preset(const std::string & name) { + auto it = presets.find(name); + if (it != presets.end()) { + return it->second; + } + return common_preset(); } -static std::vector strip_router_control_args(const std::vector & args) { - std::vector filtered; - filtered.reserve(args.size()); - - for (size_t i = 0; i < args.size(); ++i) { - const auto & arg = args[i]; - if (is_router_control_arg(arg)) { - if (i + 1 < args.size() && !is_option(args[i + 1])) { - ++i; - } - continue; +void server_presets::render_args(server_model_meta & meta) { + common_preset preset = meta.preset; // copy + // force removing control args if any + for (auto & cargs : control_args) { + preset.options.erase(cargs.second); + } + // inherit from base args + for (const auto & [arg, value] : base_args) { + preset.options[arg] = value; + } + // set control values + preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port); + preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name; + if (meta.in_cache) { + preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name; + } else { + preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path; + if (!meta.path_mmproj.empty()) { + preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj; } - - filtered.push_back(arg); } - - return filtered; + meta.args = preset.to_args(); } // @@ -232,12 +209,10 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params), server_config(params.models_dir) { + 
char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_dir) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } - - base_args = strip_router_control_args(base_args); for (char ** env = envp; *env != nullptr; env++) { base_env.push_back(std::string(*env)); } @@ -254,6 +229,7 @@ server_models::server_models( auto cached_models = common_list_cached_models(); for (const auto & model : cached_models) { server_model_meta meta{ + /* preset */ presets.get_preset(model.to_string()), /* name */ model.to_string(), /* path */ model.manifest_path, /* path_mmproj */ "", // auto-detected when loading @@ -264,6 +240,7 @@ server_models::server_models( /* args */ std::vector(), /* exit_code */ 0 }; + presets.render_args(meta); // populate meta.args mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), /* th */ std::thread(), @@ -273,13 +250,13 @@ server_models::server_models( // add local models specificed via --models-dir if (!params.models_dir.empty()) { auto local_models = list_local_models(params.models_dir); - server_config.sync(local_models, base_args); for (const auto & model : local_models) { if (mapping.find(model.name) != mapping.end()) { // already exists in cached models, skip continue; } server_model_meta meta{ + /* preset */ presets.get_preset(model.name), /* name */ model.name, /* path */ model.path, /* path_mmproj */ model.path_mmproj, @@ -290,6 +267,7 @@ server_models::server_models( /* args */ std::vector(), /* exit_code */ 0 }; + presets.render_args(meta); // populate meta.args mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), /* th */ std::thread(), @@ -297,6 +275,11 @@ server_models::server_models( }; } } + // log available models + SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); + for (const auto & [name, inst] : mapping) { + SRV_INF(" %c %s\n", inst.meta.preset.name.empty() ? 
' ' : '*', name.c_str()); + } } void server_models::update_meta(const std::string & name, const server_model_meta & meta) { @@ -430,25 +413,7 @@ void server_models::unload_lru() { } } -static void add_or_replace_arg(std::vector & args, const std::string & key, const std::string & value) { - for (size_t i = 0; i < args.size();) { - if (args[i] == key) { - args.erase(args.begin() + i); - if (i < args.size() && !is_option(args[i])) { - args.erase(args.begin() + i); - } - } else { - ++i; - } - } - - args.push_back(key); - if (!value.empty()) { - args.push_back(value); - } -} - -void server_models::load(const std::string & name, bool auto_load) { +void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } @@ -477,43 +442,8 @@ void server_models::load(const std::string & name, bool auto_load) { { SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); - std::vector child_args; - if (auto_load && !meta.args.empty()) { - child_args = strip_router_control_args(meta.args); // copy previous args minus router-only flags - } else { - child_args.push_back(base_args[0]); - if (inst.meta.in_cache) { - child_args.push_back("-hf"); - child_args.push_back(inst.meta.name); - } else { - child_args.push_back("-m"); - child_args.push_back(inst.meta.path); - if (!inst.meta.path_mmproj.empty()) { - child_args.push_back("--mmproj"); - child_args.push_back(inst.meta.path_mmproj); - } - } - - child_args.push_back("--port"); - child_args.push_back(std::to_string(inst.meta.port)); - - child_args.push_back("--alias"); - child_args.push_back(inst.meta.name); - } - - std::vector child_env = base_env; // copy - auto config_env = server_config.env_for(inst.meta.name); - for (const auto & [key, value] : config_env) { - if (value == "false") { - continue; - } - - if (value == "true" || value.empty()) { - child_env.push_back(key + "="); - } else { - 
child_env.push_back(key + "=" + value); - } - } + std::vector child_args = inst.meta.args; // copy + std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); SRV_INF("%s", "spawning server instance with args:\n"); @@ -659,7 +589,7 @@ bool server_models::ensure_model_loaded(const std::string & name) { } if (meta->status == SERVER_MODEL_STATUS_UNLOADED) { SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); - load(name, true); + load(name); } SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str()); @@ -842,38 +772,6 @@ void server_models_routes::init_routes() { return models.proxy_request(req, method, name, true); // update last usage for POST request only }; - this->get_router_models = [this](const server_http_req &) { - auto res = std::make_unique(); - json models_json = json::array(); - auto all_models = models.get_all_meta(); - std::time_t t = std::time(0); - for (const auto & meta : all_models) { - json status { - {"value", server_model_status_to_string(meta.status)}, - {"args", meta.args}, - }; - if (meta.is_failed()) { - status["exit_code"] = meta.exit_code; - status["failed"] = true; - } - models_json.push_back(json { - {"id", meta.name}, - {"object", "model"}, // for OAI-compat - {"owned_by", "llamacpp"}, // for OAI-compat - {"created", t}, // for OAI-compat - {"in_cache", meta.in_cache}, - {"path", meta.path}, - {"status", status}, - // TODO: add other fields, may require reading GGUF metadata - }); - } - res_ok(res, { - {"data", models_json}, - {"object", "list"}, - }); - return res; - }; - this->post_router_models_load = [this](const server_http_req & req) { auto res = std::make_unique(); json body = json::parse(req.body); @@ -887,7 +785,7 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.load(name, false); + models.load(name); res_ok(res, 
{{"success", true}}); return res; }; @@ -911,9 +809,12 @@ void server_models_routes::init_routes() { std::time_t t = std::time(0); for (const auto & meta : all_models) { json status { - {"value", server_model_status_to_string(meta.status)}, - {"args", meta.args}, + {"value", server_model_status_to_string(meta.status)}, + {"args", meta.args}, }; + if (!meta.preset.name.empty()) { + status["preset"] = meta.preset.to_ini(); + } if (meta.is_failed()) { status["exit_code"] = meta.exit_code; status["failed"] = true; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 725e059b87c..ae0ef78ce3f 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -1,8 +1,8 @@ #pragma once #include "common.h" +#include "preset.h" #include "server-http.h" -#include "server-config.h" #include #include @@ -48,6 +48,7 @@ static std::string server_model_status_to_string(server_model_status status) { } struct server_model_meta { + common_preset preset; std::string name; std::string path; std::string path_mmproj; // only available if in_cache=false @@ -55,7 +56,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - std::vector args; // additional args passed to the model instance (used for debugging) + std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) bool is_active() const { @@ -67,6 +68,19 @@ struct server_model_meta { } }; +// the server_presets struct holds the presets read from presets.ini +// as well as base args from the router server +struct server_presets { + common_presets presets; + common_params_context ctx_params; + std::map base_args; + std::map control_args; // args reserved for server control + + server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir); + 
common_preset get_preset(const std::string & name); + void render_args(server_model_meta & meta); +}; + struct subprocess_s; struct server_models { @@ -86,7 +100,7 @@ struct server_models { std::vector base_args; std::vector base_env; - server_config_manager server_config; + server_presets presets; void update_meta(const std::string & name, const server_model_meta & meta); @@ -105,8 +119,7 @@ struct server_models { // return a copy of all model metadata std::vector get_all_meta(); - // if auto_load is true, load the model with previous args if any - void load(const std::string & name, bool auto_load); + void load(const std::string & name); void unload(const std::string & name); void unload_all(); From a7c7aca6e21cccf86e5ace74122e2371bdbe2345 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 18:59:09 +0100 Subject: [PATCH 07/19] fix compile --- common/arg.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common/arg.h b/common/arg.h index a0fef251d27..78961328cac 100644 --- a/common/arg.h +++ b/common/arg.h @@ -6,6 +6,7 @@ #include #include #include +#include // // CLI argument parsing From e5c3c4712fc272775641b83372332460a0df2fa6 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 20:47:57 +0100 Subject: [PATCH 08/19] cont --- common/arg.cpp | 7 +++++++ common/common.h | 7 ++++--- common/preset.cpp | 4 ++-- tools/server/server-models.cpp | 15 +++++++++++---- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5f181b343bc..318cdbb1289 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2610,6 +2610,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.models_dir = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR")); + add_opt(common_arg( + {"--models-preset"}, "PATH", + "path to INI file containing model presets for the router server (default: disabled)", + [](common_params & params, const std::string & value) { 
+ params.models_preset = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET")); add_opt(common_arg( {"--models-max"}, "N", string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max), diff --git a/common/common.h b/common/common.h index ad79f5b425c..6119adcc0f8 100644 --- a/common/common.h +++ b/common/common.h @@ -484,9 +484,10 @@ struct common_params { bool endpoint_metrics = false; // router server configs - std::string models_dir = ""; // directory containing models for the router server - int models_max = 4; // maximum number of models to load simultaneously - bool models_autoload = true; // automatically load models when requested via the router server + std::string models_dir = ""; // directory containing models for the router server + std::string models_preset = ""; // directory containing model presets for the router server + int models_max = 4; // maximum number of models to load simultaneously + bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/common/preset.cpp b/common/preset.cpp index 7c050728f7b..7000ff96c2c 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -54,12 +54,12 @@ static std::map> parse_ini_from_ std::map> parsed; if (!std::filesystem::exists(path)) { - return parsed; // return empty if file does not exist (expected behavior) + throw std::runtime_error("preset file does not exist: " + path); } std::ifstream file(path); if (!file.good()) { - throw std::runtime_error("failed to open server config file: " + path); + throw std::runtime_error("failed to open server preset file: " + path); } std::string contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 014a1eee952..a1c7efccca7 100644 --- a/tools/server/server-models.cpp +++ 
b/tools/server/server-models.cpp @@ -138,10 +138,9 @@ static std::vector list_local_models(const std::string & dir) { // -server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir) +server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path) : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) { - if (!models_dir.empty()) { - auto presets_path = models_dir + DIRECTORY_SEPARATOR + "presets.ini"; + if (!presets_path.empty()) { presets = common_presets_load(presets_path, ctx_params); SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); } @@ -167,6 +166,14 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para control_args[env] = opt; } } + + // remove any router-controlled args from base_args + for (const auto & cargs : control_args) { + auto it = base_args.find(cargs.second); + if (it != base_args.end()) { + base_args.erase(it); + } + } } common_preset server_presets::get_preset(const std::string & name) { @@ -209,7 +216,7 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_dir) { + char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } From 7b96207166a7d45f7de979eec9ffcb443fe32741 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 21:42:16 +0100 Subject: [PATCH 09/19] allow custom-path models --- tools/server/README.md | 50 ++++++++++++++++ tools/server/server-models.cpp | 105 ++++++++++++++++++++++++++------- tools/server/server-models.h | 5 ++ 3 files changed, 138 insertions(+), 22 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index f98fb44c7bc..649f1e7ca2b 100644 --- a/tools/server/README.md +++ 
b/tools/server/README.md @@ -1369,6 +1369,11 @@ llama-server ### Model sources +There are 3 possible sources for model files: +1. Cached models (controlled by the `LLAMA_CACHE` environment variable) +2. Custom model directory (set via the `--models-dir` argument) +3. Custom preset (set via the `--models-preset` argument) + By default, the router looks for models in the cache. You can add Hugging Face models to the cache with: ```sh @@ -1413,6 +1418,51 @@ llama-server -ctx 8192 -n 1024 -np 2 Note: model instances inherit both command line arguments and environment variables from the router server. +Alternatively, you can also add a GGUF-based preset (see the next section) + +### Model presets + +Model presets allow advanced users to define custom configurations using an `.ini` file: + +```sh +llama-server --models-preset ./my-models.ini +``` + +Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`. + +Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys. 
+ +Example: + +```ini +version = 1 + +; If the key corresponds to an existing model on the server, +; this will be used as the default config for that model +[ggml-org/MY-MODEL-GGUF:Q8_0] +; string value +chat-template = chatml +; numeric value +n-gpu-layer = 123 +; boolean value +jinja = false +; shorthand argument (for example, context size) +c = 4096 +; environment variable name +LLAMA_ARG_CACHE_RAM = 0 +; file paths are relative to server's CWD +model-draft = ./my-models/draft.gguf +; but it's RECOMMENDED to use absolute path +model-draft = /Users/abc/my-models/draft.gguf + +; If the key does NOT correspond to an existing model, +; you need to specify at least the model path +[custom_model] +model = /Users/abc/my-awesome-model-Q4_K_M.gguf +``` + +Note: some arguments are controlled by router (e.g., host, port, API key, HF repo, model alias). They will be removed or overwritten upon loading. + +### Routing requests Requests are routed according to the requested model name. diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index a1c7efccca7..18e21c00d66 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -145,8 +145,6 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); } - common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args); - // populate reserved args (will be appended by the router) for (auto & opt : ctx_params.options) { if (opt.env == nullptr) { @@ -159,14 +157,17 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para env == "LLAMA_ARG_API_KEY" || env == "LLAMA_ARG_MODELS_DIR" || env == "LLAMA_ARG_MODELS_MAX" || - env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" || env == "LLAMA_ARG_MODEL" || env == "LLAMA_ARG_MMPROJ" || - env == "LLAMA_ARG_HF_REPO") { + env == "LLAMA_ARG_HF_REPO" || + env == "LLAMA_ARG_NO_MODELS_AUTOLOAD") { control_args[env] = opt; } } + // 
read base args from router's argv + common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args); + // remove any router-controlled args from base_args for (const auto & cargs : control_args) { auto it = base_args.find(cargs.second); @@ -186,14 +187,21 @@ common_preset server_presets::get_preset(const std::string & name) { void server_presets::render_args(server_model_meta & meta) { common_preset preset = meta.preset; // copy + // merging 3 kinds of args: + // 1. model-specific args (from preset) // force removing control args if any for (auto & cargs : control_args) { - preset.options.erase(cargs.second); + if (preset.options.find(cargs.second) != preset.options.end()) { + SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", preset.name.c_str(), cargs.second.args[0]); + preset.options.erase(cargs.second); + } } + // 2. base args (from router) // inherit from base args for (const auto & [arg, value] : base_args) { preset.options[arg] = value; } + // 3. control args (from router) // set control values preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port); preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name; @@ -231,8 +239,54 @@ server_models::server_models( LOG_WRN("failed to get server executable path: %s\n", e.what()); LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str()); } - // TODO: allow refreshing cached model list - // add cached models + load_models(); +} + +void server_models::add_model(server_model_meta && meta) { + if (mapping.find(meta.name) != mapping.end()) { + throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str())); + } + presets.render_args(meta); // populate meta.args + std::string name = meta.name; + mapping[name] = instance_t{ + /* subproc */ std::make_shared(), + /* th */ std::thread(), + /* meta */ std::move(meta) + }; +} + +static std::vector list_custom_path_models(server_presets & presets) { + // detect any custom-path models in presets + 
std::vector custom_models; + for (auto & [model_name, preset] : presets.presets) { + local_model model; + model.name = model_name; + std::vector to_erase; + for (auto & [arg, value] : preset.options) { + std::string env(arg.env ? arg.env : ""); + if (env == "LLAMA_ARG_MODEL") { + model.path = value; + to_erase.push_back(arg); + } + if (env == "LLAMA_ARG_MMPROJ") { + model.path_mmproj = value; + to_erase.push_back(arg); + } + } + for (auto & arg : to_erase) { + preset.options.erase(arg); + } + if (!model.name.empty() && !model.path.empty()) { + custom_models.push_back(model); + } + } + return custom_models; +} + +// TODO: allow refreshing cached model list +void server_models::load_models() { + // loading models from 3 sources: + // 1. cached models auto cached_models = common_list_cached_models(); for (const auto & model : cached_models) { server_model_meta meta{ @@ -247,16 +301,11 @@ server_models::server_models( /* args */ std::vector(), /* exit_code */ 0 }; - presets.render_args(meta); // populate meta.args - mapping[meta.name] = instance_t{ - /* subproc */ std::make_shared(), - /* th */ std::thread(), - /* meta */ meta - }; + add_model(std::move(meta)); } - // add local models specificed via --models-dir - if (!params.models_dir.empty()) { - auto local_models = list_local_models(params.models_dir); + // 2. local models specificed via --models-dir + if (!base_params.models_dir.empty()) { + auto local_models = list_local_models(base_params.models_dir); for (const auto & model : local_models) { if (mapping.find(model.name) != mapping.end()) { // already exists in cached models, skip @@ -274,14 +323,26 @@ server_models::server_models( /* args */ std::vector(), /* exit_code */ 0 }; - presets.render_args(meta); // populate meta.args - mapping[meta.name] = instance_t{ - /* subproc */ std::make_shared(), - /* th */ std::thread(), - /* meta */ meta - }; + add_model(std::move(meta)); } } + // 3. 
custom-path models specified in presets + auto custom_models = list_custom_path_models(presets); + for (const auto & model : custom_models) { + server_model_meta meta{ + /* preset */ presets.get_preset(model.name), + /* name */ model.name, + /* path */ model.path, + /* path_mmproj */ model.path_mmproj, + /* in_cache */ false, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 + }; + add_model(std::move(meta)); + } // log available models SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); for (const auto & [name, inst] : mapping) { diff --git a/tools/server/server-models.h b/tools/server/server-models.h index ae0ef78ce3f..9cdbbad9b6a 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -107,9 +107,14 @@ struct server_models { // unload least recently used models if the limit is reached void unload_lru(); + // not thread-safe, caller must hold mutex + void add_model(server_model_meta && meta); + public: server_models(const common_params & params, int argc, char ** argv, char ** envp); + void load_models(); + // check if a model instance exists bool has_model(const std::string & name); From b8d8ffee3c71a578de58cefc8bf4bcfd58831fcc Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 21:55:32 +0100 Subject: [PATCH 10/19] add falsey check --- common/arg.cpp | 7 ++++--- common/arg.h | 6 ++++++ common/preset.cpp | 9 ++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 318cdbb1289..f2b460f86dd 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -47,6 +47,7 @@ #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 using json = nlohmann::ordered_json; +using namespace common_arg_utils; static std::initializer_list mmproj_examples = { LLAMA_EXAMPLE_MTMD, @@ -759,15 +760,15 @@ static std::string list_builtin_chat_templates() { return msg.str(); } -static bool 
is_truthy(const std::string & value) { +bool common_arg_utils::is_truthy(const std::string & value) { return value == "on" || value == "enabled" || value == "1"; } -static bool is_falsey(const std::string & value) { +bool common_arg_utils::is_falsey(const std::string & value) { return value == "off" || value == "disabled" || value == "0"; } -static bool is_autoy(const std::string & value) { +bool common_arg_utils::is_autoy(const std::string & value) { return value == "auto" || value == "-1"; } diff --git a/common/arg.h b/common/arg.h index 78961328cac..161d0688cd0 100644 --- a/common/arg.h +++ b/common/arg.h @@ -82,6 +82,12 @@ struct common_arg { } }; +namespace common_arg_utils { + bool is_truthy(const std::string & value); + bool is_falsey(const std::string & value); + bool is_autoy(const std::string & value); +} + struct common_params_context { enum llama_example ex = LLAMA_EXAMPLE_COMMON; common_params & params; diff --git a/common/preset.cpp b/common/preset.cpp index 7000ff96c2c..c07e68b28be 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -20,11 +20,18 @@ std::vector common_preset::to_args() const { for (const auto & [opt, value] : options) { args.push_back(opt.args.back()); // use the last arg as the main arg + if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { + // flag option, no value + if (common_arg_utils::is_falsey(value)) { + // skip the flag + args.pop_back(); + } + } if (opt.value_hint != nullptr) { // single value args.push_back(value); } - if (opt.value_hint_2 != nullptr) { + if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) { throw std::runtime_error(string_format( "common_preset::to_args(): option '%s' has two values, which is not supported yet", opt.args.back() From 0734bbe4ea0fbb5d3ab9e6ff68e2ca04f479beed Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 8 Dec 2025 21:55:24 +0100 Subject: [PATCH 11/19] server: fix router model discovery and child process spawning - Sanitize model names: replace / and \ with _ 
for display - Recursive directory scan with relative path storage - Convert relative paths to absolute when spawning children - Filter router control args from child processes - Refresh args after port assignment for correct port value - Fallback preset lookup for compatibility - Fix missing argv[0]: store server binary path before base_args parsing --- tools/server/server-models.cpp | 144 ++++++++++++++++++++++----------- tools/server/server-models.h | 4 +- 2 files changed, 100 insertions(+), 48 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 18e21c00d66..a60cf82b941 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -7,6 +7,7 @@ #include // TODO: remove this once we use HTTP client from download.h #include +#include #include #include #include @@ -77,57 +78,87 @@ static std::filesystem::path get_server_exec_path() { struct local_model { std::string name; + std::string display_name; std::string path; std::string path_mmproj; }; +static std::string sanitize_model_name(const std::string & name) { + std::string sanitized = name; + string_replace_all(sanitized, "/", "_"); + string_replace_all(sanitized, "\\", "_"); + return sanitized; +} + static std::vector list_local_models(const std::string & dir) { if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); } std::vector models; - auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { - auto files = fs_list(subdir_path, false); - common_file_info model_file; - common_file_info first_shard_file; - common_file_info mmproj_file; - for (const auto & file : files) { - if (string_ends_with(file.name, ".gguf")) { - if (file.name.find("mmproj") != std::string::npos) { - mmproj_file = file; - } else if (file.name.find("-00001-of-") != std::string::npos) { - first_shard_file = file; - } 
else { - model_file = file; + std::function scan_subdir = + [&](const std::string & subdir_path, const std::string & name) { + auto files = fs_list(subdir_path, true); // Need directories for recursion + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + + for (const auto & file : files) { + if (file.is_dir) { + const std::string child_name = name.empty() ? file.name : name + "/" + file.name; + scan_subdir(file.path, child_name); + continue; + } + + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; + } + } + } + + // Convert absolute paths to relative + std::string model_path = first_shard_file.path.empty() ? model_file.path : first_shard_file.path; + if (!model_path.empty()) { + std::error_code ec; + auto rel_path = std::filesystem::relative(model_path, dir, ec); + if (!ec) { + model_path = rel_path.generic_string(); + } + } + + std::string mmproj_path = mmproj_file.path; + if (!mmproj_path.empty()) { + std::error_code ec; + auto rel_path = std::filesystem::relative(mmproj_path, dir, ec); + if (!ec) { + mmproj_path = rel_path.generic_string(); } } - } - // single file model - local_model model{ - /* name */ name, - /* path */ first_shard_file.path.empty() ? 
model_file.path : first_shard_file.path, - /* path_mmproj */ mmproj_file.path // can be empty - }; - if (!model.path.empty()) { - models.push_back(model); - } - }; - auto files = fs_list(dir, true); - for (const auto & file : files) { - if (file.is_dir) { - scan_subdir(file.path, file.name); - } else if (string_ends_with(file.name, ".gguf")) { - // single file model - std::string name = file.name; - string_replace_all(name, ".gguf", ""); local_model model{ - /* name */ name, - /* path */ file.path, - /* path_mmproj */ "" + /* name */ name, + /* display_name */ sanitize_model_name(name), + /* path */ model_path, + /* path_mmproj */ mmproj_path // can be empty }; - models.push_back(model); + if (!model.path.empty()) { + models.push_back(model); + } + }; + + scan_subdir(dir, ""); + + // when scanning the root, the name is empty, so adjust names for models directly under models_dir + for (auto & model : models) { + if (model.name.empty() && !model.path.empty()) { + model.name = std::filesystem::path(model.path).filename().string(); + string_replace_all(model.name, ".gguf", ""); + model.display_name = sanitize_model_name(model.name); } } return models; @@ -138,8 +169,8 @@ static std::vector list_local_models(const std::string & dir) { // -server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path) - : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) { +server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path, const std::string & models_dir) + : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)), models_dir(models_dir) { if (!presets_path.empty()) { presets = common_presets_load(presets_path, ctx_params); SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); @@ -154,6 +185,7 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para if (env == "LLAMA_ARG_PORT" 
|| env == "LLAMA_ARG_HOST" || env == "LLAMA_ARG_ALIAS" || + env == "LLAMA_ARG_MODELS_PRESET" || env == "LLAMA_ARG_API_KEY" || env == "LLAMA_ARG_MODELS_DIR" || env == "LLAMA_ARG_MODELS_MAX" || @@ -208,9 +240,17 @@ void server_presets::render_args(server_model_meta & meta) { if (meta.in_cache) { preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name; } else { - preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path; + std::string model_path = meta.path; + if (!models_dir.empty() && !std::filesystem::path(model_path).is_absolute()) { + model_path = models_dir + "/" + model_path; + } + preset.options[control_args["LLAMA_ARG_MODEL"]] = model_path; if (!meta.path_mmproj.empty()) { - preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj; + std::string mmproj_path = meta.path_mmproj; + if (!models_dir.empty() && !std::filesystem::path(mmproj_path).is_absolute()) { + mmproj_path = models_dir + "/" + mmproj_path; + } + preset.options[control_args["LLAMA_ARG_MMPROJ"]] = mmproj_path; } } meta.args = preset.to_args(); @@ -224,7 +264,7 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) { + char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset, params.models_dir) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } @@ -232,12 +272,13 @@ server_models::server_models( base_env.push_back(std::string(*env)); } GGML_ASSERT(!base_args.empty()); - // set binary path + // Save binary path before base_args is modified by presets parsing try { - base_args[0] = get_server_exec_path().string(); + server_binary_path = get_server_exec_path().string(); } catch (const std::exception & e) { LOG_WRN("failed to get server executable path: %s\n", e.what()); - LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str()); + LOG_WRN("using original argv[0] as fallback: 
%s\n", argv[0]); + server_binary_path = std::string(argv[0]); } load_models(); } @@ -307,13 +348,18 @@ void server_models::load_models() { if (!base_params.models_dir.empty()) { auto local_models = list_local_models(base_params.models_dir); for (const auto & model : local_models) { - if (mapping.find(model.name) != mapping.end()) { + const std::string name = model.display_name; + if (mapping.find(name) != mapping.end()) { // already exists in cached models, skip continue; } + auto preset = presets.get_preset(name); + if (preset.name.empty() && name != model.name) { + preset = presets.get_preset(model.name); + } server_model_meta meta{ - /* preset */ presets.get_preset(model.name), - /* name */ model.name, + /* preset */ preset, + /* name */ name, /* path */ model.path, /* path_mmproj */ model.path_mmproj, /* in_cache */ false, @@ -506,11 +552,15 @@ void server_models::load(const std::string & name) { throw std::runtime_error("failed to get a port number"); } + presets.render_args(inst.meta); + inst.subproc = std::make_shared(); { SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); std::vector child_args = inst.meta.args; // copy + // Insert binary path as argv[0] + child_args.insert(child_args.begin(), server_binary_path); std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 9cdbbad9b6a..2aa93cab742 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -75,8 +75,9 @@ struct server_presets { common_params_context ctx_params; std::map base_args; std::map control_args; // args reserved for server control + std::string models_dir; - server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir); + server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path, const 
std::string & models_dir); common_preset get_preset(const std::string & name); void render_args(server_model_meta & meta); }; @@ -99,6 +100,7 @@ struct server_models { common_params base_params; std::vector base_args; std::vector base_env; + std::string server_binary_path; server_presets presets; From a7baeab410a7053ed0467809bc0eac2f539d242d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 22:14:31 +0100 Subject: [PATCH 12/19] Revert "server: fix router model discovery and child process spawning" This reverts commit e3832b42eeea7fcb108995966c7584479f745857. --- tools/server/server-models.cpp | 144 +++++++++++---------------------- tools/server/server-models.h | 4 +- 2 files changed, 48 insertions(+), 100 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index a60cf82b941..18e21c00d66 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -7,7 +7,6 @@ #include // TODO: remove this once we use HTTP client from download.h #include -#include #include #include #include @@ -78,87 +77,57 @@ static std::filesystem::path get_server_exec_path() { struct local_model { std::string name; - std::string display_name; std::string path; std::string path_mmproj; }; -static std::string sanitize_model_name(const std::string & name) { - std::string sanitized = name; - string_replace_all(sanitized, "/", "_"); - string_replace_all(sanitized, "\\", "_"); - return sanitized; -} - static std::vector list_local_models(const std::string & dir) { if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); } std::vector models; - std::function scan_subdir = - [&](const std::string & subdir_path, const std::string & name) { - auto files = fs_list(subdir_path, true); // Need directories for recursion - common_file_info model_file; - common_file_info first_shard_file; - common_file_info 
mmproj_file; - - for (const auto & file : files) { - if (file.is_dir) { - const std::string child_name = name.empty() ? file.name : name + "/" + file.name; - scan_subdir(file.path, child_name); - continue; - } - - if (string_ends_with(file.name, ".gguf")) { - if (file.name.find("mmproj") != std::string::npos) { - mmproj_file = file; - } else if (file.name.find("-00001-of-") != std::string::npos) { - first_shard_file = file; - } else { - model_file = file; - } - } - } - - // Convert absolute paths to relative - std::string model_path = first_shard_file.path.empty() ? model_file.path : first_shard_file.path; - if (!model_path.empty()) { - std::error_code ec; - auto rel_path = std::filesystem::relative(model_path, dir, ec); - if (!ec) { - model_path = rel_path.generic_string(); - } - } - - std::string mmproj_path = mmproj_file.path; - if (!mmproj_path.empty()) { - std::error_code ec; - auto rel_path = std::filesystem::relative(mmproj_path, dir, ec); - if (!ec) { - mmproj_path = rel_path.generic_string(); + auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { + auto files = fs_list(subdir_path, false); + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + for (const auto & file : files) { + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; } } + } + // single file model + local_model model{ + /* name */ name, + /* path */ first_shard_file.path.empty() ? 
model_file.path : first_shard_file.path, + /* path_mmproj */ mmproj_file.path // can be empty + }; + if (!model.path.empty()) { + models.push_back(model); + } + }; + auto files = fs_list(dir, true); + for (const auto & file : files) { + if (file.is_dir) { + scan_subdir(file.path, file.name); + } else if (string_ends_with(file.name, ".gguf")) { + // single file model + std::string name = file.name; + string_replace_all(name, ".gguf", ""); local_model model{ - /* name */ name, - /* display_name */ sanitize_model_name(name), - /* path */ model_path, - /* path_mmproj */ mmproj_path // can be empty + /* name */ name, + /* path */ file.path, + /* path_mmproj */ "" }; - if (!model.path.empty()) { - models.push_back(model); - } - }; - - scan_subdir(dir, ""); - - // when scanning the root, the name is empty, so adjust names for models directly under models_dir - for (auto & model : models) { - if (model.name.empty() && !model.path.empty()) { - model.name = std::filesystem::path(model.path).filename().string(); - string_replace_all(model.name, ".gguf", ""); - model.display_name = sanitize_model_name(model.name); + models.push_back(model); } } return models; @@ -169,8 +138,8 @@ static std::vector list_local_models(const std::string & dir) { // -server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path, const std::string & models_dir) - : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)), models_dir(models_dir) { +server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path) + : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) { if (!presets_path.empty()) { presets = common_presets_load(presets_path, ctx_params); SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str()); @@ -185,7 +154,6 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para if (env == "LLAMA_ARG_PORT" 
|| env == "LLAMA_ARG_HOST" || env == "LLAMA_ARG_ALIAS" || - env == "LLAMA_ARG_MODELS_PRESET" || env == "LLAMA_ARG_API_KEY" || env == "LLAMA_ARG_MODELS_DIR" || env == "LLAMA_ARG_MODELS_MAX" || @@ -240,17 +208,9 @@ void server_presets::render_args(server_model_meta & meta) { if (meta.in_cache) { preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name; } else { - std::string model_path = meta.path; - if (!models_dir.empty() && !std::filesystem::path(model_path).is_absolute()) { - model_path = models_dir + "/" + model_path; - } - preset.options[control_args["LLAMA_ARG_MODEL"]] = model_path; + preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path; if (!meta.path_mmproj.empty()) { - std::string mmproj_path = meta.path_mmproj; - if (!models_dir.empty() && !std::filesystem::path(mmproj_path).is_absolute()) { - mmproj_path = models_dir + "/" + mmproj_path; - } - preset.options[control_args["LLAMA_ARG_MMPROJ"]] = mmproj_path; + preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj; } } meta.args = preset.to_args(); @@ -264,7 +224,7 @@ server_models::server_models( const common_params & params, int argc, char ** argv, - char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset, params.models_dir) { + char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) { for (int i = 0; i < argc; i++) { base_args.push_back(std::string(argv[i])); } @@ -272,13 +232,12 @@ server_models::server_models( base_env.push_back(std::string(*env)); } GGML_ASSERT(!base_args.empty()); - // Save binary path before base_args is modified by presets parsing + // set binary path try { - server_binary_path = get_server_exec_path().string(); + base_args[0] = get_server_exec_path().string(); } catch (const std::exception & e) { LOG_WRN("failed to get server executable path: %s\n", e.what()); - LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]); - server_binary_path = std::string(argv[0]); + LOG_WRN("using 
original argv[0] as fallback: %s\n", base_args[0].c_str()); } load_models(); } @@ -348,18 +307,13 @@ void server_models::load_models() { if (!base_params.models_dir.empty()) { auto local_models = list_local_models(base_params.models_dir); for (const auto & model : local_models) { - const std::string name = model.display_name; - if (mapping.find(name) != mapping.end()) { + if (mapping.find(model.name) != mapping.end()) { // already exists in cached models, skip continue; } - auto preset = presets.get_preset(name); - if (preset.name.empty() && name != model.name) { - preset = presets.get_preset(model.name); - } server_model_meta meta{ - /* preset */ preset, - /* name */ name, + /* preset */ presets.get_preset(model.name), + /* name */ model.name, /* path */ model.path, /* path_mmproj */ model.path_mmproj, /* in_cache */ false, @@ -552,15 +506,11 @@ void server_models::load(const std::string & name) { throw std::runtime_error("failed to get a port number"); } - presets.render_args(inst.meta); - inst.subproc = std::make_shared(); { SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); std::vector child_args = inst.meta.args; // copy - // Insert binary path as argv[0] - child_args.insert(child_args.begin(), server_binary_path); std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2aa93cab742..9cdbbad9b6a 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -75,9 +75,8 @@ struct server_presets { common_params_context ctx_params; std::map base_args; std::map control_args; // args reserved for server control - std::string models_dir; - server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path, const std::string & models_dir); + server_presets(int argc, char ** argv, common_params & base_params, const 
std::string & models_dir); common_preset get_preset(const std::string & name); void render_args(server_model_meta & meta); }; @@ -100,7 +99,6 @@ struct server_models { common_params base_params; std::vector base_args; std::vector base_env; - std::string server_binary_path; server_presets presets; From a70419c0115c2393b2ca59f8eb7a4d7a52d1fb2f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 22:16:07 +0100 Subject: [PATCH 13/19] clarify about "no-" prefix --- tools/server/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index 649f1e7ca2b..d6b9b87dcf7 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1444,8 +1444,8 @@ version = 1 chat-template = chatml ; numeric value n-gpu-layer = 123 -; boolean value -jinja = false +; flag value (for certain flags, you need to use the "no-" prefix for negation) +jinja = true ; shorthand argument (for example, context size) c = 4096 ; environment variable name From 97de3114eca2444b12190c32f816eb63156f9a03 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 22:22:28 +0100 Subject: [PATCH 14/19] correct render_args() to include binary path --- tools/server/server-models.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 18e21c00d66..a92c5e06fed 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -214,6 +214,8 @@ void server_presets::render_args(server_model_meta & meta) { } } meta.args = preset.to_args(); + // add back the binary path at the front + meta.args.insert(meta.args.begin(), get_server_exec_path().string()); } // @@ -510,6 +512,8 @@ void server_models::load(const std::string & name) { { SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); + presets.render_args(inst.meta); // update meta.args + std::vector child_args = inst.meta.args; // 
copy std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); From f645e887f23dcda2802abf2ad16a103eb2c4461b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 22:28:35 +0100 Subject: [PATCH 15/19] also remove arg LLAMA_ARG_MODELS_PRESET for child --- tools/server/server-models.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index a92c5e06fed..a823df4e1e3 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -157,6 +157,7 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para env == "LLAMA_ARG_API_KEY" || env == "LLAMA_ARG_MODELS_DIR" || env == "LLAMA_ARG_MODELS_MAX" || + env == "LLAMA_ARG_MODELS_PRESET" || env == "LLAMA_ARG_MODEL" || env == "LLAMA_ARG_MMPROJ" || env == "LLAMA_ARG_HF_REPO" || From 6bda0d47ede87e30074b3bc73c7c5b9d1308512f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 8 Dec 2025 23:52:15 +0100 Subject: [PATCH 16/19] add co-author for ini parser code Co-authored-by: aldehir From 035f56adbc961cf65860c87364f3eb7b5ff8f5e2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 9 Dec 2025 14:43:47 +0100 Subject: [PATCH 17/19] also set LLAMA_ARG_HOST --- tools/server/server-models.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index a823df4e1e3..3c4b0015e7f 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -204,6 +204,7 @@ void server_presets::render_args(server_model_meta & meta) { } // 3. 
control args (from router) // set control values + preset.options[control_args["LLAMA_ARG_HOST"]] = "127.0.0.1"; preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port); preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name; if (meta.in_cache) { From f2ad7dc9db790d7db6744065d764bc2deb4485fa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 9 Dec 2025 17:51:42 +0100 Subject: [PATCH 18/19] add CHILD_ADDR --- tools/server/server-models.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 3c4b0015e7f..6c618a673c9 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -34,6 +34,10 @@ #define CMD_EXIT "exit" +// address for child process, this is needed because router may run on 0.0.0.0 +// ref: https://github.com/ggml-org/llama.cpp/issues/17862 +#define CHILD_ADDR "127.0.0.1" + static std::filesystem::path get_server_exec_path() { #if defined(_WIN32) wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths @@ -204,7 +208,7 @@ void server_presets::render_args(server_model_meta & meta) { } // 3. 
control args (from router) // set control values - preset.options[control_args["LLAMA_ARG_HOST"]] = "127.0.0.1"; + preset.options[control_args["LLAMA_ARG_HOST"]] = CHILD_ADDR; preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port); preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name; if (meta.in_cache) { @@ -693,7 +697,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port); auto proxy = std::make_unique( method, - base_params.hostname, + CHILD_ADDR, meta->port, req.path, req.headers, From b36b3fe1a4404348071d90166a5ce7fb93733dbc Mon Sep 17 00:00:00 2001 From: Pascal Date: Wed, 10 Dec 2025 14:30:04 +0100 Subject: [PATCH 19/19] Remove dead code --- common/arg.cpp | 19 ------------------- common/arg.h | 4 ---- common/preset.cpp | 13 ------------- common/preset.h | 1 - 4 files changed, 37 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index f2b460f86dd..b333f45c96a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -74,17 +74,6 @@ static const std::vector & get_common_arg_defs() { return options; } -std::string common_arg_get_env_name(const std::string & flag) { - for (const auto & arg : get_common_arg_defs()) { - for (const auto & arg_flag : arg.args) { - if (arg_flag == flag) { - return arg.env ? 
arg.env : ""; - } - } - } - return ""; -} - common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = examples; return *this; @@ -3325,11 +3314,3 @@ common_params_context common_params_parser_init(common_params & params, llama_ex return ctx_arg; } - -static std::string rm_leading_dashes(const std::string & str) { - size_t pos = 0; - while (pos < str.size() && str[pos] == '-') { - ++pos; - } - return str.substr(pos); -} diff --git a/common/arg.h b/common/arg.h index 161d0688cd0..219c115e635 100644 --- a/common/arg.h +++ b/common/arg.h @@ -107,10 +107,6 @@ bool common_params_parse(int argc, char ** argv, llama_example ex, std::map "LLAMA_ARG_CTX_SIZE") -// Returns empty string if flag not found -std::string common_arg_get_env_name(const std::string & flag); - struct common_remote_params { std::vector headers; long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout diff --git a/common/preset.cpp b/common/preset.cpp index c07e68b28be..09ac171b720 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -178,16 +178,3 @@ common_presets common_presets_load(const std::string & path, common_params_conte return out; } - -void common_presets_save(const std::string & path, const common_presets & presets) { - std::ofstream file(path); - if (!file.good()) { - throw std::runtime_error("failed to open preset file for writing: " + path); - } - - file << "version = 1\n\n"; - - for (const auto & it : presets) { - file << it.second.to_ini(); - } -} diff --git a/common/preset.h b/common/preset.h index d200bdce809..dceb849eb81 100644 --- a/common/preset.h +++ b/common/preset.h @@ -30,4 +30,3 @@ struct common_preset { // interface for multiple presets in one file using common_presets = std::map; common_presets common_presets_load(const std::string & path, common_params_context & ctx_params); -void common_presets_save(const std::string & path, const common_presets & presets);