From da400e0801ce87415f62baa8a14173364a87c9df Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sun, 30 Nov 2025 14:38:51 +0100
Subject: [PATCH 01/32] wip

---
 tools/CMakeLists.txt            |    1 +
 tools/cli/CMakeLists.txt        |    8 +
 tools/cli/cli.cpp               | 1000 +++++++++++++++++++++++++++++++
 tools/main/CMakeLists.txt       |    2 +-
 tools/server/server-context.cpp |   35 +-
 tools/server/server-task.h      |   17 +
 6 files changed, 1061 insertions(+), 2 deletions(-)
 create mode 100644 tools/cli/CMakeLists.txt
 create mode 100644 tools/cli/cli.cpp

diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index d64956b8438..0bc42fa59d8 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -18,6 +18,7 @@ else()
     add_subdirectory(gguf-split)
     add_subdirectory(imatrix)
     add_subdirectory(llama-bench)
+    add_subdirectory(cli)
    add_subdirectory(main)
     add_subdirectory(perplexity)
     add_subdirectory(quantize)
diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt
new file mode 100644
index 00000000000..89f8305f3e2
--- /dev/null
+++ b/tools/cli/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGET llama-cli)
+add_executable(${TARGET} cli.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp
new file mode 100644
index 00000000000..78b42267b59
--- /dev/null
+++ b/tools/cli/cli.cpp
@@ -0,0 +1,1000 @@
+#include "arg.h"
+#include "common.h"
+#include "console.h"
+#include "log.h"
+#include "sampling.h"
+#include "llama.h"
+#include "chat.h"
+
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static llama_context ** g_ctx;
+static llama_model ** g_model;
+static common_sampler ** g_smpl;
+static common_params * g_params;
+static std::vector<llama_token> * g_input_tokens;
+static std::ostringstream * g_output_ss;
+static std::vector<llama_token> * g_output_tokens;
+static bool is_interacting = false;
+static bool need_insert_eot = false;
+
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    LOG("\nexample usage:\n");
+    LOG("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]);
+    LOG("\n  chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]);
+    LOG("\n");
+}
+
+static bool file_exists(const std::string & path) {
+    std::ifstream f(path.c_str());
+    return f.good();
+}
+
+static bool file_is_empty(const std::string & path) {
+    std::ifstream f;
+    f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+    f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
+    return f.tellg() == 0;
+}
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+static void sigint_handler(int signo) {
+    if (signo == SIGINT) {
+        if (!is_interacting && g_params->interactive) {
+            is_interacting  = true;
+            need_insert_eot = true;
+        } else {
+            console::cleanup();
+            LOG("\n");
+            common_perf_print(*g_ctx, *g_smpl);
+
+            // make sure all logs are flushed
+            LOG("Interrupted by user\n");
+            common_log_pause(common_log_main());
+
+            _exit(130);
+        }
+    }
+}
+#endif
+
+int main(int argc, char **
argv) { + common_params params; + g_params = ¶ms; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { + return 1; + } + + common_init(); + + auto & sparams = params.sampling; + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + if (params.embedding) { + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); + + return 0; + } + + if (params.n_ctx != 0 && params.n_ctx < 8) { + LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + if (params.rope_freq_base != 0.0) { + LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 0.0) { + LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + } + + LOG_INF("%s: llama backend init\n", __func__); + + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model = nullptr; + llama_context * ctx = nullptr; + common_sampler * smpl = nullptr; + + g_model = &model; + g_ctx = &ctx; + g_smpl = &smpl; + + std::vector chat_msgs; + + // load the model and apply lora adapter, if any + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); + common_init_result llama_init = common_init_from_params(params); + + model = llama_init.model.get(); + ctx = llama_init.context.get(); + + if (model == NULL) { + LOG_ERR("%s: error: unable to load model\n", __func__); + return 1; + } + + llama_memory_t mem = llama_get_memory(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + // note: the time for chat template initialization is not negligible: + auto chat_templates = common_chat_templates_init(model, params.chat_template); + + // start measuring performance timings from here + llama_perf_context_reset(ctx); + + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + LOG_ERR("%s: no CPU backend found\n", __func__); + return 1; + } + auto * reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + set_process_priority(params.cpuparams.priority); + + struct ggml_threadpool * threadpool_batch = NULL; + if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); + if (!threadpool_batch) { + LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + return 1; + } + + // start the non-batch threadpool in the paused state + tpp.paused = true; + } + + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); + if (!threadpool) { + LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + return 1; + } + + llama_attach_threadpool(ctx, threadpool, threadpool_batch); + + const int 
n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + + if (n_ctx > n_ctx_train) { + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); + } + + // auto enable conversation mode if chat template is available + const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get()); + if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { + if (has_chat_template) { + LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); + params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + } else { + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + } + } + + // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning + if (params.conversation_mode && !has_chat_template) { + LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__); + } + + // print chat template example in conversation mode + if (params.conversation_mode) { + if (params.enable_chat_template) { + if (!params.prompt.empty() && params.system_prompt.empty()) { + LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n"); + } + + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs).c_str()); + } else { + LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + } + } + + // print system information + { + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); + } + + std::string path_session = params.path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) { + LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + if (!file_exists(path_session)) { + LOG_INF("%s: session file does not exist, will create.\n", __func__); + } else if (file_is_empty(path_session)) { + LOG_INF("%s: The session file is empty. 
A new session will be initialized.\n", __func__); + } else { + // The file exists and is not empty + session_tokens.resize(n_ctx); + size_t n_token_count_out = 0; + if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } + session_tokens.resize(n_token_count_out); + LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + } + } + + const bool add_bos = llama_vocab_get_add_bos(vocab) && !params.use_jinja; + if (!llama_model_has_encoder(model)) { + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + } + + LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); + + std::vector embd_inp; + + bool waiting_for_first_input = false; + auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) { + common_chat_msg new_msg; + new_msg.role = role; + new_msg.content = content; + auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja); + chat_msgs.push_back(new_msg); + LOG_DBG("formatted: '%s'\n", formatted.c_str()); + return formatted; + }; + + std::string prompt; + { + if (params.conversation_mode && params.enable_chat_template) { + if (!params.system_prompt.empty()) { + // format the system prompt (will use template default if empty) + chat_add_and_format("system", params.system_prompt); + } + + if (!params.prompt.empty()) { + // format and append the user prompt + chat_add_and_format("user", params.prompt); + } else { + waiting_for_first_input = true; + } + + if (!params.system_prompt.empty() || !params.prompt.empty()) { + common_chat_templates_inputs inputs; + inputs.use_jinja = g_params->use_jinja; + inputs.messages = chat_msgs; + inputs.add_generation_prompt = !params.prompt.empty(); + + prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt; + } + } else { + // otherwise use the prompt as is + prompt = params.prompt; + } + + if (params.interactive_first || !prompt.empty() || session_tokens.empty()) { + LOG_DBG("tokenize the prompt\n"); + embd_inp = common_tokenize(ctx, prompt, true, true); + } else { + LOG_DBG("use session tokens\n"); + embd_inp = session_tokens; + } + + LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); + } + + // Should not run without any tokens + if (!waiting_for_first_input && embd_inp.empty()) { + if (add_bos) { + embd_inp.push_back(llama_vocab_bos(vocab)); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); + } else { + LOG_ERR("input is empty\n"); + return -1; + } + } + + // Tokenize negative prompt + if ((int) embd_inp.size() > n_ctx - 4) { + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + // debug message about similarity of saved session, if applicable + size_t n_matching_session_tokens = 0; + if (!session_tokens.empty()) { + for (llama_token id : session_tokens) { + if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { + break; + } + n_matching_session_tokens++; + } + if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { + LOG_INF("%s: using full prompt from session file\n", __func__); + } else if (n_matching_session_tokens >= embd_inp.size()) { + LOG_INF("%s: session file has 
exact match for prompt!\n", __func__); + } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } else { + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } + + // remove any "future" tokens that we might have inherited from the previous session + if (!llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1)) { + LOG_INF("%s: unable to resuse common prefix\n", __func__); + n_matching_session_tokens = 0; + llama_memory_seq_rm(mem, -1, -1, -1); + } + } + + LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", + embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); + + // if we will use the cache for the full prompt without reaching the end of the cache, force + // reevaluation of the last token to recalculate the cached logits + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { + LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); + + session_tokens.resize(embd_inp.size() - 1); + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) { + params.n_keep = (int)embd_inp.size(); + } else { + params.n_keep += add_bos; // always keep the BOS token + } + + if (params.conversation_mode) { + if (params.single_turn && !params.prompt.empty()) { + params.interactive = false; + params.interactive_first = false; + } else { + params.interactive_first = true; + } + } + + // enable interactive mode if interactive start is specified + if (params.interactive_first) { + params.interactive = true; + } + + if (params.verbose_prompt) { + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (params.n_keep > add_bos) { + LOG_INF("%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); + } + LOG_CNT("'\n"); + } + LOG_INF("\n"); + } + + // ctrl+C handling + { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? 
(sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + } + + if (params.interactive) { + LOG_INF("%s: interactive mode on.\n", __func__); + + if (!params.antiprompt.empty()) { + for (const auto & antiprompt : params.antiprompt) { + LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) { + auto tmp = common_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + + if (params.input_prefix_bos) { + LOG_INF("Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) { + LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) { + auto tmp = common_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + + if (!params.input_suffix.empty()) { + LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); + if (params.verbose_prompt) { + auto tmp = common_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + + smpl = common_sampler_init(model, sparams); + if (!smpl) { + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + return 1; + } + + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); + + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + // group-attention state + // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) + int ga_i = 0; + + const int ga_n = params.grp_attn_n; + const int ga_w = params.grp_attn_w; + + if (ga_n != 1) { + GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT + //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT + //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT + LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + } + LOG_INF("\n"); + + if (params.interactive) { + const char * control_message; + if (params.multiline_input) { + control_message = " - To return control to the AI, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to the AI.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + LOG_INF("== Running in interactive mode. ==\n"); +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + LOG_INF( " - Press Ctrl+C to interject at any time.\n"); +#endif + LOG_INF( "%s", control_message); + if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) { + LOG_INF( " - Not using system message. 
To change it, set a different value via -sys PROMPT\n");
+        }
+        LOG_INF("\n");
+
+        is_interacting = params.interactive_first;
+    }
+
+    bool is_antiprompt = false;
+    bool input_echo    = true;
+    bool display       = true;
+    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
+
+    int n_past             = 0;
+    int n_remain           = params.n_predict;
+    int n_consumed         = 0;
+    int n_session_consumed = 0;
+
+    std::vector<llama_token> input_tokens;  g_input_tokens  = &input_tokens;
+    std::vector<llama_token> output_tokens; g_output_tokens = &output_tokens;
+    std::ostringstream       output_ss;     g_output_ss     = &output_ss;
+    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
+
+    // the first thing we will do is to output the prompt, so set color accordingly
+    console::set_display(console::prompt);
+    display = params.display_prompt;
+
+    std::vector<llama_token> embd;
+
+    // single-token antiprompts
+    std::vector<llama_token> antiprompt_token;
+
+    for (const std::string & antiprompt : params.antiprompt) {
+        auto ids = ::common_tokenize(ctx, antiprompt, false, true);
+        if (ids.size() == 1) {
+            antiprompt_token.push_back(ids[0]);
+        }
+    }
+
+    if (llama_model_has_encoder(model)) {
+        int enc_input_size = embd_inp.size();
+        llama_token * enc_input_buf = embd_inp.data();
+
+        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
+            LOG_ERR("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+            decoder_start_token_id = llama_vocab_bos(vocab);
+        }
+
+        embd_inp.clear();
+        embd_inp.push_back(decoder_start_token_id);
+    }
+
+    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
+        // predict
+        if (!embd.empty()) {
+            // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
+            // --prompt or --file which uses the same value.
+            int max_embd_size = n_ctx - 4;
+
+            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+            if ((int) embd.size() > max_embd_size) {
+                const int skipped_tokens = (int) embd.size() - max_embd_size;
+                embd.resize(max_embd_size);
+
+                console::set_display(console::error);
+                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ?
"s" : ""); + console::set_display(console::reset); + } + + if (ga_n == 1) { + // infinite text generation via context shifting + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + + if (n_past + (int) embd.size() >= n_ctx) { + if (!params.ctx_shift){ + LOG_WRN("\n\n%s: context full and context shift is disabled => stopping\n", __func__); + break; + } + + if (params.n_predict == -2) { + LOG_WRN("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_memory_seq_rm (mem, 0, params.n_keep , params.n_keep + n_discard); + llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); + } + } else { + // context extension via Self-Extend + while (n_past >= ga_i + ga_w) { + const int ib = (ga_n*ga_i)/ga_w; + const int bd = (ga_w/ga_n)*(ga_n - 1); + const int dd = (ga_w/ga_n) - ib*bd - ga_w; + + LOG_DBG("\n"); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + + llama_memory_seq_add(mem, 0, ga_i, n_past, ib*bd); + llama_memory_seq_div(mem, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_memory_seq_add(mem, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + + n_past -= bd; + + ga_i += ga_w/ga_n; + + LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + } + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int) session_tokens.size()) { + size_t i = 0; + for ( ; i < embd.size(); i++) { + if (embd[i] != session_tokens[n_session_consumed]) { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int) session_tokens.size()) { + ++i; + break; + } + } + if (i > 0) { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { + int n_eval = (int) embd.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); + + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { + LOG_ERR("%s : failed to eval\n", __func__); + return 1; + } + + n_past += n_eval; + + LOG_DBG("n_past = %d\n", n_past); + // Display total tokens alongside total time + if (params.n_print > 0 && n_past % params.n_print == 0) { + LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + } + } + + if (!embd.empty() && !path_session.empty()) { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + + if ((int) embd_inp.size() <= n_consumed && 
!is_interacting) { + // optionally save the session on first sample (for faster prompt loading next time) + if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { + need_to_save_session = false; + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + + LOG_DBG("saved session to %s\n", path_session.c_str()); + } + + const llama_token id = common_sampler_sample(smpl, ctx, -1); + + common_sampler_accept(smpl, id, /* accept_grammar= */ true); + + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); + + embd.push_back(id); + + if (params.conversation_mode && !waiting_for_first_input && !llama_vocab_is_eog(vocab, id)) { + assistant_ss << common_token_to_piece(ctx, id, false); + } + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + + LOG_DBG("n_remain: %d\n", n_remain); + } else { + // some user input remains from prompt or interaction, forward it to processing + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); + + ++n_consumed; + if ((int) embd.size() >= params.n_batch) { + break; + } + } + } + + // display text + if (input_echo && display) { + for (auto id : embd) { + const std::string token_str = common_token_to_piece(ctx, id, params.special); + + // Console/Stream Output + LOG("%s", token_str.c_str()); + + // Record Displayed Tokens To Log + // Note: Generated tokens are created one by one hence this check + if (embd.size() > 1) { + // Incoming Requested Tokens + input_tokens.push_back(id); + } else { + // Outgoing Generated Tokens + output_tokens.push_back(id); + output_ss << token_str; + } + } + } + + // reset color to default if there is no pending user input + if (input_echo && (int) embd_inp.size() == n_consumed) { + console::set_display(console::reset); + display = true; + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + // check for reverse prompt in the last n_prev tokens + if (!params.antiprompt.empty()) { + const int n_prev = 32; + const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (std::string & antiprompt : params.antiprompt) { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? 
last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + + // check for reverse prompt using special tokens + // avoid calling common_sampler_last() if last_output is empty + if (!last_output.empty()) { + llama_token last_token = common_sampler_last(smpl); + for (auto token : antiprompt_token) { + if (token == last_token) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + } + + if (is_antiprompt) { + LOG_DBG("found antiprompt: %s\n", last_output.c_str()); + } + } + + // deal with end of generation tokens in interactive mode + if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { + LOG_DBG("found an EOG token\n"); + + if (params.interactive) { + if (!params.antiprompt.empty()) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + is_antiprompt = true; + } + + if (params.enable_chat_template) { + chat_add_and_format("assistant", assistant_ss.str()); + } + is_interacting = true; + LOG("\n"); + } + } + + if (params.conversation_mode && !waiting_for_first_input) { + if (!prompt.empty()) { + prompt.clear(); + is_interacting = false; + } + } + + if ((n_past > 0 || waiting_for_first_input) && is_interacting) { + LOG_DBG("waiting for user input\n"); + + if (params.conversation_mode) { + LOG("\n> "); + } + + if (params.input_prefix_bos) { + LOG_DBG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_vocab_bos(vocab)); + } + + std::string buffer; + if (!params.input_prefix.empty() && !params.conversation_mode) { + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("%s", params.input_prefix.c_str()); + } + + // color user input only + console::set_display(console::user_input); + display = params.display_prompt; + + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // done taking input, reset color + console::set_display(console::reset); + display = true; + + if (buffer.empty()) { // Ctrl+D on empty line exits + LOG("EOF by user\n"); + break; + } + + if (buffer.back() == '\n') { + // Implement #587: + // If the user wants the text to end in a newline, + // this should be accomplished by explicitly adding a newline by using \ followed by return, + // then returning control by pressing return again. + buffer.pop_back(); + } + + if (buffer.empty()) { // Enter key on empty line lets the user pass control back + LOG_DBG("empty line, passing control back\n"); + } else { // Add tokens to embd only if the input buffer is non-empty + // append input suffix if any + if (!params.input_suffix.empty() && !params.conversation_mode) { + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); + } + + LOG_DBG("buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + + if (params.escape) { + string_process_escapes(buffer); + } + + bool format_chat = params.conversation_mode && params.enable_chat_template; + std::string user_inp = format_chat + ? 
chat_add_and_format("user", std::move(buffer)) + : std::move(buffer); + // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) + const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true); + + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); + + // if user stop generation mid-way, we must add EOT to finish model's last response + if (need_insert_eot && format_chat) { + llama_token eot = llama_vocab_eot(vocab); + embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot); + need_insert_eot = false; + } + + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); + + if (params.verbose_prompt) { + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size() - original_size); + } + + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + const std::string token_str = common_token_to_piece(ctx, token); + output_tokens.push_back(token); + output_ss << token_str; + + if (params.verbose_prompt) { + LOG_INF("%6d -> '%s'\n", token, token_str.c_str()); + } + } + + // reset assistant message + assistant_ss.str(""); + + n_remain -= line_inp.size(); + LOG_DBG("n_remain: %d\n", n_remain); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0 || waiting_for_first_input) { + if (is_interacting) { + common_sampler_reset(smpl); + } + is_interacting = false; + + if (waiting_for_first_input && params.single_turn) { + params.interactive = false; + params.interactive_first = false; + } + waiting_for_first_input = false; + } + } + + // end of generation + if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) { + LOG(" [end of text]\n"); + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). 
+        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
+            n_remain = params.n_predict;
+            is_interacting = true;
+        }
+    }
+
+    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
+        LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+    }
+
+    LOG("\n\n");
+    common_perf_print(ctx, smpl);
+
+    common_sampler_free(smpl);
+
+    llama_backend_free();
+
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);
+
+    return 0;
+}
diff --git a/tools/main/CMakeLists.txt b/tools/main/CMakeLists.txt
index 8f8e9d444cf..a39a6552246 100644
--- a/tools/main/CMakeLists.txt
+++ b/tools/main/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET llama-cli)
+set(TARGET llama-cli-old)
 add_executable(${TARGET} main.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 2bf3924df90..52ce58ec6f2 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1640,7 +1640,40 @@ struct server_context_impl {
                 res->id = task.id;
                 queue_results.send(std::move(res));
             } break;
-
+        case SERVER_TASK_TYPE_FORMAT_INPUT:
+            {
+                auto res = std::make_unique<server_task_result_format_input>();
+                res->id = task.id;
+                try {
+                    auto & opt = oai_parser_opt;
+                    common_chat_templates_inputs inputs;
+                    inputs.messages              = common_chat_msgs_parse_oaicompat(task.input_raw);
+                    inputs.tools                 = {}; // TODO
+                    inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
+                    inputs.json_schema           = ""; // TODO
+                    inputs.grammar               = ""; // TODO
+                    inputs.use_jinja             = opt.use_jinja;
+                    inputs.parallel_tool_calls   = false;
+                    inputs.add_generation_prompt = true;
+                    inputs.reasoning_format      = opt.reasoning_format;
+                    inputs.enable_thinking       = opt.enable_thinking;
+
+                    // Apply chat template to the list of messages
+                    auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
+
+                    // tokenize the resulting prompt
+                    auto & prompt = chat_params.prompt;
+                    if (mctx != nullptr) {
+                        res->tokens = process_mtmd_prompt(mctx, prompt, task.input_files);
+                    } else {
+                        res->tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
+                    }
+                } catch (const std::exception & e) {
+                    send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
+                    break;
+                }
+                queue_results.send(std::move(res));
+            } break;
     }
 }
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index a22d7cab116..c017a6a7ea3 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -24,6 +24,7 @@ enum server_task_type {
     SERVER_TASK_TYPE_SLOT_RESTORE,
     SERVER_TASK_TYPE_SLOT_ERASE,
     SERVER_TASK_TYPE_SET_LORA,
+    SERVER_TASK_TYPE_FORMAT_INPUT, // only used by CLI
 };
 
 // TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common
@@ -109,6 +110,10 @@ struct server_task {
     // used by SERVER_TASK_TYPE_SET_LORA
     std::vector<common_adapter_lora_info> set_lora;
 
+    // used by SERVER_TASK_TYPE_FORMAT_INPUT
+    json input_raw; // TODO: maybe use something more efficient than json
+    std::vector input_files;
+
     server_task() = default;
 
     server_task(server_task_type type) : type(type) {}
@@ -401,6 +406,18 @@ struct server_task_result_apply_lora : server_task_result {
     virtual json to_json() override;
 };
 
+struct server_task_result_format_input : server_task_result {
+    int index = 0;
+    error_type err_type =
ERROR_TYPE_SERVER; + std::string err_msg; + + server_tokens tokens; + + virtual json to_json() override { + return json{}; // unused + } +}; + struct server_prompt_checkpoint { llama_pos pos_min; llama_pos pos_max; From fda30b8b3d94ab852d4760502cd8fb999624aea8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Nov 2025 23:10:53 +0100 Subject: [PATCH 02/32] wip --- common/download.cpp | 2 +- tools/cli/CMakeLists.txt | 17 +- tools/cli/cli.cpp | 1058 +++---------------------------- tools/server/server-context.cpp | 76 ++- tools/server/server-queue.cpp | 6 + tools/server/server-queue.h | 1 + tools/server/server-task.h | 21 +- 7 files changed, 173 insertions(+), 1008 deletions(-) diff --git a/common/download.cpp b/common/download.cpp index eeb32b6a863..436f33a373f 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -430,7 +430,7 @@ std::pair> common_remote_get_content(const std::string & curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L); + curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L); typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { auto data_vec = static_cast *>(data); diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt index 89f8305f3e2..56055f6957c 100644 --- a/tools/cli/CMakeLists.txt +++ b/tools/cli/CMakeLists.txt @@ -1,8 +1,21 @@ set(TARGET llama-cli) -add_executable(${TARGET} cli.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +add_executable(${TARGET} + cli.cpp + ../server/server-task.cpp + ../server/server-task.h + ../server/server-queue.cpp + ../server/server-queue.h + ../server/server-common.cpp + ../server/server-common.h + ../server/server-context.cpp + ../server/server-context.h + ) +target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) +include_directories(../server) +include_directories(../mtmd) + if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} RUNTIME) endif() diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 78b42267b59..18e05481f55 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -1,1000 +1,150 @@ -#include "arg.h" #include "common.h" +#include "arg.h" #include "console.h" #include "log.h" -#include "sampling.h" -#include "llama.h" -#include "chat.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#include -#endif - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static llama_context ** g_ctx; -static llama_model ** g_model; -static common_sampler ** g_smpl; -static common_params * g_params; -static std::vector * g_input_tokens; -static std::ostringstream * g_output_ss; -static std::vector * g_output_tokens; -static bool is_interacting = false; -static bool need_insert_eot = false; -static void print_usage(int argc, char ** argv) { - (void) argc; +#include "server-context.h" +#include "server-task.h" - LOG("\nexample usage:\n"); - LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 
-no-cnv\n", argv[0]); - LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]); - LOG("\n"); -} +#define PRI(...) LOGV(-1, __VA_ARGS__) -static bool file_exists(const std::string & path) { - std::ifstream f(path.c_str()); - return f.good(); -} +constexpr int POLLING_SECONDS = 1; -static bool file_is_empty(const std::string & path) { - std::ifstream f; - f.exceptions(std::ifstream::failbit | std::ifstream::badbit); - f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - return f.tellg() == 0; +static bool g_is_interrupted = false; +static bool should_stop() { + return g_is_interrupted; } #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { - if (!is_interacting && g_params->interactive) { - is_interacting = true; - need_insert_eot = true; - } else { - console::cleanup(); - LOG("\n"); - common_perf_print(*g_ctx, *g_smpl); - - // make sure all logs are flushed - LOG("Interrupted by user\n"); - common_log_pause(common_log_main()); - - _exit(130); - } + g_is_interrupted = true; } } #endif -int main(int argc, char ** argv) { - common_params params; - g_params = ¶ms; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { - return 1; - } - - common_init(); - - auto & sparams = params.sampling; - - // save choice to use color for later - // (note for later: this is a slightly awkward choice) - console::init(params.simple_io, params.use_color); - atexit([]() { console::cleanup(); }); - - if (params.embedding) { - LOG_ERR("************\n"); - LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - LOG_ERR("************\n\n"); - - return 0; - } - - if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; - } - - if (params.rope_freq_base != 0.0) { - LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); - } - - if (params.rope_freq_scale != 0.0) { - LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); - } - - LOG_INF("%s: llama backend init\n", __func__); - - llama_backend_init(); - llama_numa_init(params.numa); - - llama_model * model = nullptr; - llama_context * ctx = nullptr; - common_sampler * smpl = nullptr; - - g_model = &model; - g_ctx = &ctx; - g_smpl = &smpl; - - std::vector chat_msgs; - - // load the model and apply lora adapter, if any - LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - common_init_result llama_init = common_init_from_params(params); - - model = llama_init.model.get(); - ctx = llama_init.context.get(); - - if (model == NULL) { - LOG_ERR("%s: error: unable to load model\n", __func__); - return 1; - } - - llama_memory_t mem = llama_get_memory(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - // note: the time for chat template initialization is not negligible: - auto chat_templates = common_chat_templates_init(model, params.chat_template); - - // start measuring performance timings from here - llama_perf_context_reset(ctx); - - LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); - - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (!cpu_dev) { - LOG_ERR("%s: no CPU backend found\n", __func__); - return 1; - } - auto * reg = ggml_backend_dev_backend_reg(cpu_dev); - 
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); - auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); - - struct ggml_threadpool_params tpp_batch = - ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); - struct ggml_threadpool_params tpp = - ggml_threadpool_params_from_cpu_params(params.cpuparams); - - set_process_priority(params.cpuparams.priority); - - struct ggml_threadpool * threadpool_batch = NULL; - if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); - if (!threadpool_batch) { - LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); - return 1; - } - - // start the non-batch threadpool in the paused state - tpp.paused = true; - } - - struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); - if (!threadpool) { - LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - return 1; - } - - llama_attach_threadpool(ctx, threadpool, threadpool_batch); - - const int n_ctx_train = llama_model_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); - - if (n_ctx > n_ctx_train) { - LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); - } - - // auto enable conversation mode if chat template is available - const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get()); - if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { - if (has_chat_template) { - LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); - params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; - } else { - params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; - } - } - - // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning - if (params.conversation_mode && !has_chat_template) { - LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__); - } - - // print chat template example in conversation mode - if (params.conversation_mode) { - if (params.enable_chat_template) { - if (!params.prompt.empty() && params.system_prompt.empty()) { - LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n"); - } - - LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs).c_str()); - } else { - LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); - } - } - - // print system information - { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); - } - - std::string path_session = params.path_prompt_cache; - std::vector session_tokens; - - if (!path_session.empty()) { - LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); - if (!file_exists(path_session)) { - LOG_INF("%s: session file does not exist, will create.\n", __func__); - } else if (file_is_empty(path_session)) { - LOG_INF("%s: The session file is empty. 
A new session will be initialized.\n", __func__); - } else { - // The file exists and is not empty - session_tokens.resize(n_ctx); - size_t n_token_count_out = 0; - if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); - return 1; - } - session_tokens.resize(n_token_count_out); - LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); - } - } - - const bool add_bos = llama_vocab_get_add_bos(vocab) && !params.use_jinja; - if (!llama_model_has_encoder(model)) { - GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); - } - - LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); - - std::vector embd_inp; - - bool waiting_for_first_input = false; - auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) { - common_chat_msg new_msg; - new_msg.role = role; - new_msg.content = content; - auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja); - chat_msgs.push_back(new_msg); - LOG_DBG("formatted: '%s'\n", formatted.c_str()); - return formatted; - }; - - std::string prompt; - { - if (params.conversation_mode && params.enable_chat_template) { - if (!params.system_prompt.empty()) { - // format the system prompt (will use template default if empty) - chat_add_and_format("system", params.system_prompt); - } - - if (!params.prompt.empty()) { - // format and append the user prompt - chat_add_and_format("user", params.prompt); - } else { - waiting_for_first_input = true; - } - - if (!params.system_prompt.empty() || !params.prompt.empty()) { - common_chat_templates_inputs inputs; - inputs.use_jinja = g_params->use_jinja; - inputs.messages = chat_msgs; - inputs.add_generation_prompt = !params.prompt.empty(); - - prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt; - } - } else { - // otherwise use the prompt as is - prompt = params.prompt; - } - - if (params.interactive_first || !prompt.empty() || session_tokens.empty()) { - LOG_DBG("tokenize the prompt\n"); - embd_inp = common_tokenize(ctx, prompt, true, true); - } else { - LOG_DBG("use session tokens\n"); - embd_inp = session_tokens; - } - - LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); - LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); - } - - // Should not run without any tokens - if (!waiting_for_first_input && embd_inp.empty()) { - if (add_bos) { - embd_inp.push_back(llama_vocab_bos(vocab)); - LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); - } else { - LOG_ERR("input is empty\n"); - return -1; - } - } - - // Tokenize negative prompt - if ((int) embd_inp.size() > n_ctx - 4) { - LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); - return 1; - } - - // debug message about similarity of saved session, if applicable - size_t n_matching_session_tokens = 0; - if (!session_tokens.empty()) { - for (llama_token id : session_tokens) { - if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { +struct cli_context { + server_context ctx_server; + json messages = json::array(); + + std::string generate_completion(task_params & params, const json & messages, const std::vector & input_files) { + params.stream = true; // make sure we always use streaming mode + auto queues = 
ctx_server.get_queues(); + server_response_reader rd(queues, POLLING_SECONDS); + { + server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); + task.id = queues.first.get_new_id(); + task.params = params; // copy + task.cli_input = messages; // copy + task.cli_files = input_files; // copy + rd.post_task({std::move(task)}); + } + + server_task_result_ptr result = rd.next(should_stop); + std::string curr_content; + while (result) { + if (result->is_error()) { + PRI("Error: %s\n", result->to_json().dump().c_str()); + return curr_content; + } + auto res_partial = dynamic_cast(result.get()); + if (res_partial) { + curr_content += res_partial->content; + PRI("%s", res_partial->content.c_str()); + } + auto res_final = dynamic_cast(result.get()); + if (res_final) { break; } - n_matching_session_tokens++; - } - if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_INF("%s: using full prompt from session file\n", __func__); - } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_INF("%s: session file has exact match for prompt!\n", __func__); - } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); - } else { - LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); - } - - // remove any "future" tokens that we might have inherited from the previous session - if (!llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1)) { - LOG_INF("%s: unable to resuse common prefix\n", __func__); - n_matching_session_tokens = 0; - llama_memory_seq_rm(mem, -1, -1, -1); - } - } - - LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", - embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); - - // if we will use the cache for the full prompt without reaching the end of the cache, force - // reevaluation of the last token to recalculate the cached logits - if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); - - session_tokens.resize(embd_inp.size() - 1); - } - - // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) { - params.n_keep = (int)embd_inp.size(); - } else { - params.n_keep += add_bos; // always keep the BOS token - } - - if (params.conversation_mode) { - if (params.single_turn && !params.prompt.empty()) { - params.interactive = false; - params.interactive_first = false; - } else { - params.interactive_first = true; - } - } - - // enable interactive mode if interactive start is specified - if (params.interactive_first) { - params.interactive = true; - } - - if (params.verbose_prompt) { - LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); - } - - if (params.n_keep > add_bos) { - LOG_INF("%s: static prompt based on n_keep: '", __func__); - for (int i = 0; i < params.n_keep; i++) { - LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); - 
} - LOG_CNT("'\n"); + result = rd.next(should_stop); } - LOG_INF("\n"); + return curr_content; } +}; - // ctrl+C handling - { -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = sigint_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -#elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); -#endif - } - - if (params.interactive) { - LOG_INF("%s: interactive mode on.\n", __func__); - - if (!params.antiprompt.empty()) { - for (const auto & antiprompt : params.antiprompt) { - LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); - if (params.verbose_prompt) { - auto tmp = common_tokenize(ctx, antiprompt, false, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); - } - } - } - } - - if (params.input_prefix_bos) { - LOG_INF("Input prefix with BOS\n"); - } - - if (!params.input_prefix.empty()) { - LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); - if (params.verbose_prompt) { - auto tmp = common_tokenize(ctx, params.input_prefix, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); - } - } - } - - if (!params.input_suffix.empty()) { - LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); - if (params.verbose_prompt) { - auto tmp = common_tokenize(ctx, params.input_suffix, false, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); - } - } - } - } +int main(int argc, char ** argv) { + common_params params; - smpl = common_sampler_init(model, sparams); - if (!smpl) { - LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN)) { return 1; } - LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); - LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); - LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); - - LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - - // group-attention state - // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) - int ga_i = 0; - - const int ga_n = params.grp_attn_n; - const int ga_w = params.grp_attn_w; - - if (ga_n != 1) { - GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT - GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT - //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT - //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT - LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); - } - LOG_INF("\n"); - - if (params.interactive) { - const char * control_message; - if (params.multiline_input) { - control_message = " - To return control to the AI, end your input with '\\'.\n" - " - To return control without starting a new line, end your input with '/'.\n"; - } else { - control_message = " - Press Return to return control to the AI.\n" - " - To return 
control without starting a new line, end your input with '/'.\n" - " - If you want to submit another line, end your input with '\\'.\n"; - } - LOG_INF("== Running in interactive mode. ==\n"); -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_INF( " - Press Ctrl+C to interject at any time.\n"); -#endif - LOG_INF( "%s", control_message); - if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) { - LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n"); - } - LOG_INF("\n"); - - is_interacting = params.interactive_first; - } - - bool is_antiprompt = false; - bool input_echo = true; - bool display = true; - bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); - - int n_past = 0; - int n_remain = params.n_predict; - int n_consumed = 0; - int n_session_consumed = 0; - - std::vector input_tokens; g_input_tokens = &input_tokens; - std::vector output_tokens; g_output_tokens = &output_tokens; - std::ostringstream output_ss; g_output_ss = &output_ss; - std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode - - // the first thing we will do is to output the prompt, so set color accordingly - console::set_display(console::prompt); - display = params.display_prompt; - - std::vector embd; - - // single-token antiprompts - std::vector antiprompt_token; + common_init(); - for (const std::string & antiprompt : params.antiprompt) { - auto ids = ::common_tokenize(ctx, antiprompt, false, true); - if (ids.size() == 1) { - antiprompt_token.push_back(ids[0]); - } - } + // prefer silent by default; TODO: fix this later + common_log_set_verbosity_thold(0); - if (llama_model_has_encoder(model)) { - int enc_input_size = embd_inp.size(); - llama_token * enc_input_buf = embd_inp.data(); + // struct that contains llama context and inference + cli_context ctx_cli; - if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) { - LOG_ERR("%s : failed to eval\n", __func__); - return 1; - } + llama_backend_init(); + llama_numa_init(params.numa); - llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == LLAMA_TOKEN_NULL) { - decoder_start_token_id = llama_vocab_bos(vocab); - } + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); - embd_inp.clear(); - embd_inp.push_back(decoder_start_token_id); + if (!ctx_cli.ctx_server.load_model(params)) { + PRI("Failed to load the model\n"); + return 1; } + ctx_cli.ctx_server.init(); - while ((n_remain != 0 && !is_antiprompt) || params.interactive) { - // predict - if (!embd.empty()) { - // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via - // --prompt or --file which uses the same value. - int max_embd_size = n_ctx - 4; - - // Ensure the input doesn't exceed the context size by truncating embd if necessary. - if ((int) embd.size() > max_embd_size) { - const int skipped_tokens = (int) embd.size() - max_embd_size; - embd.resize(max_embd_size); - - console::set_display(console::error); - LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); - console::set_display(console::reset); - } - - if (ga_n == 1) { - // infinite text generation via context shifting - // if we run out of context: - // - take the n_keep first tokens from the original prompt (via n_past) - // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - - if (n_past + (int) embd.size() >= n_ctx) { - if (!params.ctx_shift){ - LOG_WRN("\n\n%s: context full and context shift is disabled => stopping\n", __func__); - break; - } - - if (params.n_predict == -2) { - LOG_WRN("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict); - break; - } - - const int n_left = n_past - params.n_keep; - const int n_discard = n_left/2; - - LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_memory_seq_rm (mem, 0, params.n_keep , params.n_keep + n_discard); - llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG_DBG("after swap: n_past = %d\n", n_past); - - LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - - LOG_DBG("clear session path\n"); - path_session.clear(); - } - } else { - // context extension via Self-Extend - while (n_past >= ga_i + ga_w) { - const int ib = (ga_n*ga_i)/ga_w; - const int bd = (ga_w/ga_n)*(ga_n - 1); - const int dd = (ga_w/ga_n) - ib*bd - ga_w; - - LOG_DBG("\n"); - LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); - LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); - LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - - llama_memory_seq_add(mem, 0, ga_i, n_past, ib*bd); - llama_memory_seq_div(mem, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_memory_seq_add(mem, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); - - n_past -= bd; - - ga_i += ga_w/ga_n; - - LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); - } - } - - // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) - if (n_session_consumed < (int) session_tokens.size()) { - size_t i = 0; - for ( ; i < embd.size(); i++) { - if (embd[i] != session_tokens[n_session_consumed]) { - session_tokens.resize(n_session_consumed); - break; - } - - n_past++; - n_session_consumed++; - - if (n_session_consumed >= (int) session_tokens.size()) { - ++i; - break; - } - } - if (i > 0) { - embd.erase(embd.begin(), embd.begin() + i); - } - } - - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { - int n_eval = (int) embd.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - - LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { - LOG_ERR("%s : failed to eval\n", __func__); - return 1; - } - - n_past += n_eval; - - LOG_DBG("n_past = %d\n", n_past); - // Display total tokens alongside total time - if (params.n_print > 0 && n_past % params.n_print == 0) { - LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); - } - } - - if (!embd.empty() && !path_session.empty()) { - session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); - n_session_consumed = session_tokens.size(); - } - } - - embd.clear(); - - if ((int) embd_inp.size() <= n_consumed && 
!is_interacting) { - // optionally save the session on first sample (for faster prompt loading next time) - if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { - need_to_save_session = false; - llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - - LOG_DBG("saved session to %s\n", path_session.c_str()); - } - - const llama_token id = common_sampler_sample(smpl, ctx, -1); - - common_sampler_accept(smpl, id, /* accept_grammar= */ true); - - // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); - - embd.push_back(id); - - if (params.conversation_mode && !waiting_for_first_input && !llama_vocab_is_eog(vocab, id)) { - assistant_ss << common_token_to_piece(ctx, id, false); - } - - // echo this to console - input_echo = true; - - // decrement remaining sampling budget - --n_remain; - - LOG_DBG("n_remain: %d\n", n_remain); - } else { - // some user input remains from prompt or interaction, forward it to processing - LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); - while ((int) embd_inp.size() > n_consumed) { - embd.push_back(embd_inp[n_consumed]); - - // push the prompt in the sampling context in order to apply repetition penalties later - // for the prompt, we don't apply grammar rules - common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); - - ++n_consumed; - if ((int) embd.size() >= params.n_batch) { - break; - } - } - } - - // display text - if (input_echo && display) { - for (auto id : embd) { - const std::string token_str = common_token_to_piece(ctx, id, params.special); + std::thread inference_thread([&ctx_cli]() { + ctx_cli.ctx_server.start_loop(); + }); - // Console/Stream Output - LOG("%s", token_str.c_str()); + PRI("\n"); + PRI("llama-cli is ready. Type your messages below.\n"); + PRI("\n"); - // Record Displayed Tokens To Log - // Note: Generated tokens are created one by one hence this check - if (embd.size() > 1) { - // Incoming Requested Tokens - input_tokens.push_back(id); - } else { - // Outgoing Generated Tokens - output_tokens.push_back(id); - output_ss << token_str; - } - } + while (!should_stop()) { + std::string buffer; + console::set_display(console::user_input); + { + PRI("\n> "); + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); } + console::set_display(console::reset); + PRI("\n"); - // reset color to default if there is no pending user input - if (input_echo && (int) embd_inp.size() == n_consumed) { - console::set_display(console::reset); - display = true; + if (buffer.empty()) { + continue; } - // if not currently processing queued inputs; - if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt in the last n_prev tokens - if (!params.antiprompt.empty()) { - const int n_prev = 32; - const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); - - is_antiprompt = false; - // Check if each of the reverse prompts appears at the end of the output. - // If we're not running interactively, the reverse prompt might be tokenized with some following characters - // so we'll compensate for that by widening the search window a bit. - for (std::string & antiprompt : params.antiprompt) { - size_t extra_padding = params.interactive ? 0 : 2; - size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) - ? 
last_output.length() - static_cast(antiprompt.length() + extra_padding) - : 0; - - if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { - if (params.interactive) { - is_interacting = true; - } - is_antiprompt = true; - break; - } - } - - // check for reverse prompt using special tokens - // avoid calling common_sampler_last() if last_output is empty - if (!last_output.empty()) { - llama_token last_token = common_sampler_last(smpl); - for (auto token : antiprompt_token) { - if (token == last_token) { - if (params.interactive) { - is_interacting = true; - } - is_antiprompt = true; - break; - } - } - } - - if (is_antiprompt) { - LOG_DBG("found antiprompt: %s\n", last_output.c_str()); - } - } - - // deal with end of generation tokens in interactive mode - if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { - LOG_DBG("found an EOG token\n"); - - if (params.interactive) { - if (!params.antiprompt.empty()) { - // tokenize and inject first reverse prompt - const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true); - embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); - is_antiprompt = true; - } - - if (params.enable_chat_template) { - chat_add_and_format("assistant", assistant_ss.str()); - } - is_interacting = true; - LOG("\n"); - } - } - - if (params.conversation_mode && !waiting_for_first_input) { - if (!prompt.empty()) { - prompt.clear(); - is_interacting = false; - } - } - - if ((n_past > 0 || waiting_for_first_input) && is_interacting) { - LOG_DBG("waiting for user input\n"); - - if (params.conversation_mode) { - LOG("\n> "); - } + try { + ctx_cli.messages.push_back({ + {"role", "user"}, + {"content", buffer} + }); + std::vector input_files; // empty for now - if (params.input_prefix_bos) { - LOG_DBG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_vocab_bos(vocab)); - } + task_params defaults; + defaults.sampling = params.sampling; + defaults.speculative = params.speculative; + defaults.n_keep = params.n_keep; + defaults.n_predict = params.n_predict; + defaults.antiprompt = params.antiprompt; - std::string buffer; - if (!params.input_prefix.empty() && !params.conversation_mode) { - LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - LOG("%s", params.input_prefix.c_str()); - } - - // color user input only - console::set_display(console::user_input); - display = params.display_prompt; - - std::string line; - bool another_line = true; - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - - // done taking input, reset color - console::set_display(console::reset); - display = true; - - if (buffer.empty()) { // Ctrl+D on empty line exits - LOG("EOF by user\n"); - break; - } - - if (buffer.back() == '\n') { - // Implement #587: - // If the user wants the text to end in a newline, - // this should be accomplished by explicitly adding a newline by using \ followed by return, - // then returning control by pressing return again. 
- buffer.pop_back(); - } - - if (buffer.empty()) { // Enter key on empty line lets the user pass control back - LOG_DBG("empty line, passing control back\n"); - } else { // Add tokens to embd only if the input buffer is non-empty - // append input suffix if any - if (!params.input_suffix.empty() && !params.conversation_mode) { - LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - LOG("%s", params.input_suffix.c_str()); - } - - LOG_DBG("buffer: '%s'\n", buffer.c_str()); - - const size_t original_size = embd_inp.size(); - - if (params.escape) { - string_process_escapes(buffer); - } - - bool format_chat = params.conversation_mode && params.enable_chat_template; - std::string user_inp = format_chat - ? chat_add_and_format("user", std::move(buffer)) - : std::move(buffer); - // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) - const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat); - const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true); - - LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); - - // if user stop generation mid-way, we must add EOT to finish model's last response - if (need_insert_eot && format_chat) { - llama_token eot = llama_vocab_eot(vocab); - embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot); - need_insert_eot = false; - } - - embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); - embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); - embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); - - if (params.verbose_prompt) { - LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size() - original_size); - } - - for (size_t i = original_size; i < embd_inp.size(); ++i) { - const llama_token token = embd_inp[i]; - const std::string token_str = common_token_to_piece(ctx, token); - output_tokens.push_back(token); - output_ss << token_str; - - if (params.verbose_prompt) { - LOG_INF("%6d -> '%s'\n", token, token_str.c_str()); - } - } - - // reset assistant message - assistant_ss.str(""); - - n_remain -= line_inp.size(); - LOG_DBG("n_remain: %d\n", n_remain); - } - - input_echo = false; // do not echo this again - } - - if (n_past > 0 || waiting_for_first_input) { - if (is_interacting) { - common_sampler_reset(smpl); - } - is_interacting = false; - - if (waiting_for_first_input && params.single_turn) { - params.interactive = false; - params.interactive_first = false; - } - waiting_for_first_input = false; - } - } - - // end of generation - if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) { - LOG(" [end of text]\n"); - break; + std::string assistant_content = ctx_cli.generate_completion(defaults, ctx_cli.messages, input_files); + ctx_cli.messages.push_back({ + {"role", "assistant"}, + {"content", assistant_content} + }); + PRI("\n"); + } catch (const std::exception & ex) { + PRI("Error: %s\n", ex.what()); } - - // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. - // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). 
- if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { - n_remain = params.n_predict; - is_interacting = true; - } - } - - if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); - llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } - LOG("\n\n"); - common_perf_print(ctx, smpl); - - common_sampler_free(smpl); - - llama_backend_free(); - - ggml_threadpool_free_fn(threadpool); - ggml_threadpool_free_fn(threadpool_batch); + llama_backend_init(); + llama_numa_init(params.numa); return 0; } diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 52ce58ec6f2..69357d685ee 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1424,6 +1424,44 @@ struct server_context_impl { // Functions to process the task // + // tokenize the input if it's set by CLI, return false on error + bool tokenize_cli_input(server_task & task) { + if (task.cli_input == nullptr) { + return true; // nothing to do + } + try { + auto & opt = oai_parser_opt; + common_chat_templates_inputs inputs; + inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input); + inputs.tools = {}; // TODO + inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; + inputs.json_schema = ""; // TODO + inputs.grammar = ""; // TODO + inputs.use_jinja = opt.use_jinja; + inputs.parallel_tool_calls = false; + inputs.add_generation_prompt = true; + inputs.reasoning_format = opt.reasoning_format; + inputs.enable_thinking = opt.enable_thinking; + + // Apply chat template to the list of messages + auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); + + // tokenize the resulting prompt + auto & prompt = chat_params.prompt; + if (mctx != nullptr) { + task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files); + } else { + task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]); + } + task.cli_input.clear(); + task.cli_files.clear(); + } catch (const std::exception & e) { + send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST); + return false; + } + return true; + } + void process_single_task(server_task && task) { switch (task.type) { case SERVER_TASK_TYPE_COMPLETION: @@ -1431,6 +1469,10 @@ struct server_context_impl { case SERVER_TASK_TYPE_EMBEDDING: case SERVER_TASK_TYPE_RERANK: { + if (!tokenize_cli_input(task)) { + break; + } + const int id_slot = task.id_slot; server_slot * slot = id_slot != -1 ? 
get_slot_by_id(id_slot) : get_available_slot(task); @@ -1640,40 +1682,6 @@ struct server_context_impl { res->id = task.id; queue_results.send(std::move(res)); } break; - case SERVER_TASK_TYPE_FORMAT_INPUT: - { - auto res = std::make_unique(); - res->id = task.id; - try { - auto & opt = oai_parser_opt; - common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(task.input_raw); - inputs.tools = {}; // TODO - inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; - inputs.json_schema = ""; // TODO - inputs.grammar = ""; // TODO - inputs.use_jinja = opt.use_jinja; - inputs.parallel_tool_calls = false; - inputs.add_generation_prompt = true; - inputs.reasoning_format = opt.reasoning_format; - inputs.enable_thinking = opt.enable_thinking; - - // Apply chat template to the list of messages - auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); - - // tokenize the resulting prompt - auto & prompt = chat_params.prompt; - if (mctx != nullptr) { - res->tokens = process_mtmd_prompt(mctx, prompt, task.input_files); - } else { - res->tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]); - } - } catch (const std::exception & e) { - send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST); - break; - } - queue_results.send(std::move(res)); - } break; } } diff --git a/tools/server/server-queue.cpp b/tools/server/server-queue.cpp index 38a4858522e..5a7f5209191 100644 --- a/tools/server/server-queue.cpp +++ b/tools/server/server-queue.cpp @@ -277,6 +277,12 @@ void server_response_reader::post_tasks(std::vector && tasks) { queue_tasks.post(std::move(tasks)); } +void server_response_reader::post_task(server_task && task) { + id_tasks = {task.id}; + queue_results.add_waiting_task_id(task.id); + queue_tasks.post(std::move(task)); +} + bool server_response_reader::has_next() const { return !cancelled && received_count < id_tasks.size(); } diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h index 209d2017c7e..52b2372b1b8 100644 --- a/tools/server/server-queue.h +++ b/tools/server/server-queue.h @@ -128,6 +128,7 @@ struct server_response_reader { } void post_tasks(std::vector && tasks); + void post_task(server_task && task); bool has_next() const; // return nullptr if should_stop() is true before receiving a result diff --git a/tools/server/server-task.h b/tools/server/server-task.h index c017a6a7ea3..e6f926e88e3 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -24,7 +24,6 @@ enum server_task_type { SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE, SERVER_TASK_TYPE_SET_LORA, - SERVER_TASK_TYPE_FORMAT_INPUT, // only used by CLI }; // TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common @@ -94,6 +93,10 @@ struct server_task { task_params params; server_tokens tokens; + // only used by CLI, this delegates the tokenization to the server + json cli_input = nullptr; + std::vector cli_files; + server_task_type type; // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE @@ -110,10 +113,6 @@ struct server_task { // used by SERVER_TASK_TYPE_SET_LORA std::vector set_lora; - // used by SERVER_TASK_TYPE_FORMAT_INPUT - json input_raw; // TODO: maybe use something more efficient than json - std::vector input_files; - server_task() = default; server_task(server_task_type type) : type(type) {} @@ -406,18 +405,6 @@ struct server_task_result_apply_lora : server_task_result { 
virtual json to_json() override; }; -struct server_task_result_format_input : server_task_result { - int index = 0; - error_type err_type = ERROR_TYPE_SERVER; - std::string err_msg; - - server_tokens tokens; - - virtual json to_json() override { - return json{}; // unused - } -}; - struct server_prompt_checkpoint { llama_pos pos_min; llama_pos pos_max; From 85c85ea772b8c8e2c51aebeaa691f861b8f397a5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 13:27:04 +0100 Subject: [PATCH 03/32] fix logging, add display info --- common/arg.cpp | 4 +- tools/cli/cli.cpp | 124 ++++++++++++++++++++++---------- tools/server/server-context.cpp | 9 +++ tools/server/server-context.h | 11 +++ 4 files changed, 107 insertions(+), 41 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 9e062ee7a1b..fb572a7d24d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -468,6 +468,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context )); } + common_log_set_verbosity_thold(params.verbosity); + return true; } @@ -2705,7 +2707,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", [](common_params & params) { params.verbosity = INT_MAX; - common_log_set_verbosity_thold(INT_MAX); } )); add_opt(common_arg( @@ -2726,7 +2727,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "(default: %d)\n", params.verbosity), [](common_params & params, int value) { params.verbosity = value; - common_log_set_verbosity_thold(value); } ).set_env("LLAMA_LOG_VERBOSITY")); add_opt(common_arg( diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 18e05481f55..cde3696f251 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -6,20 +6,32 @@ #include "server-context.h" #include "server-task.h" -#define PRI(...) 
LOGV(-1, __VA_ARGS__) +#include constexpr int POLLING_SECONDS = 1; +const char * LLAMA_ASCII_LOGO = R"( +▄▄ ▄▄ +██ ██ +██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄ +██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██ +██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀ + ██ ██ + ▀▀ ▀▀ +)"; + static bool g_is_interrupted = false; static bool should_stop() { return g_is_interrupted; } #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -static void sigint_handler(int signo) { - if (signo == SIGINT) { - g_is_interrupted = true; +static void signal_handler(int) { + if (g_is_interrupted) { + // second Ctrl+C - exit immediately + std::exit(130); } + g_is_interrupted = true; } #endif @@ -32,6 +44,7 @@ struct cli_context { auto queues = ctx_server.get_queues(); server_response_reader rd(queues, POLLING_SECONDS); { + // TODO: reduce some copies here in the future server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); task.id = queues.first.get_new_id(); task.params = params; // copy @@ -44,13 +57,13 @@ struct cli_context { std::string curr_content; while (result) { if (result->is_error()) { - PRI("Error: %s\n", result->to_json().dump().c_str()); + LOG("Error: %s\n", result->to_json().dump().c_str()); return curr_content; } auto res_partial = dynamic_cast(result.get()); if (res_partial) { curr_content += res_partial->content; - PRI("%s", res_partial->content.c_str()); + LOG("%s", res_partial->content.c_str()); } auto res_final = dynamic_cast(result.get()); if (res_final) { @@ -65,15 +78,15 @@ struct cli_context { int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN)) { + params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs + + auto LLAMA_EXAMPLE_CLI = LLAMA_EXAMPLE_SERVER; // TODO: remove this + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) { return 1; } common_init(); - // prefer silent by default; TODO: fix this later - common_log_set_verbosity_thold(0); - // struct that contains llama context and inference cli_context ctx_cli; @@ -85,8 +98,22 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); + sigaction(SIGTERM, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + if (!ctx_cli.ctx_server.load_model(params)) { - PRI("Failed to load the model\n"); + LOG_ERR("Failed to load the model\n"); return 1; } ctx_cli.ctx_server.init(); @@ -95,15 +122,38 @@ int main(int argc, char ** argv) { ctx_cli.ctx_server.start_loop(); }); - PRI("\n"); - PRI("llama-cli is ready. 
Type your messages below.\n"); - PRI("\n"); + auto inf = ctx_cli.ctx_server.get_info(); + std::string modalities = "text"; + if (inf.has_inp_image) { + modalities += ", vision"; + } + if (inf.has_inp_audio) { + modalities += ", audio"; + } + + LOG("\n"); + LOG("%s\n", LLAMA_ASCII_LOGO); + LOG("build : %s\n", inf.build_info.c_str()); + LOG("model : %s\n", inf.model_name.c_str()); + LOG("modalities : %s\n", modalities.c_str()); + LOG("\n"); + LOG("available commands:\n"); + LOG(" Ctrl+C to stop or exit\n"); + LOG(" /regen re-generate the last response\n"); + LOG(" /clear clear the chat history\n"); + if (inf.has_inp_image) { + LOG(" /image add an image file\n"); + } + if (inf.has_inp_audio) { + LOG(" /audio add an audio file\n"); + } + LOG("\n"); while (!should_stop()) { std::string buffer; console::set_display(console::user_input); { - PRI("\n> "); + LOG("\n> "); std::string line; bool another_line = true; do { @@ -112,35 +162,31 @@ int main(int argc, char ** argv) { } while (another_line); } console::set_display(console::reset); - PRI("\n"); + LOG("\n"); if (buffer.empty()) { continue; } - try { - ctx_cli.messages.push_back({ - {"role", "user"}, - {"content", buffer} - }); - std::vector input_files; // empty for now - - task_params defaults; - defaults.sampling = params.sampling; - defaults.speculative = params.speculative; - defaults.n_keep = params.n_keep; - defaults.n_predict = params.n_predict; - defaults.antiprompt = params.antiprompt; - - std::string assistant_content = ctx_cli.generate_completion(defaults, ctx_cli.messages, input_files); - ctx_cli.messages.push_back({ - {"role", "assistant"}, - {"content", assistant_content} - }); - PRI("\n"); - } catch (const std::exception & ex) { - PRI("Error: %s\n", ex.what()); - } + ctx_cli.messages.push_back({ + {"role", "user"}, + {"content", buffer} + }); + std::vector input_files; // empty for now + + task_params defaults; + defaults.sampling = params.sampling; + defaults.speculative = params.speculative; + defaults.n_keep = params.n_keep; + defaults.n_predict = params.n_predict; + defaults.antiprompt = params.antiprompt; + + std::string assistant_content = ctx_cli.generate_completion(defaults, ctx_cli.messages, input_files); + ctx_cli.messages.push_back({ + {"role", "assistant"}, + {"content", assistant_content} + }); + LOG("\n"); } llama_backend_init(); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index f3288bdb8fa..da3afb3e9d5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2576,6 +2576,15 @@ std::pair server_context::get_queues() { return { impl->queue_tasks, impl->queue_results }; } +server_context_info server_context::get_info() const { + return server_context_info { + /* build_info */ build_info, + /* model_name */ impl->model_name, + /* has_inp_image */ impl->oai_parser_opt.allow_image, + /* has_inp_audio */ impl->oai_parser_opt.allow_audio, + }; +} + // generator-like API for HTTP response generation diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 05b4afaeeb2..82007d48add 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -9,6 +9,13 @@ struct server_context_impl; // private implementation +struct server_context_info { + std::string build_info; + std::string model_name; + bool has_inp_image; + bool has_inp_audio; +}; + struct server_context { std::unique_ptr impl; @@ -34,6 +41,10 @@ struct server_context { // get the underlaying queue_tasks and queue_results // used by CLI application std::pair 
get_queues(); + + // get server info + // used by CLI application + server_context_info get_info() const; }; From 820c46d04fc7474a75f82dd1c44c03194e967da8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 14:46:12 +0100 Subject: [PATCH 04/32] handle commands --- tools/cli/cli.cpp | 133 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 103 insertions(+), 30 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index cde3696f251..6705941467e 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -6,6 +6,8 @@ #include "server-context.h" #include "server-task.h" +#include +#include #include constexpr int POLLING_SECONDS = 1; @@ -20,34 +22,44 @@ const char * LLAMA_ASCII_LOGO = R"( ▀▀ ▀▀ )"; -static bool g_is_interrupted = false; +static std::atomic g_is_interrupted = false; static bool should_stop() { - return g_is_interrupted; + return g_is_interrupted.load(); } #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void signal_handler(int) { - if (g_is_interrupted) { + if (g_is_interrupted.load()) { // second Ctrl+C - exit immediately std::exit(130); } - g_is_interrupted = true; + g_is_interrupted.store(true); } #endif struct cli_context { server_context ctx_server; json messages = json::array(); + std::vector input_files; + task_params defaults; - std::string generate_completion(task_params & params, const json & messages, const std::vector & input_files) { - params.stream = true; // make sure we always use streaming mode + cli_context(const common_params & params) { + defaults.sampling = params.sampling; + defaults.speculative = params.speculative; + defaults.n_keep = params.n_keep; + defaults.n_predict = params.n_predict; + defaults.antiprompt = params.antiprompt; + defaults.stream = true; // make sure we always use streaming mode + } + + std::string generate_completion() { auto queues = ctx_server.get_queues(); server_response_reader rd(queues, POLLING_SECONDS); { // TODO: reduce some copies here in the future server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); task.id = queues.first.get_new_id(); - task.params = params; // copy + task.params = defaults; // copy task.cli_input = messages; // copy task.cli_files = input_files; // copy rd.post_task({std::move(task)}); @@ -57,7 +69,12 @@ struct cli_context { std::string curr_content; while (result) { if (result->is_error()) { - LOG("Error: %s\n", result->to_json().dump().c_str()); + json err_data = result->to_json(); + if (err_data.contains("message")) { + LOG_ERR("Error: %s\n", err_data["message"].get().c_str()); + } else { + LOG_ERR("Error: %s\n", err_data.dump().c_str()); + } return curr_content; } auto res_partial = dynamic_cast(result.get()); @@ -73,6 +90,19 @@ struct cli_context { } return curr_content; } + + // TODO: support remote files in the future (http, https, etc) + std::string load_input_files(const std::string & fname) { + input_files.clear(); + std::ifstream file(fname, std::ios::binary); + if (!file) { + return ""; + } + raw_buffer buf; + buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + input_files.push_back(std::move(buf)); + return mtmd_default_marker(); + } }; int main(int argc, char ** argv) { @@ -88,7 +118,7 @@ int main(int argc, char ** argv) { common_init(); // struct that contains llama context and inference - cli_context ctx_cli; + cli_context ctx_cli(params); llama_backend_init(); llama_numa_init(params.numa); @@ -138,21 +168,23 @@ int main(int argc, char ** argv) { LOG("modalities : %s\n", 
modalities.c_str()); LOG("\n"); LOG("available commands:\n"); - LOG(" Ctrl+C to stop or exit\n"); - LOG(" /regen re-generate the last response\n"); - LOG(" /clear clear the chat history\n"); + LOG(" /exit or Ctrl+C stop or exit\n"); + LOG(" /regen re-generate the last response\n"); + LOG(" /clear clear the chat history\n"); if (inf.has_inp_image) { - LOG(" /image add an image file\n"); + LOG(" /image add an image file\n"); } if (inf.has_inp_audio) { - LOG(" /audio add an audio file\n"); + LOG(" /audio add an audio file\n"); } LOG("\n"); - while (!should_stop()) { + // interactive loop + std::string cur_msg; + while (true) { std::string buffer; console::set_display(console::user_input); - { + if (params.prompt.empty()) { LOG("\n> "); std::string line; bool another_line = true; @@ -160,28 +192,67 @@ int main(int argc, char ** argv) { another_line = console::readline(line, params.multiline_input); buffer += line; } while (another_line); + } else { + // process input prompt from args + buffer = params.prompt; + params.prompt.clear(); // only use it once } console::set_display(console::reset); LOG("\n"); + if (should_stop()) { + g_is_interrupted.store(false); + break; + } + if (buffer.empty()) { continue; } - ctx_cli.messages.push_back({ - {"role", "user"}, - {"content", buffer} - }); - std::vector input_files; // empty for now + bool add_user_msg = true; - task_params defaults; - defaults.sampling = params.sampling; - defaults.speculative = params.speculative; - defaults.n_keep = params.n_keep; - defaults.n_predict = params.n_predict; - defaults.antiprompt = params.antiprompt; + // process commands + if (string_starts_with(buffer, "/exit")) { + break; + } else if (string_starts_with(buffer, "/regen")) { + if (ctx_cli.messages.size() >= 2) { + size_t last_idx = ctx_cli.messages.size() - 1; + ctx_cli.messages.erase(last_idx); + add_user_msg = false; + } else { + LOG_ERR("No message to regenerate.\n"); + continue; + } + } else if (string_starts_with(buffer, "/clear")) { + ctx_cli.messages.clear(); + LOG("Chat history cleared.\n"); + continue; + } else if ( + (string_starts_with(buffer, "/image ") && inf.has_inp_image) || + (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { + std::string fname = string_strip(buffer.substr(7)); + std::string marker = ctx_cli.load_input_files(fname); + if (marker.empty()) { + LOG_ERR("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + continue; + } + cur_msg += marker; + LOG("Loaded image from '%s'\n", fname.c_str()); + continue; + } else { + // not a command + cur_msg += buffer; + } - std::string assistant_content = ctx_cli.generate_completion(defaults, ctx_cli.messages, input_files); + // generate response + if (add_user_msg) { + ctx_cli.messages.push_back({ + {"role", "user"}, + {"content", cur_msg} + }); + cur_msg.clear(); + } + std::string assistant_content = ctx_cli.generate_completion(); ctx_cli.messages.push_back({ {"role", "assistant"}, {"content", assistant_content} @@ -189,8 +260,10 @@ int main(int argc, char ** argv) { LOG("\n"); } - llama_backend_init(); - llama_numa_init(params.numa); + LOG("\nExiting...\n"); + ctx_cli.ctx_server.terminate(); + inference_thread.join(); + llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); return 0; } From 33551d4a8793f54dce873baff608b388286fc67f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 14:57:48 +0100 Subject: [PATCH 05/32] add args --- common/arg.cpp | 33 +++++++++++++++++---------------- common/common.h | 1 + tools/cli/cli.cpp | 20 
+++++++++++++++++++- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index fb572a7d24d..6c9305173b1 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -51,6 +51,7 @@ using json = nlohmann::ordered_json; static std::initializer_list mmproj_examples = { LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_CLI, }; static std::string read_file(const std::string & fname) { @@ -790,14 +791,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.display_prompt = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-co", "--color"}, string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), [](common_params & params) { params.use_color = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-t", "--threads"}, "N", string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads), @@ -997,14 +998,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--context-shift"}, string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"), [](common_params & params) { params.ctx_shift = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), @@ -1040,7 +1041,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.system_prompt = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), @@ -1070,7 +1071,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.system_prompt.pop_back(); } } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", @@ -1146,14 +1147,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.antiprompt.emplace_back(value); } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-sp", "--special"}, string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), [](common_params & params) { params.special = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-cnv", "--conversation"}, "run in conversation mode:\n" @@ -1231,7 +1232,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.warmup = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--spm-infill"}, string_format( @@ -2543,14 +2544,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.use_jinja = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--no-jinja"}, string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? 
"enabled" : "disabled"), [](common_params & params) { params.use_jinja = false; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" @@ -2561,7 +2562,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.reasoning_format = common_reasoning_format_from_name(value); } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK")); add_opt(common_arg( {"--reasoning-budget"}, "N", "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)", @@ -2569,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); } params.reasoning_budget = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET")); add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", string_format( @@ -2581,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", string_format( @@ -2593,7 +2594,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.chat_template = read_file(value); } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); add_opt(common_arg( {"--no-prefill-assistant"}, string_format( @@ -2624,7 +2625,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.simple_io = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--positive-file"}, "FNAME", string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), diff --git a/common/common.h b/common/common.h index 179113a4dbf..648e0d0ab17 100644 --- a/common/common.h +++ b/common/common.h @@ -83,6 +83,7 @@ enum llama_example { LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_MAIN, + LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL, diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 6705941467e..0d0c3be69b3 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ 
-68,6 +68,9 @@ struct cli_context { server_task_result_ptr result = rd.next(should_stop); std::string curr_content; while (result) { + if (should_stop()) { + break; + } if (result->is_error()) { json err_data = result->to_json(); if (err_data.contains("message")) { @@ -88,6 +91,8 @@ struct cli_context { } result = rd.next(should_stop); } + g_is_interrupted.store(false); + // server_response_reader automatically cancels pending tasks upon destruction return curr_content; } @@ -110,7 +115,6 @@ int main(int argc, char ** argv) { params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs - auto LLAMA_EXAMPLE_CLI = LLAMA_EXAMPLE_SERVER; // TODO: remove this if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) { return 1; } @@ -179,6 +183,13 @@ int main(int argc, char ** argv) { } LOG("\n"); + if (!params.system_prompt.empty()) { + ctx_cli.messages.push_back({ + {"role", "system"}, + {"content", params.system_prompt} + }); + } + // interactive loop std::string cur_msg; while (true) { @@ -195,6 +206,7 @@ int main(int argc, char ** argv) { } else { // process input prompt from args buffer = params.prompt; + LOG("\n> %s\n", buffer.c_str()); params.prompt.clear(); // only use it once } console::set_display(console::reset); @@ -209,6 +221,11 @@ int main(int argc, char ** argv) { continue; } + // remove trailing newline + if (buffer.back() == '\n') { + buffer.pop_back(); + } + bool add_user_msg = true; // process commands @@ -230,6 +247,7 @@ int main(int argc, char ** argv) { } else if ( (string_starts_with(buffer, "/image ") && inf.has_inp_image) || (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { + // just in case (bad copy-paste for example), we strip all trailing/leading spaces std::string fname = string_strip(buffer.substr(7)); std::string marker = ctx_cli.load_input_files(fname); if (marker.empty()) { From 92610228eb4b1dfcc2f09b2c7b4f34798a0caa84 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 15:12:16 +0100 Subject: [PATCH 06/32] wip --- tools/cli/cli.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 0d0c3be69b3..dd3adccfaac 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -165,15 +165,25 @@ int main(int argc, char ** argv) { modalities += ", audio"; } + if (!params.system_prompt.empty()) { + ctx_cli.messages.push_back({ + {"role", "system"}, + {"content", params.system_prompt} + }); + } + LOG("\n"); LOG("%s\n", LLAMA_ASCII_LOGO); LOG("build : %s\n", inf.build_info.c_str()); LOG("model : %s\n", inf.model_name.c_str()); LOG("modalities : %s\n", modalities.c_str()); + if (!params.system_prompt.empty()) { + LOG("using custom system prompt\n"); + } LOG("\n"); LOG("available commands:\n"); LOG(" /exit or Ctrl+C stop or exit\n"); - LOG(" /regen re-generate the last response\n"); + LOG(" /regen regenerate the last response\n"); LOG(" /clear clear the chat history\n"); if (inf.has_inp_image) { LOG(" /image add an image file\n"); @@ -183,13 +193,6 @@ int main(int argc, char ** argv) { } LOG("\n"); - if (!params.system_prompt.empty()) { - ctx_cli.messages.push_back({ - {"role", "system"}, - {"content", params.system_prompt} - }); - } - // interactive loop std::string cur_msg; while (true) { From b08a4268a611d308654ec2fafe0917dbb9d59ec2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 15:16:06 +0100 Subject: [PATCH 07/32] move old cli to llama-completion --- common/arg.cpp | 64 +++++++++---------- common/common.h | 2 +- 
examples/gen-docs/gen-docs.cpp | 2 +- tools/CMakeLists.txt | 2 +- tools/{main => completion}/CMakeLists.txt | 4 +- tools/{main => completion}/README.md | 0 .../main.cpp => completion/completion.cpp} | 2 +- 7 files changed, 38 insertions(+), 38 deletions(-) rename tools/{main => completion}/CMakeLists.txt (74%) rename tools/{main => completion}/README.md (100%) rename tools/{main/main.cpp => completion/completion.cpp} (99%) diff --git a/common/arg.cpp b/common/arg.cpp index 6c9305173b1..e3434d6227c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -791,14 +791,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.display_prompt = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-co", "--color"}, string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), [](common_params & params) { params.use_color = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-t", "--threads"}, "N", string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads), @@ -931,7 +931,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", string_format( - ex == LLAMA_EXAMPLE_MAIN + ex == LLAMA_EXAMPLE_COMPLETION ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)" : "number of tokens to predict (default: %d, -1 = infinity)", params.n_predict), @@ -998,14 +998,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--context-shift"}, string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"), [](common_params & params) { params.ctx_shift = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT")); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), @@ -1041,7 +1041,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.system_prompt = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), @@ -1071,7 +1071,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.system_prompt.pop_back(); } } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", @@ -1119,42 +1119,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.n_print = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", [](common_params & params, const std::string & value) { params.path_prompt_cache = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", [](common_params & params) { params.prompt_cache_all = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", [](common_params & params) { params.prompt_cache_ro = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", [](common_params & params, const std::string & value) { params.antiprompt.emplace_back(value); } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-sp", "--special"}, string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), [](common_params & params) { params.special = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-cnv", "--conversation"}, "run in conversation mode:\n" @@ -1164,14 +1164,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"-no-cnv", "--no-conversation"}, "force disable conversation mode (default: false)", [](common_params & params) { params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"-st", "--single-turn"}, "run conversation for a single turn only, then exit when done\n" @@ -1180,28 +1180,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.single_turn = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"-i", "--interactive"}, string_format("run in interactive mode (default: %s)", params.interactive ? 
"true" : "false"), [](common_params & params) { params.interactive = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"-if", "--interactive-first"}, string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), [](common_params & params) { params.interactive_first = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", [](common_params & params) { params.multiline_input = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", @@ -1209,7 +1209,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.input_prefix_bos = true; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", @@ -1217,7 +1217,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.input_prefix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", @@ -1225,14 +1225,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.input_suffix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", [](common_params & params) { params.warmup = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--spm-infill"}, string_format( @@ -1623,14 +1623,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.grp_attn_n = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); + ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", string_format("group-attention width (default: %d)", params.grp_attn_w), [](common_params & params, int value) { params.grp_attn_w = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"-nkvo", "--no-kv-offload"}, "disable KV offload", @@ -2544,14 +2544,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.use_jinja = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); + ).set_examples({LLAMA_EXAMPLE_SERVER, 
LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--no-jinja"}, string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"), [](common_params & params) { params.use_jinja = false; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" @@ -2562,7 +2562,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.reasoning_format = common_reasoning_format_from_name(value); } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK")); add_opt(common_arg( {"--reasoning-budget"}, "N", "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)", @@ -2570,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); } params.reasoning_budget = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET")); add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", string_format( @@ -2582,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", string_format( @@ -2594,7 +2594,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.chat_template = read_file(value); } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); add_opt(common_arg( {"--no-prefill-assistant"}, string_format( @@ -2625,7 +2625,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.simple_io = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_CLI})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--positive-file"}, "FNAME", string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), diff --git a/common/common.h b/common/common.h index 648e0d0ab17..96bbbff7a23 100644 --- a/common/common.h 
+++ b/common/common.h @@ -82,7 +82,7 @@ int32_t cpu_get_num_math(); enum llama_example { LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_SPECULATIVE, - LLAMA_EXAMPLE_MAIN, + LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_PERPLEXITY, diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 77c59a836e5..420195f1985 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -76,7 +76,7 @@ static void export_md(std::string fname, llama_example ex) { } int main(int, char **) { - export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN); + export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION); export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); return 0; diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 0bc42fa59d8..43a0e819499 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -19,7 +19,7 @@ else() add_subdirectory(imatrix) add_subdirectory(llama-bench) add_subdirectory(cli) - add_subdirectory(main) + add_subdirectory(completion) add_subdirectory(perplexity) add_subdirectory(quantize) if (LLAMA_BUILD_SERVER) diff --git a/tools/main/CMakeLists.txt b/tools/completion/CMakeLists.txt similarity index 74% rename from tools/main/CMakeLists.txt rename to tools/completion/CMakeLists.txt index a39a6552246..126ae6ab3d0 100644 --- a/tools/main/CMakeLists.txt +++ b/tools/completion/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-cli-old) -add_executable(${TARGET} main.cpp) +set(TARGET llama-completion) +add_executable(${TARGET} completion.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/main/README.md b/tools/completion/README.md similarity index 100% rename from tools/main/README.md rename to tools/completion/README.md diff --git a/tools/main/main.cpp b/tools/completion/completion.cpp similarity index 99% rename from tools/main/main.cpp rename to tools/completion/completion.cpp index 960ddbe3910..89c3d1e0b98 100644 --- a/tools/main/main.cpp +++ b/tools/completion/completion.cpp @@ -86,7 +86,7 @@ static void sigint_handler(int signo) { int main(int argc, char ** argv) { common_params params; g_params = ¶ms; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) { return 1; } From 42e9b3878dccd5ea7a3a6ca94af516c98b86ed40 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 15:16:25 +0100 Subject: [PATCH 08/32] rm deprecation notice --- tools/completion/completion.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 89c3d1e0b98..2fda549c201 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -521,12 +521,6 @@ int main(int argc, char ** argv) { is_interacting = params.interactive_first; } - LOG_WRN("*****************************\n"); - LOG_WRN("IMPORTANT: The current llama-cli will be moved to llama-completion in the near future\n"); - LOG_WRN(" New llama-cli will have enhanced features and improved user experience\n"); - LOG_WRN(" More info: https://github.com/ggml-org/llama.cpp/discussions/17618\n"); - LOG_WRN("*****************************\n"); - bool is_antiprompt = false; bool input_echo = true; bool display = true; From 29f03bc1732b73f28dbdea1fa89625c080f4188a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 15:31:41 +0100 Subject: [PATCH 
09/32] move server to a shared library --- tools/cli/CMakeLists.txt | 15 ++------------ tools/cli/cli.cpp | 1 + tools/server/CMakeLists.txt | 40 +++++++++++++++++++++++++++---------- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt index 56055f6957c..b08fff4c289 100644 --- a/tools/cli/CMakeLists.txt +++ b/tools/cli/CMakeLists.txt @@ -1,20 +1,9 @@ set(TARGET llama-cli) -add_executable(${TARGET} - cli.cpp - ../server/server-task.cpp - ../server/server-task.h - ../server/server-queue.cpp - ../server/server-queue.h - ../server/server-common.cpp - ../server/server-common.h - ../server/server-context.cpp - ../server/server-context.h - ) -target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT}) +add_executable(${TARGET} cli.cpp) +target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) include_directories(../server) -include_directories(../mtmd) if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} RUNTIME) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index dd3adccfaac..859b08503d5 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -146,6 +146,7 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + LOG("Loading model...\n"); if (!ctx_cli.ctx_server.load_model(params)) { LOG_ERR("Failed to load the model\n"); return 1; diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 1aa659a9066..a39b4c5b35f 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -1,7 +1,33 @@ -set(TARGET llama-server) - include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +# server-context containing the core server logic, used by llama-server and CLI + +set(TARGET server-context) + +add_library(${TARGET} STATIC + server-task.cpp + server-task.h + server-queue.cpp + server-queue.h + server-common.cpp + server-common.h + server-context.cpp + server-context.h +) + +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +target_include_directories(${TARGET} PRIVATE ../mtmd) +target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) +target_link_libraries(${TARGET} PUBLIC common mtmd ${CMAKE_THREAD_LIBS_INIT}) + + +# llama-server executable + +set(TARGET llama-server) + if (NOT LLAMA_HTTPLIB) message(FATAL_ERROR "LLAMA_HTTPLIB is OFF, cannot build llama-server. 
Hint: to skip building server, set -DLLAMA_BUILD_SERVER=OFF") endif() @@ -12,14 +38,6 @@ set(TARGET_SRCS server-http.h server-models.cpp server-models.h - server-task.cpp - server-task.h - server-queue.cpp - server-queue.h - server-common.cpp - server-common.h - server-context.cpp - server-context.h ) set(PUBLIC_ASSETS index.html.gz @@ -43,7 +61,7 @@ install(TARGETS ${TARGET} RUNTIME) target_include_directories(${TARGET} PRIVATE ../mtmd) target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) -target_link_libraries(${TARGET} PRIVATE common mtmd cpp-httplib ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common cpp-httplib ${CMAKE_THREAD_LIBS_INIT}) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) From 61de19b5f8c57a2012063b68a2c816c04bf62949 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 15:36:47 +0100 Subject: [PATCH 10/32] move ci to llama-completion --- README.md | 13 ------------- ci/run.sh | 24 ++++++++++++------------ scripts/snapdragon/adb/run-cli.sh | 2 +- tests/test-lora-conversion-inference.sh | 12 ++++++------ tools/gguf-split/tests.sh | 2 +- tools/quantize/tests.sh | 2 +- 6 files changed, 21 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 2e44ae7d0c7..ab71ccacb75 100644 --- a/README.md +++ b/README.md @@ -346,19 +346,6 @@ To learn more about model quantization, [read this documentation](tools/quantize --
- Run simple text completion
-
- To disable conversation mode explicitly, use `-no-cnv`
-
- ```bash
- llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
-
- # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
- ```
-
-
-
-
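Note: the quick-start example removed above corresponds to the raw-completion tool that this series renames. A minimal equivalent invocation, assuming the `llama-completion` target from `tools/completion` has been built and is on PATH (the model path is a placeholder), would be:

```bash
# raw text completion without conversation mode, mirroring the removed README example
llama-completion -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
```

The flags are unchanged from the removed example; only the executable name differs.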
Constrain the output with a custom grammar diff --git a/ci/run.sh b/ci/run.sh index 83b2603e821..0676504b3e6 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -398,18 +398,18 @@ function gg_run_qwen3_0_6b { ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc) ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc) - (time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log - (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log + (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-completion -no-cnv --model 
${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log if [ -z ${GG_BUILD_NO_BF16} ]; then diff --git a/scripts/snapdragon/adb/run-cli.sh b/scripts/snapdragon/adb/run-cli.sh index ab8d6d49a24..cc5e47c2d67 100755 --- a/scripts/snapdragon/adb/run-cli.sh +++ b/scripts/snapdragon/adb/run-cli.sh @@ -46,7 +46,7 @@ adb $adbserial shell " \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ $verbose $experimental $sched $opmask $profile $nhvx $ndev \ - ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \ + ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ -ngl 99 --device $device $cli_opts $@ \ diff --git a/tests/test-lora-conversion-inference.sh b/tests/test-lora-conversion-inference.sh index 0255494b824..e7d67769331 100755 --- a/tests/test-lora-conversion-inference.sh +++ b/tests/test-lora-conversion-inference.sh @@ -79,19 +79,19 @@ run_conversion_and_inference_lora() { # Run inference echo -e "\n\n---------------------------\n\n" - echo "Running llama-cli without lora for $model_name with hidden_size $hidden_size..." - OUTPUT_BASE=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + echo "Running llama-completion without lora for $model_name with hidden_size $hidden_size..." + OUTPUT_BASE=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ -p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0) echo -e "\n\n---------------------------\n\n" - echo "Running llama-cli with hot lora for $model_name with hidden_size $hidden_size..." - OUTPUT_LORA_HOT=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + echo "Running llama-completion with hot lora for $model_name with hidden_size $hidden_size..." + OUTPUT_LORA_HOT=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf \ -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0) echo -e "\n\n---------------------------\n\n" - echo "Running llama-cli with merged lora for $model_name with hidden_size $hidden_size..." 
- OUTPUT_LORA_MERGED=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \ + echo "Running llama-completion with merged lora for $model_name with hidden_size $hidden_size..." + OUTPUT_LORA_MERGED=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \ -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0) # Remove any initial white space diff --git a/tools/gguf-split/tests.sh b/tools/gguf-split/tests.sh index e8677018f55..c8dd0b00795 100755 --- a/tools/gguf-split/tests.sh +++ b/tools/gguf-split/tests.sh @@ -19,7 +19,7 @@ fi set -x SPLIT=$1/llama-gguf-split -MAIN=$1/llama-cli +MAIN=$1/llama-completion WORK_PATH=$TMP_DIR/gguf-split ROOT_DIR=$(realpath $(dirname $0)/../../) diff --git a/tools/quantize/tests.sh b/tools/quantize/tests.sh index acc54fd9b15..2cae588e9e4 100644 --- a/tools/quantize/tests.sh +++ b/tools/quantize/tests.sh @@ -20,7 +20,7 @@ set -x SPLIT=$1/llama-gguf-split QUANTIZE=$1/llama-quantize -MAIN=$1/llama-cli +MAIN=$1/llama-completion WORK_PATH=$TMP_DIR/quantize ROOT_DIR=$(realpath $(dirname $0)/../../) From b9b91caf657402baa627cd58669c29769cc5952a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 18:56:46 +0100 Subject: [PATCH 11/32] add loading animation --- common/arg.cpp | 6 +++--- common/console.cpp | 29 ++++++++++++++++++++++++++ common/console.h | 1 + tools/cli/cli.cpp | 51 ++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 82 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index e3434d6227c..22d50d378e4 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1171,7 +1171,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; } - ).set_examples({LLAMA_EXAMPLE_COMPLETION})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-st", "--single-turn"}, "run conversation for a single turn only, then exit when done\n" @@ -1180,7 +1180,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.single_turn = true; } - ).set_examples({LLAMA_EXAMPLE_COMPLETION})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-i", "--interactive"}, string_format("run in interactive mode (default: %s)", params.interactive ? 
"true" : "false"), @@ -1201,7 +1201,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.multiline_input = true; } - ).set_examples({LLAMA_EXAMPLE_COMPLETION})); + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", diff --git a/common/console.cpp b/common/console.cpp index 078a8d678d9..202c0364a62 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -33,6 +33,8 @@ #define ANSI_COLOR_RESET "\x1b[0m" #define ANSI_BOLD "\x1b[1m" +static const char LOADING_CHARS[] = {'|', '/', '-', '\\'}; + namespace console { // @@ -41,6 +43,8 @@ namespace console { static bool advanced_display = false; static bool simple_io = true; + static bool loading_show = false; + static int loading_idx = 0; static display_t current_display = reset; static FILE* out = stdout; @@ -492,6 +496,31 @@ namespace console { return multiline_input; } + void set_loading(bool enabled) { + if (!simple_io) { + if (!loading_show && enabled) { + // turn on loading + fputc(' ', out); + fflush(out); + } + + if (loading_show && !enabled) { + // turn off loading + replace_last(' '); + pop_cursor(); + fflush(out); + } + + loading_show = enabled; + + if (loading_show) { + loading_idx = (loading_idx + 1) % sizeof(LOADING_CHARS); + replace_last(LOADING_CHARS[loading_idx]); + fflush(out); + } + } + } + bool readline(std::string & line, bool multiline_input) { set_display(user_input); diff --git a/common/console.h b/common/console.h index ec175269b9d..350242c0455 100644 --- a/common/console.h +++ b/common/console.h @@ -15,5 +15,6 @@ namespace console { void init(bool use_simple_io, bool use_advanced_display); void cleanup(); void set_display(display_t display); + void set_loading(bool enabled); // update frame each time it's called bool readline(std::string & line, bool multiline_input); } diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 859b08503d5..9d9f21c48c0 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -8,6 +8,7 @@ #include #include +#include #include constexpr int POLLING_SECONDS = 1; @@ -43,6 +44,10 @@ struct cli_context { std::vector input_files; task_params defaults; + // thread for showing "loading" animation + std::atomic loading_show; + std::thread loading_display_thread; + cli_context(const common_params & params) { defaults.sampling = params.sampling; defaults.speculative = params.speculative; @@ -50,6 +55,28 @@ struct cli_context { defaults.n_predict = params.n_predict; defaults.antiprompt = params.antiprompt; defaults.stream = true; // make sure we always use streaming mode + + // TODO: improve this mechanism later + loading_display_thread = std::thread([this]() { + while (true) { + if (loading_show.load()) { + // update loading frame + console::set_loading(true); + } + std::this_thread::sleep_for(std::chrono::milliseconds(150)); + } + }); + loading_display_thread.detach(); + } + + void show_loading() { + loading_show.store(true); + } + + void hide_loading() { + loading_show.store(false); + // clear loading here in case the thread is sleeping + console::set_loading(false); } std::string generate_completion() { @@ -65,8 +92,13 @@ struct cli_context { rd.post_task({std::move(task)}); } + // wait for first result + show_loading(); server_task_result_ptr result = rd.next(should_stop); + + hide_loading(); std::string curr_content; + while (result) { if (should_stop()) { break; @@ -119,6 +151,11 @@ int main(int argc, char 
** argv) { return 1; } + if (params.conversation_mode == COMMON_CONVERSATION_MODE_ENABLED) { + LOG_ERR("--no-conversation is not supported by llama-cli\n"); + LOG_ERR("please use llama-completion instead\n"); + } + common_init(); // struct that contains llama context and inference @@ -146,13 +183,19 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG("Loading model...\n"); + LOG("Loading model... "); // followed by loading animation + ctx_cli.show_loading(); if (!ctx_cli.ctx_server.load_model(params)) { - LOG_ERR("Failed to load the model\n"); + ctx_cli.hide_loading(); + LOG_ERR("\nFailed to load the model\n"); return 1; } + ctx_cli.ctx_server.init(); + ctx_cli.hide_loading(); + LOG("\n"); + std::thread inference_thread([&ctx_cli]() { ctx_cli.ctx_server.start_loop(); }); @@ -280,6 +323,10 @@ int main(int argc, char ** argv) { {"content", assistant_content} }); LOG("\n"); + + if (params.single_turn) { + break; + } } LOG("\nExiting...\n"); From 0799d3084ec8964f9b0e1782770e7b323ae757ab Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 19:40:18 +0100 Subject: [PATCH 12/32] add --show-timings arg --- common/arg.cpp | 7 +++++ common/common.h | 3 ++- common/console.cpp | 3 +++ common/console.h | 1 + tools/cli/cli.cpp | 65 +++++++++++++++++++++++++++++++++++++--------- 5 files changed, 66 insertions(+), 13 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 22d50d378e4..fcd5a83b249 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1050,6 +1050,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.no_perf = true; } ).set_env("LLAMA_ARG_NO_PERF")); + add_opt(common_arg( + {"--show-timings"}, + string_format("show timing information after each response (default: %s)", params.show_timings ? 
"true" : "false"), + [](common_params & params) { + params.show_timings = true; + } + ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", diff --git a/common/common.h b/common/common.h index 96bbbff7a23..a15f7287b8a 100644 --- a/common/common.h +++ b/common/common.h @@ -395,7 +395,7 @@ struct common_params { bool usage = false; // print usage bool completion = false; // print source-able completion script - bool use_color = false; // use color to distinguish generations and inputs + bool use_color = true; // use color to distinguish generations and inputs bool special = false; // enable special token output bool interactive = false; // interactive mode bool interactive_first = false; // wait for user input immediately @@ -407,6 +407,7 @@ struct common_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool no_perf = false; // disable performance metrics + bool show_timings = false; // show timing information on CLI bool ctx_shift = false; // context shift on infinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) bool kv_unified = false; // enable unified KV cache diff --git a/common/console.cpp b/common/console.cpp index 202c0364a62..aac2893a75f 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -151,6 +151,9 @@ namespace console { case reset: fprintf(out, ANSI_COLOR_RESET); break; + case info: + fprintf(out, ANSI_COLOR_MAGENTA); + break; case prompt: fprintf(out, ANSI_COLOR_YELLOW); break; diff --git a/common/console.h b/common/console.h index 350242c0455..c2c2f9dceef 100644 --- a/common/console.h +++ b/common/console.h @@ -7,6 +7,7 @@ namespace console { enum display_t { reset = 0, + info, prompt, user_input, error diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 9d9f21c48c0..7534546db21 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -11,6 +11,12 @@ #include #include +// TODO: without doing this, the colors get messed up +#ifdef LOG +#undef LOG +#endif +#define LOG(...) 
fprintf(stdout, __VA_ARGS__) + constexpr int POLLING_SECONDS = 1; const char * LLAMA_ASCII_LOGO = R"( @@ -54,7 +60,9 @@ struct cli_context { defaults.n_keep = params.n_keep; defaults.n_predict = params.n_predict; defaults.antiprompt = params.antiprompt; + defaults.stream = true; // make sure we always use streaming mode + defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way // TODO: improve this mechanism later loading_display_thread = std::thread([this]() { @@ -70,6 +78,7 @@ struct cli_context { } void show_loading() { + fflush(stdout); loading_show.store(true); } @@ -79,7 +88,7 @@ struct cli_context { console::set_loading(false); } - std::string generate_completion() { + std::string generate_completion(result_timings & out_timings) { auto queues = ctx_server.get_queues(); server_response_reader rd(queues, POLLING_SECONDS); { @@ -114,11 +123,14 @@ struct cli_context { } auto res_partial = dynamic_cast(result.get()); if (res_partial) { + out_timings = std::move(res_partial->timings); curr_content += res_partial->content; LOG("%s", res_partial->content.c_str()); + fflush(stdout); } auto res_final = dynamic_cast(result.get()); if (res_final) { + out_timings = std::move(res_final->timings); break; } result = rd.next(should_stop); @@ -169,6 +181,8 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); + console::set_display(console::reset); + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; @@ -226,14 +240,15 @@ int main(int argc, char ** argv) { } LOG("\n"); LOG("available commands:\n"); - LOG(" /exit or Ctrl+C stop or exit\n"); - LOG(" /regen regenerate the last response\n"); - LOG(" /clear clear the chat history\n"); + LOG(" /exit or Ctrl+C stop or exit\n"); + LOG(" /regen regenerate the last response\n"); + LOG(" /clear clear the chat history\n"); + LOG(" /timings show timings for next responses\n"); if (inf.has_inp_image) { - LOG(" /image add an image file\n"); + LOG(" /image add an image file\n"); } if (inf.has_inp_audio) { - LOG(" /audio add an audio file\n"); + LOG(" /audio add an audio file\n"); } LOG("\n"); @@ -264,15 +279,16 @@ int main(int argc, char ** argv) { break; } - if (buffer.empty()) { - continue; - } - // remove trailing newline - if (buffer.back() == '\n') { + if (!buffer.empty() &&buffer.back() == '\n') { buffer.pop_back(); } + // skip empty messages + if (buffer.empty()) { + continue; + } + bool add_user_msg = true; // process commands @@ -304,6 +320,18 @@ int main(int argc, char ** argv) { cur_msg += marker; LOG("Loaded image from '%s'\n", fname.c_str()); continue; + } else if (string_starts_with(buffer, "/timings ")) { + std::string arg = string_strip(buffer.substr(9)); + if (arg == "on") { + params.show_timings = true; + LOG("Timings enabled.\n"); + } else if (arg == "off") { + params.show_timings = false; + LOG("Timings disabled.\n"); + } else { + LOG_ERR("Invalid argument for /timings: '%s'\n", arg.c_str()); + } + continue; } else { // not a command cur_msg += buffer; @@ -317,18 +345,31 @@ int main(int argc, char ** argv) { }); cur_msg.clear(); } - std::string assistant_content = ctx_cli.generate_completion(); + result_timings timings; + std::string assistant_content = ctx_cli.generate_completion(timings); ctx_cli.messages.push_back({ {"role", "assistant"}, {"content", assistant_content} }); LOG("\n"); + if (params.show_timings) { + 
console::set_display(console::info); + LOG("\n"); + LOG("Prompt: %.1f t/s | Generation: %.1f t/s\n", timings.prompt_per_second, timings.predicted_per_second); + console::set_display(console::reset); + } + if (params.single_turn) { break; } } + console::set_display(console::reset); + + // bump the log level to display timings + common_log_set_verbosity_thold(LOG_LEVEL_INFO); + LOG("\nExiting...\n"); ctx_cli.ctx_server.terminate(); inference_thread.join(); From fb252d721f72c7e09654a0e595f7e39da5162ceb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 23:44:57 +0100 Subject: [PATCH 13/32] add /read command, improve LOG_ERR --- tools/cli/cli.cpp | 49 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 7534546db21..f91ba92660a 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -1,7 +1,7 @@ #include "common.h" #include "arg.h" #include "console.h" -#include "log.h" +// #include "log.h" #include "server-context.h" #include "server-task.h" @@ -12,11 +12,22 @@ #include // TODO: without doing this, the colors get messed up +// the log.cpp doesn't play well with console.cpp, this should be fixed later #ifdef LOG #undef LOG #endif #define LOG(...) fprintf(stdout, __VA_ARGS__) +// redirect error logs to stdout in order to color them properly +#ifdef LOG_ERR +#undef LOG_ERR +#endif +#define LOG_ERR(...) do { \ + console::set_display(console::error); \ + LOG(__VA_ARGS__); \ + console::set_display(console::reset); \ + } while (0) + constexpr int POLLING_SECONDS = 1; const char * LLAMA_ASCII_LOGO = R"( @@ -63,6 +74,7 @@ struct cli_context { defaults.stream = true; // make sure we always use streaming mode defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way + // defaults.return_progress = true; // TODO: show progress // TODO: improve this mechanism later loading_display_thread = std::thread([this]() { @@ -141,16 +153,20 @@ struct cli_context { } // TODO: support remote files in the future (http, https, etc) - std::string load_input_files(const std::string & fname) { - input_files.clear(); + std::string load_input_file(const std::string & fname, bool is_media) { std::ifstream file(fname, std::ios::binary); if (!file) { return ""; } - raw_buffer buf; - buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - input_files.push_back(std::move(buf)); - return mtmd_default_marker(); + if (is_media) { + raw_buffer buf; + buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + input_files.push_back(std::move(buf)); + return mtmd_default_marker(); + } else { + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + return content; + } } }; @@ -197,7 +213,7 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG("Loading model... "); // followed by loading animation + LOG("\nLoading model... 
"); // followed by loading animation ctx_cli.show_loading(); if (!ctx_cli.ctx_server.load_model(params)) { ctx_cli.hide_loading(); @@ -244,6 +260,7 @@ int main(int argc, char ** argv) { LOG(" /regen regenerate the last response\n"); LOG(" /clear clear the chat history\n"); LOG(" /timings show timings for next responses\n"); + LOG(" /read add a text file\n"); if (inf.has_inp_image) { LOG(" /image add an image file\n"); } @@ -312,13 +329,23 @@ int main(int argc, char ** argv) { (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { // just in case (bad copy-paste for example), we strip all trailing/leading spaces std::string fname = string_strip(buffer.substr(7)); - std::string marker = ctx_cli.load_input_files(fname); + std::string marker = ctx_cli.load_input_file(fname, true); + if (marker.empty()) { + LOG_ERR("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + continue; + } + cur_msg += marker; + LOG("Loaded media from '%s'\n", fname.c_str()); + continue; + } else if (string_starts_with(buffer, "/read ")) { + std::string fname = string_strip(buffer.substr(6)); + std::string marker = ctx_cli.load_input_file(fname, false); if (marker.empty()) { LOG_ERR("file does not exist or cannot be opened: '%s'\n", fname.c_str()); continue; } cur_msg += marker; - LOG("Loaded image from '%s'\n", fname.c_str()); + LOG("Loaded text from '%s'\n", fname.c_str()); continue; } else if (string_starts_with(buffer, "/timings ")) { std::string arg = string_strip(buffer.substr(9)); @@ -329,7 +356,7 @@ int main(int argc, char ** argv) { params.show_timings = false; LOG("Timings disabled.\n"); } else { - LOG_ERR("Invalid argument for /timings: '%s'\n", arg.c_str()); + LOG_ERR("Invalid argument for /timings : '%s'\n", arg.c_str()); } continue; } else { From 9987ccb0430bfd0fc0b3dee5c291994d99184920 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 23:56:19 +0100 Subject: [PATCH 14/32] add args for speculative decoding, enable show timings by default --- common/arg.cpp | 50 +++++++++++++++++++++++------------------------ common/common.h | 2 +- tools/cli/cli.cpp | 18 +++-------------- 3 files changed, 29 insertions(+), 41 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index fcd5a83b249..c8c82f5e0fb 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -975,7 +975,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.n_ctx_checkpoints = value; } - ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--cache-ram", "-cram"}, "N", string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n" @@ -983,7 +983,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.cache_ram_mib = value; } - ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--kv-unified", "-kvu"}, string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n" @@ -1034,7 +1034,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.prompt = value; } - ).set_excludes({LLAMA_EXAMPLE_SERVER})); + 
).set_excludes({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-sys", "--system-prompt"}, "PROMPT", "system prompt to use with model (if applicable, depending on chat template)", @@ -1051,12 +1051,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_NO_PERF")); add_opt(common_arg( - {"--show-timings"}, - string_format("show timing information after each response (default: %s)", params.show_timings ? "true" : "false"), + {"--no-show-timings"}, + string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"), [](common_params & params) { - params.show_timings = true; + params.show_timings = false; } - ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS")); + ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", @@ -1068,7 +1068,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt.pop_back(); } } - ).set_excludes({LLAMA_EXAMPLE_SERVER})); + ).set_excludes({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-sysf", "--system-prompt-file"}, "FNAME", "a file containing the system prompt (default: none)", @@ -1105,7 +1105,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt = ss.str(); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); } - ).set_excludes({LLAMA_EXAMPLE_SERVER})); + ).set_excludes({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( {"-e", "--escape"}, string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), @@ -1920,7 +1920,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "override tensor buffer type for draft model", [](common_params & params, const std::string & value) { parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--cpu-moe", "-cmoe"}, "keep all Mixture of Experts (MoE) weights in the CPU", @@ -1949,7 +1949,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT")); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT")); add_opt(common_arg( {"--n-cpu-moe-draft", "-ncmoed"}, "N", "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model", @@ -1963,7 +1963,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()}); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT")); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT")); add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", string_format("max. 
number of layers to store in VRAM (default: %d)", params.n_gpu_layers), @@ -2442,7 +2442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.default_template_kwargs[item.key()] = item.value().dump(); } } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS")); add_opt(common_arg( {"-to", "--timeout"}, "N", string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), @@ -2867,14 +2867,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.speculative.n_max = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX")); add_opt(common_arg( {"--draft-min", "--draft-n-min"}, "N", string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), [](common_params & params, int value) { params.speculative.n_min = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN")); add_opt(common_arg( {"--draft-p-split"}, "P", string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), @@ -2888,14 +2888,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.speculative.p_min = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN")); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), [](common_params & params, int value) { params.speculative.n_ctx = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); add_opt(common_arg( {"-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" @@ -2903,7 +2903,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.speculative.devices = parse_device_list(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", @@ -2915,21 +2915,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, 
LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { params.speculative.model.path = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT")); add_opt(common_arg( {"--spec-replace"}, "TARGET", "DRAFT", "translate the string in TARGET into DRAFT if the draft model and main model are not compatible", [](common_params & params, const std::string & tgt, const std::string & dft) { params.speculative.replacements.push_back({ tgt, dft }); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-ctkd", "--cache-type-k-draft"}, "TYPE", string_format( @@ -3193,7 +3193,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_jinja = true; //params.default_template_kwargs["reasoning_effort"] = "\"high\""; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--gpt-oss-120b-default"}, @@ -3212,7 +3212,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_jinja = true; //params.default_template_kwargs["reasoning_effort"] = "\"high\""; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--vision-gemma-4b-default"}, @@ -3223,7 +3223,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.n_ctx = 0; params.use_jinja = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--vision-gemma-12b-default"}, @@ -3234,7 +3234,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.n_ctx = 0; params.use_jinja = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); return ctx_arg; } diff --git a/common/common.h b/common/common.h index a15f7287b8a..5a4e9e5d94d 100644 --- a/common/common.h +++ b/common/common.h @@ -407,7 +407,7 @@ struct common_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool no_perf = false; // disable performance metrics - bool show_timings = false; // show timing information on CLI + bool show_timings = true; // show timing information on CLI bool ctx_shift = false; // context shift on infinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) bool kv_unified = false; // enable unified KV cache diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index f91ba92660a..a99e526a6f9 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -179,7 +179,8 @@ int main(int argc, char ** argv) { return 1; } - if (params.conversation_mode == COMMON_CONVERSATION_MODE_ENABLED) { + // TODO: maybe support it later? 
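Note: the speculative-decoding options above are now also tagged with LLAMA_EXAMPLE_CLI, so the new chat CLI should accept a draft model. A rough sketch of such an invocation, where both model paths and the numeric values are placeholders rather than defaults:

```bash
# hypothetical run of the new CLI with a smaller draft model for speculative decoding
llama-cli -m target-model.gguf -md draft-model.gguf --draft-max 16 --draft-min 1 --draft-p-min 0.8
```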
+ if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) { LOG_ERR("--no-conversation is not supported by llama-cli\n"); LOG_ERR("please use llama-completion instead\n"); } @@ -259,7 +260,6 @@ int main(int argc, char ** argv) { LOG(" /exit or Ctrl+C stop or exit\n"); LOG(" /regen regenerate the last response\n"); LOG(" /clear clear the chat history\n"); - LOG(" /timings show timings for next responses\n"); LOG(" /read add a text file\n"); if (inf.has_inp_image) { LOG(" /image add an image file\n"); @@ -347,18 +347,6 @@ int main(int argc, char ** argv) { cur_msg += marker; LOG("Loaded text from '%s'\n", fname.c_str()); continue; - } else if (string_starts_with(buffer, "/timings ")) { - std::string arg = string_strip(buffer.substr(9)); - if (arg == "on") { - params.show_timings = true; - LOG("Timings enabled.\n"); - } else if (arg == "off") { - params.show_timings = false; - LOG("Timings disabled.\n"); - } else { - LOG_ERR("Invalid argument for /timings : '%s'\n", arg.c_str()); - } - continue; } else { // not a command cur_msg += buffer; @@ -383,7 +371,7 @@ int main(int argc, char ** argv) { if (params.show_timings) { console::set_display(console::info); LOG("\n"); - LOG("Prompt: %.1f t/s | Generation: %.1f t/s\n", timings.prompt_per_second, timings.predicted_per_second); + LOG("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); console::set_display(console::reset); } From 57b8d60cbc6f6308ce755bb8921ffb801702fb22 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 7 Dec 2025 00:03:49 +0100 Subject: [PATCH 15/32] add arg --image and --audio --- common/arg.cpp | 8 ++++---- tools/cli/cli.cpp | 11 ++++++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index c8c82f5e0fb..6b9c488cedb 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1034,7 +1034,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.prompt = value; } - ).set_excludes({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-sys", "--system-prompt"}, "PROMPT", "system prompt to use with model (if applicable, depending on chat template)", @@ -1068,7 +1068,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt.pop_back(); } } - ).set_excludes({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-sysf", "--system-prompt-file"}, "FNAME", "a file containing the system prompt (default: none)", @@ -1105,7 +1105,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt = ss.str(); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); } - ).set_excludes({LLAMA_EXAMPLE_COMPLETION})); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), @@ -1827,7 +1827,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.image.emplace_back(value); } - ).set_examples({LLAMA_EXAMPLE_MTMD})); + ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--image-min-tokens"}, "N", "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)", diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index a99e526a6f9..c09368430cc 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -284,8 +284,17 @@ int main(int argc, char ** argv) { } while (another_line); } else { // process input prompt from args + for (auto & fname : params.image) { + std::string marker = ctx_cli.load_input_file(fname, true); + if (marker.empty()) { + LOG_ERR("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + break; + } + LOG("Loaded media from '%s'\n", fname.c_str()); + cur_msg += marker; + } buffer = params.prompt; - LOG("\n> %s\n", buffer.c_str()); + LOG("\n> %s\n", buffer.c_str()); // TODO: maybe truncate if too long to display params.prompt.clear(); // only use it once } console::set_display(console::reset); From f193bbfd3b67234cd24782a1770213657c8c7e3b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 7 Dec 2025 11:29:53 +0100 Subject: [PATCH 16/32] fix windows build --- tools/cli/cli.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index c09368430cc..3b366a5ba72 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -11,6 +11,14 @@ #include #include +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + // TODO: without doing this, the colors get messed up // the log.cpp doesn't play well with console.cpp, this should be fixed later #ifdef LOG From fa95df053ea6969e158b1c5be45802930b814b8a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 7 Dec 2025 13:27:35 +0100 Subject: [PATCH 17/32] support reasoning_content --- common/console.cpp | 4 ++++ common/console.h | 1 + tools/cli/cli.cpp | 27 ++++++++++++++++++++++++--- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/common/console.cpp b/common/console.cpp index aac2893a75f..a6a730a573a 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -30,6 +30,7 @@ #define ANSI_COLOR_BLUE "\x1b[34m" #define ANSI_COLOR_MAGENTA "\x1b[35m" #define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_GRAY "\x1b[90m" #define ANSI_COLOR_RESET "\x1b[0m" #define ANSI_BOLD "\x1b[1m" @@ -157,6 +158,9 @@ namespace console { case prompt: fprintf(out, ANSI_COLOR_YELLOW); break; + case reasoning: + fprintf(out, ANSI_COLOR_GRAY); + break; case user_input: fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN); break; diff --git a/common/console.h b/common/console.h index c2c2f9dceef..565634136b3 100644 --- a/common/console.h +++ b/common/console.h @@ -9,6 +9,7 @@ namespace console { reset = 0, info, prompt, + reasoning, user_input, error }; diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 3b366a5ba72..47c6ee1ead7 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -83,6 +83,7 @@ struct cli_context { defaults.stream = true; // make sure we always use streaming mode defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way // defaults.return_progress = true; // TODO: show progress + defaults.oaicompat_chat_syntax.reasoning_format = 
COMMON_REASONING_FORMAT_DEEPSEEK; // TODO: improve this mechanism later loading_display_thread = std::thread([this]() { @@ -115,9 +116,11 @@ struct cli_context { // TODO: reduce some copies here in the future server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); task.id = queues.first.get_new_id(); + task.index = 0; task.params = defaults; // copy task.cli_input = messages; // copy task.cli_files = input_files; // copy + rd.set_states({task_result_state(defaults.oaicompat_chat_syntax)}); rd.post_task({std::move(task)}); } @@ -127,6 +130,7 @@ struct cli_context { hide_loading(); std::string curr_content; + bool is_thinking = false; while (result) { if (should_stop()) { @@ -144,9 +148,26 @@ struct cli_context { auto res_partial = dynamic_cast(result.get()); if (res_partial) { out_timings = std::move(res_partial->timings); - curr_content += res_partial->content; - LOG("%s", res_partial->content.c_str()); - fflush(stdout); + for (const auto & diff : res_partial->oaicompat_msg_diffs) { + if (!diff.content_delta.empty()) { + if (is_thinking) { + LOG("\n[End thinking]\n\n"); + console::set_display(console::reset); + is_thinking = false; + } + curr_content += diff.content_delta; + LOG("%s", diff.content_delta.c_str()); + } + if (!diff.reasoning_content_delta.empty()) { + console::set_display(console::reasoning); + if (!is_thinking) { + LOG("[Start thinking]\n"); + } + is_thinking = true; + LOG("%s", diff.reasoning_content_delta.c_str()); + } + fflush(stdout); + } } auto res_final = dynamic_cast(result.get()); if (res_final) { From 7d76234e6d504c46f84a6da2a0340fdd118cebdd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 7 Dec 2025 14:20:56 +0100 Subject: [PATCH 18/32] fix llama2c workflow --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ad205f3ec96..182d433b1b9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -243,7 +243,7 @@ jobs: echo "Fetch llama2c model" wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - name: Test llama2c (s390x) id: llama2c_test_s390x @@ -252,7 +252,7 @@ jobs: cd build echo "Fetch llama2c big-endian model" wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf - ./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest From 9b26375a06fc69ee21bde3107b54a87a20ba0283 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 7 Dec 2025 14:23:37 +0100 Subject: [PATCH 19/32] color default is auto --- common/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index a9f2e13a5b1..ad79f5b425c 100644 --- a/common/common.h +++ b/common/common.h @@ -395,7 +395,7 @@ struct common_params { bool usage = false; // print usage bool completion = false; // print source-able completion script - bool use_color = true; // use color to distinguish generations and inputs + bool use_color = false; // use color to 
distinguish generations and inputs bool special = false; // enable special token output bool interactive = false; // interactive mode bool interactive_first = false; // wait for user input immediately From a03b2af8f138268a639fff513dfba6957204001d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 9 Dec 2025 14:56:40 +0100 Subject: [PATCH 20/32] fix merge conflicts --- tools/cli/cli.cpp | 8 ++------ tools/server/server-queue.cpp | 6 ------ tools/server/server-queue.h | 6 ++++-- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 47c6ee1ead7..92bfcbdc2e6 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -36,8 +36,6 @@ console::set_display(console::reset); \ } while (0) -constexpr int POLLING_SECONDS = 1; - const char * LLAMA_ASCII_LOGO = R"( ▄▄ ▄▄ ██ ██ @@ -110,17 +108,15 @@ struct cli_context { } std::string generate_completion(result_timings & out_timings) { - auto queues = ctx_server.get_queues(); - server_response_reader rd(queues, POLLING_SECONDS); + server_response_reader rd = ctx_server.get_response_reader(); { // TODO: reduce some copies here in the future server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); - task.id = queues.first.get_new_id(); + task.id = rd.get_new_id(); task.index = 0; task.params = defaults; // copy task.cli_input = messages; // copy task.cli_files = input_files; // copy - rd.set_states({task_result_state(defaults.oaicompat_chat_syntax)}); rd.post_task({std::move(task)}); } diff --git a/tools/server/server-queue.cpp b/tools/server/server-queue.cpp index c4c213d3d4a..3cceb2bbe21 100644 --- a/tools/server/server-queue.cpp +++ b/tools/server/server-queue.cpp @@ -290,12 +290,6 @@ void server_response_reader::post_tasks(std::vector && tasks) { queue_tasks.post(std::move(tasks)); } -void server_response_reader::post_task(server_task && task) { - id_tasks = {task.id}; - queue_results.add_waiting_task_id(task.id); - queue_tasks.post(std::move(task)); -} - bool server_response_reader::has_next() const { return !cancelled && received_count < id_tasks.size(); } diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h index 39829136668..8780d7fe129 100644 --- a/tools/server/server-queue.h +++ b/tools/server/server-queue.h @@ -135,9 +135,11 @@ struct server_response_reader { stop(); } - void post_task(server_task && tasks); - void post_tasks(std::vector && tasks); + int get_new_id() { + return queue_tasks.get_new_id(); + } void post_task(server_task && task); + void post_tasks(std::vector && tasks); bool has_next() const; // return nullptr if should_stop() is true before receiving a result From 3bb5471ebec5c5c9f348a6d3a0187cc1bbaf84fb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 9 Dec 2025 15:01:21 +0100 Subject: [PATCH 21/32] properly fix color problem Co-authored-by: bandoti --- common/console.cpp | 4 +++- common/log.cpp | 17 +++++++++++++++++ common/log.h | 1 + tools/cli/cli.cpp | 17 ----------------- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/common/console.cpp b/common/console.cpp index b8fbe51e4ea..7b82b82ac83 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -1,4 +1,5 @@ #include "console.h" +#include "log.h" #include #include #include @@ -169,7 +170,8 @@ namespace console { // Keep track of current display and only emit ANSI code if it changes void set_display(display_t display) { if (advanced_display && current_display != display) { - fflush(stdout); + common_log_flush(common_log_main()); + fflush(out); switch(display) { case 
reset: fprintf(out, ANSI_COLOR_RESET); diff --git a/common/log.cpp b/common/log.cpp index 00a03f158d3..db52fe89677 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -151,6 +151,7 @@ struct common_log { std::mutex mtx; std::thread thrd; std::condition_variable cv; + std::condition_variable cv_flushed; FILE * file; @@ -265,6 +266,10 @@ struct common_log { cur = entries[head]; head = (head + 1) % entries.size(); + + if (head == tail) { + cv_flushed.notify_all(); + } } if (cur.is_end) { @@ -353,6 +358,14 @@ struct common_log { this->timestamps = timestamps; } + + void flush() { + if (!running) { + return; + } + std::unique_lock lock(mtx); + cv_flushed.wait(lock, [this]() { return head == tail; }); + } }; // @@ -420,6 +433,10 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) { log->set_timestamps(timestamps); } +void common_log_flush(struct common_log * log) { + log->flush(); +} + static int common_get_verbosity(enum ggml_log_level level) { switch (level) { case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG; diff --git a/common/log.h b/common/log.h index b24f5f000a6..954e88cb449 100644 --- a/common/log.h +++ b/common/log.h @@ -84,6 +84,7 @@ void common_log_set_file (struct common_log * log, const char * file); // n void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix +void common_log_flush (struct common_log * log); // flush all pending log messages // helper macros for logging // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 92bfcbdc2e6..276cd5e87ef 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -19,23 +19,6 @@ #include #endif -// TODO: without doing this, the colors get messed up -// the log.cpp doesn't play well with console.cpp, this should be fixed later -#ifdef LOG -#undef LOG -#endif -#define LOG(...) fprintf(stdout, __VA_ARGS__) - -// redirect error logs to stdout in order to color them properly -#ifdef LOG_ERR -#undef LOG_ERR -#endif -#define LOG_ERR(...) 
do { \ - console::set_display(console::error); \ - LOG(__VA_ARGS__); \ - console::set_display(console::reset); \ - } while (0) - const char * LLAMA_ASCII_LOGO = R"( ▄▄ ▄▄ ██ ██ From f00041db19160c832f9e33a568f9abd0e4bf1f56 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 9 Dec 2025 15:46:06 +0100 Subject: [PATCH 22/32] better loading spinner --- common/console.cpp | 85 ++++++++++++++++++++++++++++++---------------- common/console.h | 6 +++- common/log.cpp | 1 + common/log.h | 2 +- tools/cli/cli.cpp | 34 +++---------------- 5 files changed, 67 insertions(+), 61 deletions(-) diff --git a/common/console.cpp b/common/console.cpp index 7b82b82ac83..5e6a7fddf97 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -40,8 +43,6 @@ #define ANSI_COLOR_RESET "\x1b[0m" #define ANSI_BOLD "\x1b[1m" -static const char LOADING_CHARS[] = {'|', '/', '-', '\\'}; - namespace console { #if defined (_WIN32) @@ -67,8 +68,6 @@ namespace console { static bool advanced_display = false; static bool simple_io = true; - static bool loading_show = false; - static int loading_idx = 0; static display_t current_display = reset; static FILE* out = stdout; @@ -171,7 +170,6 @@ namespace console { void set_display(display_t display) { if (advanced_display && current_display != display) { common_log_flush(common_log_main()); - fflush(out); switch(display) { case reset: fprintf(out, ANSI_COLOR_RESET); @@ -1058,31 +1056,6 @@ namespace console { return multiline_input; } - void set_loading(bool enabled) { - if (!simple_io) { - if (!loading_show && enabled) { - // turn on loading - fputc(' ', out); - fflush(out); - } - - if (loading_show && !enabled) { - // turn off loading - replace_last(' '); - pop_cursor(); - fflush(out); - } - - loading_show = enabled; - - if (loading_show) { - loading_idx = (loading_idx + 1) % sizeof(LOADING_CHARS); - replace_last(LOADING_CHARS[loading_idx]); - fflush(out); - } - } - } - bool readline(std::string & line, bool multiline_input) { set_display(user_input); @@ -1092,4 +1065,56 @@ namespace console { return readline_advanced(line, multiline_input); } + namespace spinner { + static const char LOADING_CHARS[] = {'|', '/', '-', '\\'}; + static std::condition_variable cv_stop; + static std::thread th; + static size_t frame = 0; // only modified by one thread + static bool running = false; + static std::mutex mtx; + static auto wait_time = std::chrono::milliseconds(100); + static void draw_next_frame() { + // don't need lock because only one thread modifies running + frame = (frame + 1) % sizeof(LOADING_CHARS); + replace_last(LOADING_CHARS[frame]); + fflush(out); + } + void start() { + std::unique_lock lock(mtx); + if (simple_io || running) { + return; + } + common_log_flush(common_log_main()); + fprintf(out, "%c", LOADING_CHARS[0]); + fflush(out); + frame = 1; + running = true; + th = std::thread([]() { + std::unique_lock lock(mtx); + while (true) { + if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) { + break; + } + draw_next_frame(); + } + }); + } + void stop() { + { + std::unique_lock lock(mtx); + if (simple_io || !running) { + return; + } + running = false; + cv_stop.notify_all(); + } + if (th.joinable()) { + th.join(); + } + replace_last(' '); + pop_cursor(); + fflush(out); + } + } + } diff --git a/common/console.h b/common/console.h index 565634136b3..04098788585 100644 --- a/common/console.h +++ b/common/console.h @@ -17,6 +17,10 @@ namespace 
console { void init(bool use_simple_io, bool use_advanced_display); void cleanup(); void set_display(display_t display); - void set_loading(bool enabled); // update frame each time it's called bool readline(std::string & line, bool multiline_input); + + namespace spinner { + void start(); + void stop(); + } } diff --git a/common/log.cpp b/common/log.cpp index db52fe89677..6f01d6c02be 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -365,6 +365,7 @@ struct common_log { } std::unique_lock lock(mtx); cv_flushed.wait(lock, [this]() { return head == tail; }); + fflush(stdout); } }; diff --git a/common/log.h b/common/log.h index 954e88cb449..f0f8471b5f4 100644 --- a/common/log.h +++ b/common/log.h @@ -84,7 +84,7 @@ void common_log_set_file (struct common_log * log, const char * file); // n void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix -void common_log_flush (struct common_log * log); // flush all pending log messages +void common_log_flush (struct common_log * log); // flush all pending log messages // helper macros for logging // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 276cd5e87ef..2b66437dd96 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -52,7 +52,6 @@ struct cli_context { // thread for showing "loading" animation std::atomic loading_show; - std::thread loading_display_thread; cli_context(const common_params & params) { defaults.sampling = params.sampling; @@ -65,29 +64,6 @@ struct cli_context { defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way // defaults.return_progress = true; // TODO: show progress defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; - - // TODO: improve this mechanism later - loading_display_thread = std::thread([this]() { - while (true) { - if (loading_show.load()) { - // update loading frame - console::set_loading(true); - } - std::this_thread::sleep_for(std::chrono::milliseconds(150)); - } - }); - loading_display_thread.detach(); - } - - void show_loading() { - fflush(stdout); - loading_show.store(true); - } - - void hide_loading() { - loading_show.store(false); - // clear loading here in case the thread is sleeping - console::set_loading(false); } std::string generate_completion(result_timings & out_timings) { @@ -104,10 +80,10 @@ struct cli_context { } // wait for first result - show_loading(); + console::spinner::start(); server_task_result_ptr result = rd.next(should_stop); - hide_loading(); + console::spinner::stop(); std::string curr_content; bool is_thinking = false; @@ -223,16 +199,16 @@ int main(int argc, char ** argv) { #endif LOG("\nLoading model... 
"); // followed by loading animation - ctx_cli.show_loading(); + console::spinner::start(); if (!ctx_cli.ctx_server.load_model(params)) { - ctx_cli.hide_loading(); + console::spinner::stop(); LOG_ERR("\nFailed to load the model\n"); return 1; } ctx_cli.ctx_server.init(); - ctx_cli.hide_loading(); + console::spinner::stop(); LOG("\n"); std::thread inference_thread([&ctx_cli]() { From 7de769747224acaf71b1d87d1c6d32c435f46caa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 9 Dec 2025 15:53:34 +0100 Subject: [PATCH 23/32] make sure to clean color on force-exit --- tools/cli/cli.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 2b66437dd96..467ddb08a9c 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -38,6 +38,9 @@ static bool should_stop() { static void signal_handler(int) { if (g_is_interrupted.load()) { // second Ctrl+C - exit immediately + // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock) + fprintf(stdout, "\033[0m\n"); + fflush(stdout); std::exit(130); } g_is_interrupted.store(true); @@ -278,7 +281,11 @@ int main(int argc, char ** argv) { cur_msg += marker; } buffer = params.prompt; - LOG("\n> %s\n", buffer.c_str()); // TODO: maybe truncate if too long to display + if (buffer.size() > 500) { + LOG("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str()); + } else { + LOG("\n> %s\n", buffer.c_str()); + } params.prompt.clear(); // only use it once } console::set_display(console::reset); From 2731b4d6e16ca64a6220fab66853c54f1e783164 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 9 Dec 2025 15:55:56 +0100 Subject: [PATCH 24/32] also clear input files on "/clear" --- tools/cli/cli.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 467ddb08a9c..aa841f14c83 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -322,6 +322,7 @@ int main(int argc, char ** argv) { } } else if (string_starts_with(buffer, "/clear")) { ctx_cli.messages.clear(); + ctx_cli.input_files.clear(); LOG("Chat history cleared.\n"); continue; } else if ( From 6defee4ab435b7eb4d8d2b730a7bf3161f9ca44f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 10 Dec 2025 11:36:26 +0100 Subject: [PATCH 25/32] simplify common_log_flush --- common/log.cpp | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/common/log.cpp b/common/log.cpp index 6f01d6c02be..b17d2b62c35 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -151,7 +151,6 @@ struct common_log { std::mutex mtx; std::thread thrd; std::condition_variable cv; - std::condition_variable cv_flushed; FILE * file; @@ -266,10 +265,6 @@ struct common_log { cur = entries[head]; head = (head + 1) % entries.size(); - - if (head == tail) { - cv_flushed.notify_all(); - } } if (cur.is_end) { @@ -358,15 +353,6 @@ struct common_log { this->timestamps = timestamps; } - - void flush() { - if (!running) { - return; - } - std::unique_lock lock(mtx); - cv_flushed.wait(lock, [this]() { return head == tail; }); - fflush(stdout); - } }; // @@ -435,7 +421,8 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) { } void common_log_flush(struct common_log * log) { - log->flush(); + log->pause(); + log->resume(); } static int common_get_verbosity(enum ggml_log_level level) { From a7a3fbe840c1e621e52533c6f961584ab7d13141 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 10 Dec 2025 11:40:50 +0100 Subject: [PATCH 26/32] add warning in mtmd-cli 
--- tools/mtmd/mtmd-cli.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index b5bbc6536b5..c5dbad72e91 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -310,6 +310,9 @@ int main(int argc, char ** argv) { if (g_is_interrupted) return 130; + LOG_WRN("WARN: This is an experimental CLI for testing multimodal capability.\n"); + LOG_WRN(" For normal use cases, please use the standard llama-cli\n"); + if (is_single_turn) { g_is_generating = true; if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { From 9627d2c02ae6f61ab413d11803aae7c5dcd8ee55 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 10 Dec 2025 12:05:52 +0100 Subject: [PATCH 27/32] implement console writter --- common/console.cpp | 54 +++++++++++++++--------- common/console.h | 24 ++++++----- tools/cli/cli.cpp | 102 +++++++++++++++++++++++---------------------- 3 files changed, 101 insertions(+), 79 deletions(-) diff --git a/common/console.cpp b/common/console.cpp index 5e6a7fddf97..2bdcff9f754 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -66,17 +66,17 @@ namespace console { // #endif - static bool advanced_display = false; - static bool simple_io = true; - static display_t current_display = reset; + static bool advanced_display = false; + static bool simple_io = true; + static display_type current_display = DISPLAY_TYPE_RESET; - static FILE* out = stdout; + static FILE* out = stdout; #if defined (_WIN32) - static void* hConsole; + static void* hConsole; #else - static FILE* tty = nullptr; - static termios initial_state; + static FILE* tty = nullptr; + static termios initial_state; #endif // @@ -147,7 +147,7 @@ namespace console { void cleanup() { // Reset console display - set_display(reset); + set_display(DISPLAY_TYPE_RESET); #if !defined(_WIN32) // Restore settings on POSIX systems @@ -167,26 +167,26 @@ namespace console { // // Keep track of current display and only emit ANSI code if it changes - void set_display(display_t display) { + void set_display(display_type display) { if (advanced_display && current_display != display) { common_log_flush(common_log_main()); switch(display) { - case reset: + case DISPLAY_TYPE_RESET: fprintf(out, ANSI_COLOR_RESET); break; - case info: + case DISPLAY_TYPE_INFO: fprintf(out, ANSI_COLOR_MAGENTA); break; - case prompt: + case DISPLAY_TYPE_PROMPT: fprintf(out, ANSI_COLOR_YELLOW); break; - case reasoning: + case DISPLAY_TYPE_REASONING: fprintf(out, ANSI_COLOR_GRAY); break; - case user_input: + case DISPLAY_TYPE_USER_INPUT: fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN); break; - case error: + case DISPLAY_TYPE_ERROR: fprintf(out, ANSI_BOLD ANSI_COLOR_RED); } current_display = display; @@ -789,7 +789,6 @@ namespace console { } if (is_special_char) { - set_display(user_input); replace_last(line.back()); is_special_char = false; } @@ -972,7 +971,6 @@ namespace console { } if (!line.empty() && (line.back() == '\\' || line.back() == '/')) { - set_display(prompt); replace_last(line.back()); is_special_char = true; } @@ -1057,8 +1055,6 @@ namespace console { } bool readline(std::string & line, bool multiline_input) { - set_display(user_input); - if (simple_io) { return readline_simple(line, multiline_input); } @@ -1117,4 +1113,24 @@ namespace console { } } + void log(const char * fmt, ...) { + va_list args; + va_start(args, fmt); + vfprintf(out, fmt, args); + va_end(args); + } + + void error(const char * fmt, ...) 
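    // like log(), but prints with the error display type and then restores
    // whichever display type was active before the call (see the body below);
    // it writes straight to stdout, so it is intended for the CLI thread only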
{ + va_list args; + va_start(args, fmt); + display_type cur = current_display; + set_display(DISPLAY_TYPE_ERROR); + vfprintf(out, fmt, args); + set_display(cur); // restore previous color + va_end(args); + } + + void flush() { + fflush(out); + } } diff --git a/common/console.h b/common/console.h index 04098788585..ff83f96dc14 100644 --- a/common/console.h +++ b/common/console.h @@ -4,23 +4,27 @@ #include -namespace console { - enum display_t { - reset = 0, - info, - prompt, - reasoning, - user_input, - error - }; +enum display_type { + DISPLAY_TYPE_RESET = 0, + DISPLAY_TYPE_INFO, + DISPLAY_TYPE_PROMPT, + DISPLAY_TYPE_REASONING, + DISPLAY_TYPE_USER_INPUT, + DISPLAY_TYPE_ERROR +}; +namespace console { void init(bool use_simple_io, bool use_advanced_display); void cleanup(); - void set_display(display_t display); + void set_display(display_type display); bool readline(std::string & line, bool multiline_input); namespace spinner { void start(); void stop(); } + + void log(const char * fmt, ...); + void error(const char * fmt, ...); + void flush(); } diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index aa841f14c83..07b09d50d9c 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -97,9 +97,9 @@ struct cli_context { if (result->is_error()) { json err_data = result->to_json(); if (err_data.contains("message")) { - LOG_ERR("Error: %s\n", err_data["message"].get().c_str()); + console::error("Error: %s\n", err_data["message"].get().c_str()); } else { - LOG_ERR("Error: %s\n", err_data.dump().c_str()); + console::error("Error: %s\n", err_data.dump().c_str()); } return curr_content; } @@ -109,20 +109,22 @@ struct cli_context { for (const auto & diff : res_partial->oaicompat_msg_diffs) { if (!diff.content_delta.empty()) { if (is_thinking) { - LOG("\n[End thinking]\n\n"); - console::set_display(console::reset); + console::log("\n[End thinking]\n\n"); + console::set_display(DISPLAY_TYPE_RESET); is_thinking = false; } curr_content += diff.content_delta; - LOG("%s", diff.content_delta.c_str()); + console::log("%s", diff.content_delta.c_str()); + console::flush(); } if (!diff.reasoning_content_delta.empty()) { - console::set_display(console::reasoning); + console::set_display(DISPLAY_TYPE_REASONING); if (!is_thinking) { - LOG("[Start thinking]\n"); + console::log("[Start thinking]\n"); } is_thinking = true; - LOG("%s", diff.reasoning_content_delta.c_str()); + console::log("%s", diff.reasoning_content_delta.c_str()); + console::flush(); } fflush(stdout); } @@ -168,8 +170,8 @@ int main(int argc, char ** argv) { // TODO: maybe support it later? if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) { - LOG_ERR("--no-conversation is not supported by llama-cli\n"); - LOG_ERR("please use llama-completion instead\n"); + console::error("--no-conversation is not supported by llama-cli\n"); + console::error("please use llama-completion instead\n"); } common_init(); @@ -185,7 +187,7 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); - console::set_display(console::reset); + console::set_display(DISPLAY_TYPE_RESET); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; @@ -201,18 +203,18 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG("\nLoading model... "); // followed by loading animation + console::log("\nLoading model... 
"); // followed by loading animation console::spinner::start(); if (!ctx_cli.ctx_server.load_model(params)) { console::spinner::stop(); - LOG_ERR("\nFailed to load the model\n"); + console::error("\nFailed to load the model\n"); return 1; } ctx_cli.ctx_server.init(); console::spinner::stop(); - LOG("\n"); + console::log("\n"); std::thread inference_thread([&ctx_cli]() { ctx_cli.ctx_server.start_loop(); @@ -234,35 +236,35 @@ int main(int argc, char ** argv) { }); } - LOG("\n"); - LOG("%s\n", LLAMA_ASCII_LOGO); - LOG("build : %s\n", inf.build_info.c_str()); - LOG("model : %s\n", inf.model_name.c_str()); - LOG("modalities : %s\n", modalities.c_str()); + console::log("\n"); + console::log("%s\n", LLAMA_ASCII_LOGO); + console::log("build : %s\n", inf.build_info.c_str()); + console::log("model : %s\n", inf.model_name.c_str()); + console::log("modalities : %s\n", modalities.c_str()); if (!params.system_prompt.empty()) { - LOG("using custom system prompt\n"); + console::log("using custom system prompt\n"); } - LOG("\n"); - LOG("available commands:\n"); - LOG(" /exit or Ctrl+C stop or exit\n"); - LOG(" /regen regenerate the last response\n"); - LOG(" /clear clear the chat history\n"); - LOG(" /read add a text file\n"); + console::log("\n"); + console::log("available commands:\n"); + console::log(" /exit or Ctrl+C stop or exit\n"); + console::log(" /regen regenerate the last response\n"); + console::log(" /clear clear the chat history\n"); + console::log(" /read add a text file\n"); if (inf.has_inp_image) { - LOG(" /image add an image file\n"); + console::log(" /image add an image file\n"); } if (inf.has_inp_audio) { - LOG(" /audio add an audio file\n"); + console::log(" /audio add an audio file\n"); } - LOG("\n"); + console::log("\n"); // interactive loop std::string cur_msg; while (true) { std::string buffer; - console::set_display(console::user_input); + console::set_display(DISPLAY_TYPE_USER_INPUT); if (params.prompt.empty()) { - LOG("\n> "); + console::log("\n> "); std::string line; bool another_line = true; do { @@ -274,22 +276,22 @@ int main(int argc, char ** argv) { for (auto & fname : params.image) { std::string marker = ctx_cli.load_input_file(fname, true); if (marker.empty()) { - LOG_ERR("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); break; } - LOG("Loaded media from '%s'\n", fname.c_str()); + console::log("Loaded media from '%s'\n", fname.c_str()); cur_msg += marker; } buffer = params.prompt; if (buffer.size() > 500) { - LOG("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str()); + console::log("\n> %s ... 
(truncated)\n", buffer.substr(0, 500).c_str()); } else { - LOG("\n> %s\n", buffer.c_str()); + console::log("\n> %s\n", buffer.c_str()); } params.prompt.clear(); // only use it once } - console::set_display(console::reset); - LOG("\n"); + console::set_display(DISPLAY_TYPE_RESET); + console::log("\n"); if (should_stop()) { g_is_interrupted.store(false); @@ -317,13 +319,13 @@ int main(int argc, char ** argv) { ctx_cli.messages.erase(last_idx); add_user_msg = false; } else { - LOG_ERR("No message to regenerate.\n"); + console::error("No message to regenerate.\n"); continue; } } else if (string_starts_with(buffer, "/clear")) { ctx_cli.messages.clear(); ctx_cli.input_files.clear(); - LOG("Chat history cleared.\n"); + console::log("Chat history cleared.\n"); continue; } else if ( (string_starts_with(buffer, "/image ") && inf.has_inp_image) || @@ -332,21 +334,21 @@ int main(int argc, char ** argv) { std::string fname = string_strip(buffer.substr(7)); std::string marker = ctx_cli.load_input_file(fname, true); if (marker.empty()) { - LOG_ERR("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); continue; } cur_msg += marker; - LOG("Loaded media from '%s'\n", fname.c_str()); + console::log("Loaded media from '%s'\n", fname.c_str()); continue; } else if (string_starts_with(buffer, "/read ")) { std::string fname = string_strip(buffer.substr(6)); std::string marker = ctx_cli.load_input_file(fname, false); if (marker.empty()) { - LOG_ERR("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); continue; } cur_msg += marker; - LOG("Loaded text from '%s'\n", fname.c_str()); + console::log("Loaded text from '%s'\n", fname.c_str()); continue; } else { // not a command @@ -367,13 +369,13 @@ int main(int argc, char ** argv) { {"role", "assistant"}, {"content", assistant_content} }); - LOG("\n"); + console::log("\n"); if (params.show_timings) { - console::set_display(console::info); - LOG("\n"); - LOG("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); - console::set_display(console::reset); + console::set_display(DISPLAY_TYPE_INFO); + console::log("\n"); + console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); + console::set_display(DISPLAY_TYPE_RESET); } if (params.single_turn) { @@ -381,12 +383,12 @@ int main(int argc, char ** argv) { } } - console::set_display(console::reset); + console::set_display(DISPLAY_TYPE_RESET); // bump the log level to display timings common_log_set_verbosity_thold(LOG_LEVEL_INFO); - LOG("\nExiting...\n"); + console::log("\nExiting...\n"); ctx_cli.ctx_server.terminate(); inference_thread.join(); llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); From d00d00a508d097f2d9811a25798cc20e9f609baa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 10 Dec 2025 12:14:15 +0100 Subject: [PATCH 28/32] fix data race --- tools/cli/cli.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 07b09d50d9c..8a8639207b8 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -126,7 +126,6 @@ struct cli_context { console::log("%s", diff.reasoning_content_delta.c_str()); console::flush(); } - fflush(stdout); } } auto res_final = dynamic_cast(result.get()); @@ -182,8 +181,7 @@ int main(int argc, char ** 
argv) { llama_backend_init(); llama_numa_init(params.numa); - // save choice to use color for later - // (note for later: this is a slightly awkward choice) + // TODO: avoid using atexit() here by making `console` a singleton console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); @@ -385,12 +383,12 @@ int main(int argc, char ** argv) { console::set_display(DISPLAY_TYPE_RESET); - // bump the log level to display timings - common_log_set_verbosity_thold(LOG_LEVEL_INFO); - console::log("\nExiting...\n"); ctx_cli.ctx_server.terminate(); inference_thread.join(); + + // bump the log level to display timings + common_log_set_verbosity_thold(LOG_LEVEL_INFO); llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); return 0; From e135b41616e6ca79910565bae90a78fbae45db84 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 10 Dec 2025 12:18:37 +0100 Subject: [PATCH 29/32] add attribute --- common/console.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/common/console.h b/common/console.h index ff83f96dc14..b80172ef970 100644 --- a/common/console.h +++ b/common/console.h @@ -2,6 +2,8 @@ #pragma once +#include "common.h" + #include enum display_type { @@ -24,7 +26,11 @@ namespace console { void stop(); } + LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) void log(const char * fmt, ...); + + LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) void error(const char * fmt, ...); + void flush(); } From ed3fe1934a0ff43dc6c9499ac25a3dbeaff5d789 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 10 Dec 2025 12:23:06 +0100 Subject: [PATCH 30/32] fix llama-completion and mtmd-cli --- tools/completion/completion.cpp | 12 ++++++------ tools/mtmd/mtmd-cli.cpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 2fda549c201..79581eacb5d 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -537,7 +537,7 @@ int main(int argc, char ** argv) { std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode // the first thing we will do is to output the prompt, so set color accordingly - console::set_display(console::prompt); + console::set_display(DISPLAY_TYPE_PROMPT); display = params.display_prompt; std::vector embd; @@ -582,9 +582,9 @@ int main(int argc, char ** argv) { const int skipped_tokens = (int) embd.size() - max_embd_size; embd.resize(max_embd_size); - console::set_display(console::error); + console::set_display(DISPLAY_TYPE_ERROR); LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); - console::set_display(console::reset); + console::set_display(DISPLAY_TYPE_RESET); } if (ga_n == 1) { @@ -766,7 +766,7 @@ int main(int argc, char ** argv) { // reset color to default if there is no pending user input if (input_echo && (int) embd_inp.size() == n_consumed) { - console::set_display(console::reset); + console::set_display(DISPLAY_TYPE_RESET); display = true; } @@ -862,7 +862,7 @@ int main(int argc, char ** argv) { } // color user input only - console::set_display(console::user_input); + console::set_display(DISPLAY_TYPE_USER_INPUT); display = params.display_prompt; std::string line; @@ -873,7 +873,7 @@ int main(int argc, char ** argv) { } while (another_line); // done taking input, reset color - console::set_display(console::reset); + console::set_display(DISPLAY_TYPE_RESET); display = true; if (buffer.empty()) { // Ctrl+D on empty line exits diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index c5dbad72e91..a75af406cd9 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -352,11 +352,11 @@ int main(int argc, char ** argv) { while (!g_is_interrupted) { g_is_generating = false; LOG("\n> "); - console::set_display(console::user_input); + console::set_display(DISPLAY_TYPE_USER_INPUT); std::string line; console::readline(line, false); if (g_is_interrupted) break; - console::set_display(console::reset); + console::set_display(DISPLAY_TYPE_RESET); line = string_strip(line); if (line.empty()) { continue; From 9fdf5977c2073ad747aa75c7fbfe5e29bc619403 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 10 Dec 2025 13:51:26 +0100 Subject: [PATCH 31/32] add some notes about console::log --- common/console.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common/console.h b/common/console.h index b80172ef970..fad6d395316 100644 --- a/common/console.h +++ b/common/console.h @@ -26,6 +26,11 @@ namespace console { void stop(); } + // note: the logging API below output directly to stdout + // it can negatively impact performance if used on inference thread + // only use in in a dedicated CLI thread + // for logging in inference thread, use log.h instead + LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) void log(const char * fmt, ...); From c5faae9a5ced9772403b8bf8dd02e6e6b46bb090 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 10 Dec 2025 13:52:07 +0100 Subject: [PATCH 32/32] fix compilation --- common/console.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/console.cpp b/common/console.cpp index 2bdcff9f754..2ea178f81ed 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN