diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp
index 390e9ec203..8d0762638b 100644
--- a/llamafile/flags.cpp
+++ b/llamafile/flags.cpp
@@ -65,6 +65,7 @@ const char *FLAG_prompt = nullptr;
 const char *FLAG_url_prefix = "";
 const char *FLAG_www_root = "/zip/www";
 double FLAG_token_rate = 1;
+float FLAG_decay_growth = .01;
 float FLAG_frequency_penalty = 0;
 float FLAG_presence_penalty = 0;
 float FLAG_reserve_tokens = .15;
@@ -72,6 +73,7 @@ float FLAG_temperature = .8;
 float FLAG_top_p = .95;
 int FLAG_batch = 256;
 int FLAG_ctx_size = 8192;
+int FLAG_decay_delay = 60 * 5;
 int FLAG_flash_attn = false;
 int FLAG_gpu = 0;
 int FLAG_http_ibuf_size = 5 * 1024 * 1024;
@@ -396,13 +398,6 @@ void llamafile_get_flags(int argc, char **argv) {
            continue;
        }

-        if (!strcmp(flag, "-s") || !strcmp(flag, "--slots")) {
-            if (i == argc)
-                missing("--slots");
-            FLAG_slots = atoi(argv[i++]);
-            continue;
-        }
-
        if (!strcmp(flag, "-m") || !strcmp(flag, "--model")) {
            if (i == argc)
                missing("--model");
@@ -482,6 +477,36 @@ void llamafile_get_flags(int argc, char **argv) {
            continue;
        }

+        //////////////////////////////////////////////////////////////////////
+        // resource management flags
+
+        if (!strcmp(flag, "-s") || !strcmp(flag, "--slots")) {
+            if (i == argc)
+                missing("--slots");
+            FLAG_slots = atoi(argv[i++]);
+            continue;
+        }
+
+        if (!strcmp(flag, "--decay-delay")) {
+            if (i == argc)
+                missing("--decay-delay");
+            int n = atoi(argv[i++]);
+            if (!(0 <= n && n <= 31536000))
+                error("--decay-delay INT must be between 0 and 31536000");
+            FLAG_decay_delay = n;
+            continue;
+        }
+
+        if (!strcmp(flag, "--decay-growth")) {
+            if (i == argc)
+                missing("--decay-growth");
+            float n = atof(argv[i++]);
+            if (!(isnormal(n) && n > 0))
+                error("--decay-growth FLOAT must be greater than 0");
+            FLAG_decay_growth = n;
+            continue;
+        }
+
        //////////////////////////////////////////////////////////////////////
        // cpu flags

diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h
index caf04f2dc3..fcc770bc08 100644
--- a/llamafile/llamafile.h
+++ b/llamafile/llamafile.h
@@ -35,12 +35,15 @@ extern const char *FLAG_prompt;
 extern const char *FLAG_url_prefix;
 extern const char *FLAG_www_root;
 extern double FLAG_token_rate;
+extern float FLAG_decay_growth;
 extern float FLAG_frequency_penalty;
 extern float FLAG_presence_penalty;
+extern float FLAG_reserve_tokens;
 extern float FLAG_temperature;
 extern float FLAG_top_p;
 extern int FLAG_batch;
 extern int FLAG_ctx_size;
+extern int FLAG_decay_delay;
 extern int FLAG_flash_attn;
 extern int FLAG_gpu;
 extern int FLAG_gpu;
@@ -49,7 +52,6 @@ extern int FLAG_http_obuf_size;
 extern int FLAG_keepalive;
 extern int FLAG_main_gpu;
 extern int FLAG_n_gpu_layers;
-extern float FLAG_reserve_tokens;
 extern int FLAG_slots;
 extern int FLAG_split_mode;
 extern int FLAG_threads;
diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp
index ed622c2d15..52d576b6de 100644
--- a/llamafile/server/client.cpp
+++ b/llamafile/server/client.cpp
@@ -228,6 +228,18 @@ Client::transport()
        }
    }

+    if (effective_ip_ != client_ip_) {
+        char name[17];
+        snprintf(name,
+                 sizeof(name),
+                 "%hhu.%hhu.%hhu.%hhu",
+                 effective_ip_ >> 24,
+                 effective_ip_ >> 16,
+                 effective_ip_ >> 8,
+                 effective_ip_);
+        set_thread_name(name);
+    }
+
    if (get_header("X-Priority") == "batch") {
        worker_->deprioritize();
    } else if (!effective_ip_trusted_) {
@@ -661,9 +673,10 @@ Client::dispatcher()
    }
    // get request-uri path
+    char method[9] = { 0 };
    std::string_view p1 = path();
-    if (FLAG_verbose >= 2)
-        SLOG("request path %.*s", (int)p1.size(), p1.data());
+    WRITE64LE(method, msg_.method);
+    SLOG("%s %.*s", method, (int)p1.size(), p1.data());
    if (!p1.starts_with(FLAG_url_prefix)) {
        SLOG("path prefix mismatch");
        return send_error(404);
    }
@@ -779,7 +792,8 @@ Client::dispatcher()
            return false;
        }
    }
-    SLOG("served %s", resolved_.c_str());
+    if (FLAG_verbose >= 1)
+        SLOG("served %s", resolved_.c_str());
    cleanup();
    return true;
 }
diff --git a/llamafile/server/main.1 b/llamafile/server/main.1
index d9679ca37b..4dcd8d895e 100644
--- a/llamafile/server/main.1
+++ b/llamafile/server/main.1
@@ -35,6 +35,26 @@ Specifies path of sqlite3 database.
 .Pp
 The default is
 .Pa ~/.llamafile/llamafile.sqlite3
+.It Fl ngl Ar N , Fl Fl gpu-layers Ar N , Fl Fl n-gpu-layers Ar N
+Specifies number of layers to offload to GPU.
+.Pp
+This flag must be passed in order to use GPU on systems with NVIDIA or
+AMD GPUs. If you're confident that you have enough VRAM, then you can
+pass
+.Fl ngl Ar 999
+to enable full offloading, since this number is automatically downtuned
+to however many layers the model has. If VRAM is limited, then
+the
+.Fl Fl verbose
+flag may be passed to learn how many layers the model has, e.g. 35,
+which can then be downtuned until the out-of-memory error goes away.
+.Pp
+On Apple Silicon systems with Metal, GPU offloading is enabled by
+default. Since these GPUs use unified memory, they're treated as having
+a single layer; therefore, any value higher than 1 is treated as
+1. You can pass
+.Fl ngl Ar 0
+to disable GPU offloading and run in CPU mode on Apple Metal systems.
 .It Fl l Ar HOSTPORT , Fl Fl listen Ar HOSTPORT
 Specifies the local [HOST:]PORT on which the HTTP server should listen.
 By default this is 0.0.0.0:8080 which means llamafiler will bind to port
@@ -58,6 +78,16 @@ resources, and control how much completion parallelism can happen.
 Please note that
 .Fl Fl ctx-size
 has a strong influence on how many slots can be created.
+.It Fl Fl decay-delay Ar INT
+Number of seconds a context window slot needs to be inactive before the
+system starts to strongly consider giving it to other clients. The
+default is 300, which is five minutes.
+.It Fl Fl decay-growth Ar FLOAT
+Sets slot decay growth factor. Context window slots are assigned in a
+least recently used fashion, based on the formula
+.EQ
+age + e sup {growth * (age - delay)}
+.EN
 .It Fl p Ar TEXT , Fl Fl prompt Ar TEXT , Fl Fl system-prompt Ar TEXT
 Specifies system prompt. This value is passed along to the web frontend.
 .It Fl Fl no-display-prompt
@@ -69,6 +99,11 @@ Specifies a URL prefix (subdirectory) under which the HTTP server will
 make the API accessible, e.g. /lamafiler. Useful when running
 llamafiler behind a reverse proxy such as NGINX or Redbean. By default,
 this is set to / (root).
+.It Fl Fl verbose
+Enable logging of diagnostic information. This flag is useful for
+learning more about the model and hardware. It can also be helpful for
+troubleshooting errors. We currently recommend that this flag be avoided
+in production since the llama.cpp logger may disrupt thread cancelation.
 .It Fl w Ar N , Fl Fl workers Ar N
 Number of HTTP client handling threads.
 .It Fl Fl trust Ar CIDR
@@ -161,7 +196,7 @@ models do. If it's a base model, then the web ui will automatically use
 completion mode only, without needing to specify this flag. This flag is
 useful in cases where a prompt template is defined by the gguf, but it
 is desirable for the chat interface to be disabled.
-.It Fl Fl db-startup-sql
+.It Fl Fl db-startup-sql Ar CODE
 Specifies SQL code that should be executed whenever connecting to the
 SQLite database. The default is the following code, which enables the
 write-ahead log.
@@ -169,14 +204,14 @@ write-ahead log.
 PRAGMA journal_mode=WAL;
 PRAGMA synchronous=NORMAL;
 .Ed
-.It Fl Fl reserve-tokens
+.It Fl Fl reserve-tokens Ar N
 Percent of context window to reserve for predicted tokens. When the
 server runs out of context window, old chat messages will be forgotten
 until this percent of the context is empty. The default is 15%. If this
 is specified as a floating point number, e.g. 0.15, then it'll be
 multiplied by 100 to get the percent.
 .El
-.Sh EXAMPLE
+.Sh EXAMPLES
 Here's an example of how you might start this server:
 .Pp
 .Dl "llamafiler -m all-MiniLM-L6-v2.F32.gguf"
diff --git a/llamafile/server/main.1.asc b/llamafile/server/main.1.asc
index fd8b41b846..ae91c8bb05 100644
--- a/llamafile/server/main.1.asc
+++ b/llamafile/server/main.1.asc
@@ -37,6 +37,23 @@ OOPPTTIIOONNSS

      The default is _~_/_._l_l_a_m_a_f_i_l_e_/_l_l_a_m_a_f_i_l_e_._s_q_l_i_t_e_3

+     --nnggll _N, ----ggppuu--llaayyeerrss _N, ----nn--ggppuu--llaayyeerrss _N
+             Specifies number of layers to offload to GPU.
+
+             This flag must be passed in order to use GPU on systems with NVIDIA
+             or AMD GPUs. If you're confident that you have enough VRAM, then
+             you can pass --nnggll _9_9_9 to enable full offloading, since this number
+             is automatically downtuned to however many layers the
+             model has. If VRAM is limited, then the ----vveerrbboossee flag may be
+             passed to learn how many layers the model has, e.g. 35, which can
+             then be downtuned until the out-of-memory error goes away.
+
+             On Apple Silicon systems with Metal, GPU offloading is enabled by
+             default. Since these GPUs use unified memory, they're treated as
+             having a single layer; therefore, any value higher than 1 is
+             treated as 1. You can pass --nnggll _0 to disable GPU offloading and
+             run in CPU mode on Apple Metal systems.
+
      --ll _H_O_S_T_P_O_R_T, ----lliisstteenn _H_O_S_T_P_O_R_T
              Specifies the local [HOST:]PORT on which the HTTP server should
              listen. By default this is 0.0.0.0:8080 which means llamafiler
@@ -63,6 +80,16 @@ OOPPTTIIOONNSS
              parallelism can happen. Please note that ----ccttxx--ssiizzee has a strong
              influence on how many slots can be created.

+     ----ddeeccaayy--ddeellaayy _I_N_T
+             Number of seconds a context window slot needs to be inactive before
+             the system starts to strongly consider giving it to other clients.
+             The default is 300, which is five minutes.
+
+     ----ddeeccaayy--ggrroowwtthh _F_L_O_A_T
+             Sets slot decay growth factor. Context window slots are assigned in
+             a least recently used fashion, based on the formula _a_g_e + _e^(_g_r_o_w_t_h
+             * (_a_g_e - _d_e_l_a_y))
+
      --pp _T_E_X_T, ----pprroommpptt _T_E_X_T, ----ssyysstteemm--pprroommpptt _T_E_X_T
              Specifies system prompt. This value is passed along to the web
              frontend.
@@ -79,6 +106,13 @@ OOPPTTIIOONNSS
              llamafiler behind a reverse proxy such as NGINX or Redbean. By
              default, this is set to / (root).

+     ----vveerrbboossee
+             Enable logging of diagnostic information. This flag is useful for
+             learning more about the model and hardware. It can also be helpful
+             for troubleshooting errors. We currently recommend that this flag
+             be avoided in production since the llama.cpp logger may disrupt
+             thread cancelation.
+
      --ww _N, ----wwoorrkkeerrss _N
              Number of HTTP client handling threads.

@@ -193,7 +227,7 @@ OOPPTTIIOONNSS
              defined by the gguf, but it is desirable for the chat interface to
              be disabled.

-     ----ddbb--ssttaarrttuupp--ssqqll
+     ----ddbb--ssttaarrttuupp--ssqqll _C_O_D_E
              Specifies SQL code that should be executed whenever connecting to
              the SQLite database. The default is the following code, which
              enables the write-ahead log.
@@ -201,14 +235,14 @@ OOPPTTIIOONNSS
            PRAGMA journal_mode=WAL;
            PRAGMA synchronous=NORMAL;

-     ----rreesseerrvvee--ttookkeennss
+     ----rreesseerrvvee--ttookkeennss _N
              Percent of context window to reserve for predicted tokens. When
              the server runs out of context window, old chat messages will be
              forgotten until this percent of the context is empty. The default
              is 15%. If this is specified as a floating point number, e.g.
              0.15, then it'll be multiplied by 100 to get the percent.

-EEXXAAMMPPLLEE
+EEXXAAMMPPLLEESS
      Here's an example of how you might start this server:

            llamafiler -m all-MiniLM-L6-v2.F32.gguf
diff --git a/llamafile/server/main.cpp b/llamafile/server/main.cpp
index c57882b17f..5b0a5b4050 100644
--- a/llamafile/server/main.cpp
+++ b/llamafile/server/main.cpp
@@ -65,7 +65,8 @@ main(int argc, char* argv[])

    // we must disable the llama.cpp logger
    // otherwise pthread_cancel() will cause deadlocks
-    FLAG_log_disable = true;
+    if (!llamafile_has(argv, "--verbose"))
+        FLAG_log_disable = true;

    // load model
    llama_model_params mparams = {
diff --git a/llamafile/server/server.cpp b/llamafile/server/server.cpp
index 67f0288460..29da798051 100644
--- a/llamafile/server/server.cpp
+++ b/llamafile/server/server.cpp
@@ -128,14 +128,19 @@ Server::accept(unsigned* out_ip)

    // set name
    char name[17];
+    int port = ntohs(clientaddr.sin_port);
    unsigned ip = ntohl(clientaddr.sin_addr.s_addr);
-    snprintf(name,
-             sizeof(name),
-             "%hhu.%hhu.%hhu.%hhu",
-             ip >> 24,
-             ip >> 16,
-             ip >> 8,
-             ip);
+    if (ip == 0x7f000001) {
+        snprintf(name, sizeof(name), "%hu", port);
+    } else {
+        snprintf(name,
+                 sizeof(name),
+                 "%hhu.%hhu.%hhu.%hhu",
+                 ip >> 24,
+                 ip >> 16,
+                 ip >> 8,
+                 ip);
+    }
    set_thread_name(name);

    // keep sockets open
diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp
index 304c5a0f24..c57ca2541c 100644
--- a/llamafile/server/slot.cpp
+++ b/llamafile/server/slot.cpp
@@ -79,9 +79,10 @@ Slot::describe_error(int err)
    }
 }

-Slot::Slot(llama_model* model) : model_(model)
+Slot::Slot(int id, llama_model* model) : id_(id), model_(model)
 {
    dll_init(&elem_);
+    last_used_ = time(0);
 }

 Slot::~Slot()
diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h
index 9fe26afb8c..7fdd7bf881 100644
--- a/llamafile/server/slot.h
+++ b/llamafile/server/slot.h
@@ -17,6 +17,7 @@
 #pragma once
 #include <pthread.h>
+#include <time.h>
 #include <string>
 #include <string_view>
 #include <vector>
@@ -49,7 +50,9 @@ struct Slot

    static const char* describe_error(int);

+    int id_;
    Dll elem_;
+    time_t last_used_;
    llama_model* model_;
    clip_ctx* clip_ctx_ = nullptr;
    llama_context* ctx_ = nullptr;
@@ -57,7 +60,7 @@ struct Slot
    std::string system_fingerprint_;

    ~Slot();
-    explicit Slot(llama_model*);
+    Slot(int, llama_model*);
    int ctx_size() const;
    int ctx_used() const;
    bool start();
diff --git a/llamafile/server/slots.cpp b/llamafile/server/slots.cpp
index 1bc275c459..7506f5ec90 100644
--- a/llamafile/server/slots.cpp
+++ b/llamafile/server/slots.cpp
@@ -16,12 +16,17 @@
 // limitations under the License.
#include "slots.h" +#include "llamafile/llamafile.h" +#include "llamafile/macros.h" #include "llamafile/server/atom.h" #include "llamafile/server/log.h" #include "llamafile/server/slot.h" #include "llamafile/server/slot_entry.h" #include "llamafile/vector.h" +#include #include +#include +#include namespace lf { namespace server { @@ -50,7 +55,7 @@ Slots::start(int count) int made = 0; pthread_mutex_lock(&lock_); for (int i = 0; i < count; ++i) { - Slot* slot = new Slot(model_); + Slot* slot = new Slot(i, model_); if (slot->start()) { ++made; slots_.emplace_back(slot); @@ -68,19 +73,52 @@ Slots::start(int count) } Slot* -Slots::take(const std::vector& prefix) +Slots::take(const std::vector& atoms) { pthread_mutex_lock(&lock_); for (;;) { - // find slot with longest matching prefix - // favoring least recently used if multiple ones - int best_cpl = 0; + // find best slot + // iteration order favors lru + time_t now = time(0); Dll* best_slot = nullptr; + double best_score = INT_MIN; for (Dll* e = dll_first(free_slots_); e; e = dll_next(free_slots_, e)) { - int cpl = vector_common_prefix_length(SLOT(e)->history_, prefix); - if (cpl >= best_cpl) { - best_cpl = cpl; + + // least recently used is good + int age = now - SLOT(e)->last_used_; + double decay = + age + exp(FLAG_decay_growth * (age - FLAG_decay_delay)); + + // common prefix length is good + int cpl = vector_common_prefix_length(SLOT(e)->history_, atoms); + + // common suffix length is good + int csl = 0; + int size = SLOT(e)->history_.size(); + for (int i = cpl + 1; i < size; ++i) { + if (size - i > atoms.size() - cpl) + continue; + if (std::equal(SLOT(e)->history_.begin() + i, + SLOT(e)->history_.end(), + atoms.begin() + cpl)) { + csl = size - i; + break; + } + } + + // discarded atoms is bad + int discard; + if (csl) { + discard = 0; + } else { + discard = size - cpl; + } + + // tally up score to determine best + double score = cpl + csl + decay - discard; + if (score >= best_score) { + best_score = score; best_slot = e; } } @@ -89,6 +127,9 @@ Slots::take(const std::vector& prefix) if (best_slot) { dll_remove(&free_slots_, best_slot); pthread_mutex_unlock(&lock_); + SLOG("acquired slot #%d with score %d", + SLOT(best_slot)->id_, + (int)MIN(INT_MAX, best_score)); return SLOT(best_slot); } @@ -101,8 +142,9 @@ Slots::take(const std::vector& prefix) void Slots::give(Slot* slot) { - SLOG("relinquishing slot"); unassert(slot); + SLOG("relinquishing slot #%d", slot->id_); + slot->last_used_ = time(0); pthread_mutex_lock(&lock_); dll_make_first(&free_slots_, &slot->elem_); pthread_cond_signal(&cond_); diff --git a/llamafile/server/v1_chat_completions.cpp b/llamafile/server/v1_chat_completions.cpp index aa7f65ee8e..659815ee62 100644 --- a/llamafile/server/v1_chat_completions.cpp +++ b/llamafile/server/v1_chat_completions.cpp @@ -652,6 +652,9 @@ Client::v1_chat_completions() } } choice["finish_reason"] = finish_reason; + SLOG("predicted %d tokens finished on %s", // + completion_tokens, + finish_reason); // finalize response cleanup_slot(this);