diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp
index 390e9ec203..8d0762638b 100644
--- a/llamafile/flags.cpp
+++ b/llamafile/flags.cpp
@@ -65,6 +65,7 @@ const char *FLAG_prompt = nullptr;
 const char *FLAG_url_prefix = "";
 const char *FLAG_www_root = "/zip/www";
 double FLAG_token_rate = 1;
+float FLAG_decay_growth = .01;
 float FLAG_frequency_penalty = 0;
 float FLAG_presence_penalty = 0;
 float FLAG_reserve_tokens = .15;
@@ -72,6 +73,7 @@ float FLAG_temperature = .8;
 float FLAG_top_p = .95;
 int FLAG_batch = 256;
 int FLAG_ctx_size = 8192;
+int FLAG_decay_delay = 60 * 5;
 int FLAG_flash_attn = false;
 int FLAG_gpu = 0;
 int FLAG_http_ibuf_size = 5 * 1024 * 1024;
@@ -396,13 +398,6 @@ void llamafile_get_flags(int argc, char **argv) {
            continue;
        }

-        if (!strcmp(flag, "-s") || !strcmp(flag, "--slots")) {
-            if (i == argc)
-                missing("--slots");
-            FLAG_slots = atoi(argv[i++]);
-            continue;
-        }
-
        if (!strcmp(flag, "-m") || !strcmp(flag, "--model")) {
            if (i == argc)
                missing("--model");
@@ -482,6 +477,36 @@ void llamafile_get_flags(int argc, char **argv) {
            continue;
        }

+        //////////////////////////////////////////////////////////////////////
+        // resource management flags
+
+        if (!strcmp(flag, "-s") || !strcmp(flag, "--slots")) {
+            if (i == argc)
+                missing("--slots");
+            FLAG_slots = atoi(argv[i++]);
+            continue;
+        }
+
+        if (!strcmp(flag, "--decay-delay")) {
+            if (i == argc)
+                missing("--decay-delay");
+            int n = atoi(argv[i++]);
+            if (!(0 <= n && n <= 31536000))
+                error("--decay-delay INT must be between 0 and 31536000");
+            FLAG_decay_delay = n;
+            continue;
+        }
+
+        if (!strcmp(flag, "--decay-growth")) {
+            if (i == argc)
+                missing("--decay-growth");
+            float n = atof(argv[i++]);
+            if (!(isnormal(n) && n > 0))
+                error("--decay-growth FLOAT must be greater than 0");
+            FLAG_decay_growth = n;
+            continue;
+        }
+
        //////////////////////////////////////////////////////////////////////
        // cpu flags

diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h
index caf04f2dc3..fcc770bc08 100644
--- a/llamafile/llamafile.h
+++ b/llamafile/llamafile.h
@@ -35,12 +35,15 @@ extern const char *FLAG_prompt;
 extern const char *FLAG_url_prefix;
 extern const char *FLAG_www_root;
 extern double FLAG_token_rate;
+extern float FLAG_decay_growth;
 extern float FLAG_frequency_penalty;
 extern float FLAG_presence_penalty;
+extern float FLAG_reserve_tokens;
 extern float FLAG_temperature;
 extern float FLAG_top_p;
 extern int FLAG_batch;
 extern int FLAG_ctx_size;
+extern int FLAG_decay_delay;
 extern int FLAG_flash_attn;
 extern int FLAG_gpu;
 extern int FLAG_gpu;
@@ -49,7 +52,6 @@ extern int FLAG_http_obuf_size;
 extern int FLAG_keepalive;
 extern int FLAG_main_gpu;
 extern int FLAG_n_gpu_layers;
-extern float FLAG_reserve_tokens;
 extern int FLAG_slots;
 extern int FLAG_split_mode;
 extern int FLAG_threads;
diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp
index ed622c2d15..52d576b6de 100644
--- a/llamafile/server/client.cpp
+++ b/llamafile/server/client.cpp
@@ -228,6 +228,18 @@ Client::transport()
        }
    }

+    if (effective_ip_ != client_ip_) {
+        char name[17];
+        snprintf(name,
+                 sizeof(name),
+                 "%hhu.%hhu.%hhu.%hhu",
+                 effective_ip_ >> 24,
+                 effective_ip_ >> 16,
+                 effective_ip_ >> 8,
+                 effective_ip_);
+        set_thread_name(name);
+    }
+
    if (get_header("X-Priority") == "batch") {
        worker_->deprioritize();
    } else if (!effective_ip_trusted_) {
@@ -661,9 +673,10 @@ Client::dispatcher()
    }
    // get request-uri path
+    char method[9] = { 0 };
    std::string_view p1 = path();
-    if (FLAG_verbose >= 2)
-        SLOG("request path %.*s", (int)p1.size(), p1.data());
+    WRITE64LE(method, msg_.method);
+    SLOG("%s %.*s", method, (int)p1.size(), p1.data());
    if (!p1.starts_with(FLAG_url_prefix)) {
        SLOG("path prefix mismatch");
        return send_error(404);
    }
@@ -779,7 +792,8 @@ Client::dispatcher()
            return false;
        }
    }
-    SLOG("served %s", resolved_.c_str());
+    if (FLAG_verbose >= 1)
+        SLOG("served %s", resolved_.c_str());
    cleanup();
    return true;
 }
diff --git a/llamafile/server/main.1 b/llamafile/server/main.1
index d9679ca37b..4dcd8d895e 100644
--- a/llamafile/server/main.1
+++ b/llamafile/server/main.1
@@ -35,6 +35,26 @@ Specifies path of sqlite3 database.
 .Pp
 The default is
 .Pa ~/.llamafile/llamafile.sqlite3
+.It Fl ngl Ar N , Fl Fl gpu-layers Ar N , Fl Fl n-gpu-layers Ar N
+Specifies number of layers to offload to GPU.
+.Pp
+This flag must be passed in order to use GPU on systems with NVIDIA or
+AMD GPUs. If you're confident that you have enough VRAM, then you can
+pass
+.Fl ngl Ar 999
+to enable full offloading, since this number is automatically downtuned
+to however many layers the model has. If VRAM is limited, then
+the
+.Fl Fl verbose
+flag may be passed to learn how many layers the model has, e.g. 35,
+which can then be downtuned until the out-of-memory error goes away.
+.Pp
+On Apple Silicon systems with Metal, GPU offloading is enabled by
+default. Since these GPUs use unified memory, they're treated as having
+a single layer; therefore, any value higher than 1 is treated as
+1. You can pass
+.Fl ngl Ar 0
+to disable GPU offloading and run in CPU mode on Apple Metal systems.
 .It Fl l Ar HOSTPORT , Fl Fl listen Ar HOSTPORT
 Specifies the local [HOST:]PORT on which the HTTP server should listen.
 By default this is 0.0.0.0:8080 which means llamafiler will bind to port
@@ -58,6 +78,16 @@ resources, and control how much completion parallelism can happen.
 Please note that
 .Fl Fl ctx-size
 has a strong influence on how many slots can be created.
+.It Fl Fl decay-delay Ar INT
+Number of seconds a context window slot needs to be inactive before the
+system starts to strongly consider giving it to other clients. The
+default is 300, which is five minutes.
+.It Fl Fl decay-growth Ar FLOAT
+Sets slot decay growth factor. Context window slots are assigned in a
+least recently used fashion, based on the formula
+.EQ
+age + e sup {growth * (age - delay)}
+.EN
 .It Fl p Ar TEXT , Fl Fl prompt Ar TEXT , Fl Fl system-prompt Ar TEXT
 Specifies system prompt. This value is passed along to the web frontend.
 .It Fl Fl no-display-prompt
@@ -69,6 +99,11 @@ Specifies a URL prefix (subdirectory) under which the HTTP server will
 make the API accessible, e.g. /lamafiler. Useful when running
 llamafiler behind a reverse proxy such as NGINX or Redbean. By default,
 this is set to / (root).
+.It Fl Fl verbose
+Enable logging of diagnostic information. This flag is useful for
+learning more about the model and hardware. It can also be helpful for
+troubleshooting errors. We currently recommend that this flag be avoided
+in production since the llama.cpp logger may disrupt thread cancelation.
 .It Fl w Ar N , Fl Fl workers Ar N
 Number of HTTP client handling threads.
 .It Fl Fl trust Ar CIDR
@@ -161,7 +196,7 @@ models do. If it's a base model, then the web ui will automatically use
 completion mode only, without needing to specify this flag. This flag is
 useful in cases where a prompt template is defined by the gguf, but it
 is desirable for the chat interface to be disabled.
-.It Fl Fl db-startup-sql
+.It Fl Fl db-startup-sql Ar CODE
 Specifies SQL code that should be executed whenever connecting to the
 SQLite database. The default is the following code, which enables the
 write-ahead log.
@@ -169,14 +204,14 @@ write-ahead log.
 PRAGMA journal_mode=WAL;
 PRAGMA synchronous=NORMAL;
 .Ed
-.It Fl Fl reserve-tokens
+.It Fl Fl reserve-tokens Ar N
 Percent of context window to reserve for predicted tokens. When the
 server runs out of context window, old chat messages will be forgotten
 until this percent of the context is empty. The default is 15%. If this
 is specified as a floating point number, e.g. 0.15, then it'll be
 multiplied by 100 to get the percent.
 .El
-.Sh EXAMPLE
+.Sh EXAMPLES
 Here's an example of how you might start this server:
 .Pp
 .Dl "llamafiler -m all-MiniLM-L6-v2.F32.gguf"
diff --git a/llamafile/server/main.1.asc b/llamafile/server/main.1.asc
index fd8b41b846..ae91c8bb05 100644
--- a/llamafile/server/main.1.asc
+++ b/llamafile/server/main.1.asc
@@ -37,6 +37,23 @@ OOPPTTIIOONNSS

      The default is _~_/_._l_l_a_m_a_f_i_l_e_/_l_l_a_m_a_f_i_l_e_._s_q_l_i_t_e_3

+     --nnggll _N, ----ggppuu--llaayyeerrss _N, ----nn--ggppuu--llaayyeerrss _N
+             Specifies number of layers to offload to GPU.
+
+             This flag must be passed in order to use GPU on systems with NVIDIA
+             or AMD GPUs. If you're confident that you have enough VRAM, then
+             you can pass --nnggll _9_9_9 to enable full offloading, since this number
+             is automatically downtuned to however many layers the
+             model has. If VRAM is limited, then the ----vveerrbboossee flag may be
+             passed to learn how many layers the model has, e.g. 35, which can
+             then be downtuned until the out-of-memory error goes away.
+
+             On Apple Silicon systems with Metal, GPU offloading is enabled by
+             default. Since these GPUs use unified memory, they're treated as
+             having a single layer; therefore, any value higher than 1 is
+             treated as 1. You can pass --nnggll _0 to disable GPU offloading and
+             run in CPU mode on Apple Metal systems.
+
      --ll _H_O_S_T_P_O_R_T, ----lliisstteenn _H_O_S_T_P_O_R_T
              Specifies the local [HOST:]PORT on which the HTTP server should
              listen. By default this is 0.0.0.0:8080 which means llamafiler
@@ -63,6 +80,16 @@ OOPPTTIIOONNSS
              parallelism can happen. Please note that ----ccttxx--ssiizzee has a strong
              influence on how many slots can be created.

+     ----ddeeccaayy--ddeellaayy _I_N_T
+             Number of seconds a context window slot needs to be inactive before
+             the system starts to strongly consider giving it to other clients.
+             The default is 300, which is five minutes.
+
+     ----ddeeccaayy--ggrroowwtthh _F_L_O_A_T
+             Sets slot decay growth factor. Context window slots are assigned in
+             a least recently used fashion, based on the formula _a_g_e + _e^(_g_r_o_w_t_h
+             * (_a_g_e - _d_e_l_a_y))
+
      --pp _T_E_X_T, ----pprroommpptt _T_E_X_T, ----ssyysstteemm--pprroommpptt _T_E_X_T
              Specifies system prompt. This value is passed along to the web
              frontend.
@@ -79,6 +106,13 @@ OOPPTTIIOONNSS
              llamafiler behind a reverse proxy such as NGINX or Redbean. By
              default, this is set to / (root).

+     ----vveerrbboossee
+             Enable logging of diagnostic information. This flag is useful for
+             learning more about the model and hardware. It can also be helpful
+             for troubleshooting errors. We currently recommend that this flag
+             be avoided in production since the llama.cpp logger may disrupt
+             thread cancelation.
+
      --ww _N, ----wwoorrkkeerrss _N
              Number of HTTP client handling threads.

@@ -193,7 +227,7 @@ OOPPTTIIOONNSS
              defined by the gguf, but it is desirable for the chat interface to
              be disabled.

-     ----ddbb--ssttaarrttuupp--ssqqll
+     ----ddbb--ssttaarrttuupp--ssqqll _C_O_D_E
              Specifies SQL code that should be executed whenever connecting to
              the SQLite database. The default is the following code, which
              enables the write-ahead log.
@@ -201,14 +235,14 @@ OOPPTTIIOONNSS
            PRAGMA journal_mode=WAL;
            PRAGMA synchronous=NORMAL;

-     ----rreesseerrvvee--ttookkeennss
+     ----rreesseerrvvee--ttookkeennss _N
              Percent of context window to reserve for predicted tokens. When
              the server runs out of context window, old chat messages will be
              forgotten until this percent of the context is empty. The default
              is 15%. If this is specified as a floating point number, e.g.
              0.15, then it'll be multiplied by 100 to get the percent.

-EEXXAAMMPPLLEE
+EEXXAAMMPPLLEESS
      Here's an example of how you might start this server:

            llamafiler -m all-MiniLM-L6-v2.F32.gguf
diff --git a/llamafile/server/main.cpp b/llamafile/server/main.cpp
index c57882b17f..5b0a5b4050 100644
--- a/llamafile/server/main.cpp
+++ b/llamafile/server/main.cpp
@@ -65,7 +65,8 @@ main(int argc, char* argv[])

    // we must disable the llama.cpp logger
    // otherwise pthread_cancel() will cause deadlocks
-    FLAG_log_disable = true;
+    if (!llamafile_has(argv, "--verbose"))
+        FLAG_log_disable = true;

    // load model
    llama_model_params mparams = {
diff --git a/llamafile/server/server.cpp b/llamafile/server/server.cpp
index 67f0288460..29da798051 100644
--- a/llamafile/server/server.cpp
+++ b/llamafile/server/server.cpp
@@ -128,14 +128,19 @@ Server::accept(unsigned* out_ip)

    // set name
    char name[17];
+    int port = ntohs(clientaddr.sin_port);
    unsigned ip = ntohl(clientaddr.sin_addr.s_addr);
-    snprintf(name,
-             sizeof(name),
-             "%hhu.%hhu.%hhu.%hhu",
-             ip >> 24,
-             ip >> 16,
-             ip >> 8,
-             ip);
+    if (ip == 0x7f000001) {
+        snprintf(name, sizeof(name), "%hu", port);
+    } else {
+        snprintf(name,
+                 sizeof(name),
+                 "%hhu.%hhu.%hhu.%hhu",
+                 ip >> 24,
+                 ip >> 16,
+                 ip >> 8,
+                 ip);
+    }
    set_thread_name(name);

    // keep sockets open
diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp
index 304c5a0f24..c57ca2541c 100644
--- a/llamafile/server/slot.cpp
+++ b/llamafile/server/slot.cpp
@@ -79,9 +79,10 @@ Slot::describe_error(int err)
    }
 }

-Slot::Slot(llama_model* model) : model_(model)
+Slot::Slot(int id, llama_model* model) : id_(id), model_(model)
 {
    dll_init(&elem_);
+    last_used_ = time(0);
 }

 Slot::~Slot()
diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h
index 9fe26afb8c..7fdd7bf881 100644
--- a/llamafile/server/slot.h
+++ b/llamafile/server/slot.h
@@ -17,6 +17,7 @@
 #pragma once
 #include <pthread.h>
+#include <time.h>
 #include <string>
 #include <string_view>
 #include <vector>
@@ -49,7 +50,9 @@ struct Slot

    static const char* describe_error(int);

+    int id_;
    Dll elem_;
+    time_t last_used_;
    llama_model* model_;
    clip_ctx* clip_ctx_ = nullptr;
    llama_context* ctx_ = nullptr;
@@ -57,7 +60,7 @@ struct Slot
    std::string system_fingerprint_;

    ~Slot();
-    explicit Slot(llama_model*);
+    Slot(int, llama_model*);
    int ctx_size() const;
    int ctx_used() const;
    bool start();
diff --git a/llamafile/server/slots.cpp b/llamafile/server/slots.cpp
index 1bc275c459..7506f5ec90 100644
--- a/llamafile/server/slots.cpp
+++ b/llamafile/server/slots.cpp
@@ -16,12 +16,17 @@
 // limitations under the License.
#include "slots.h" +#include "llamafile/llamafile.h" +#include "llamafile/macros.h" #include "llamafile/server/atom.h" #include "llamafile/server/log.h" #include "llamafile/server/slot.h" #include "llamafile/server/slot_entry.h" #include "llamafile/vector.h" +#include #include +#include +#include namespace lf { namespace server { @@ -50,7 +55,7 @@ Slots::start(int count) int made = 0; pthread_mutex_lock(&lock_); for (int i = 0; i < count; ++i) { - Slot* slot = new Slot(model_); + Slot* slot = new Slot(i, model_); if (slot->start()) { ++made; slots_.emplace_back(slot); @@ -68,19 +73,52 @@ Slots::start(int count) } Slot* -Slots::take(const std::vector& prefix) +Slots::take(const std::vector& atoms) { pthread_mutex_lock(&lock_); for (;;) { - // find slot with longest matching prefix - // favoring least recently used if multiple ones - int best_cpl = 0; + // find best slot + // iteration order favors lru + time_t now = time(0); Dll* best_slot = nullptr; + double best_score = INT_MIN; for (Dll* e = dll_first(free_slots_); e; e = dll_next(free_slots_, e)) { - int cpl = vector_common_prefix_length(SLOT(e)->history_, prefix); - if (cpl >= best_cpl) { - best_cpl = cpl; + + // least recently used is good + int age = now - SLOT(e)->last_used_; + double decay = + age + exp(FLAG_decay_growth * (age - FLAG_decay_delay)); + + // common prefix length is good + int cpl = vector_common_prefix_length(SLOT(e)->history_, atoms); + + // common suffix length is good + int csl = 0; + int size = SLOT(e)->history_.size(); + for (int i = cpl + 1; i < size; ++i) { + if (size - i > atoms.size() - cpl) + continue; + if (std::equal(SLOT(e)->history_.begin() + i, + SLOT(e)->history_.end(), + atoms.begin() + cpl)) { + csl = size - i; + break; + } + } + + // discarded atoms is bad + int discard; + if (csl) { + discard = 0; + } else { + discard = size - cpl; + } + + // tally up score to determine best + double score = cpl + csl + decay - discard; + if (score >= best_score) { + best_score = score; best_slot = e; } } @@ -89,6 +127,9 @@ Slots::take(const std::vector& prefix) if (best_slot) { dll_remove(&free_slots_, best_slot); pthread_mutex_unlock(&lock_); + SLOG("acquired slot #%d with score %d", + SLOT(best_slot)->id_, + (int)MIN(INT_MAX, best_score)); return SLOT(best_slot); } @@ -101,8 +142,9 @@ Slots::take(const std::vector& prefix) void Slots::give(Slot* slot) { - SLOG("relinquishing slot"); unassert(slot); + SLOG("relinquishing slot #%d", slot->id_); + slot->last_used_ = time(0); pthread_mutex_lock(&lock_); dll_make_first(&free_slots_, &slot->elem_); pthread_cond_signal(&cond_); diff --git a/llamafile/server/v1_chat_completions.cpp b/llamafile/server/v1_chat_completions.cpp index aa7f65ee8e..659815ee62 100644 --- a/llamafile/server/v1_chat_completions.cpp +++ b/llamafile/server/v1_chat_completions.cpp @@ -652,6 +652,9 @@ Client::v1_chat_completions() } } choice["finish_reason"] = finish_reason; + SLOG("predicted %d tokens finished on %s", // + completion_tokens, + finish_reason); // finalize response cleanup_slot(this);