Avoid activating GPU when we can't use the GPU

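Loading the GPU module imposes about a second of latency. If the user isn't passing a GPU-layers flag (`-ngl`, `--gpu-layers`, or `--n-gpu-layers`; e.g. `-ngl 35`) and the host isn't Apple Silicon, then don't bother running any of the GPU-related initialization routines. A non-positive layer count (`-ngl 0` or a negative value) now disables the GPU as well.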
Mozilla-Ocho · Jan 4, 2024 · 01b9aaf · 01b9aaf
1 parent 50bdf69
commit 01b9aaf
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 1 deletion.
diff --git a/llama.cpp/common.cpp b/llama.cpp/common.cpp
@@ -562,7 +562,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_gpu_layers = std::stoi(argv[i]);
-            if (params.n_gpu_layers == 0) {
+            if (params.n_gpu_layers <= 0) {
                 FLAG_gpu = LLAMAFILE_GPU_DISABLE;
             }
         } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {

diff --git a/llama.cpp/main/main.cpp b/llama.cpp/main/main.cpp
@@ -124,6 +124,13 @@ int main(int argc, char ** argv) {
         __builtin_unreachable();
     }
 
+    if (!IsXnuSilicon() &&
+        (!has_argument(argc, argv, "-ngl") &&
+         !has_argument(argc, argv, "--gpu-layers") &&
+         !has_argument(argc, argv, "--n-gpu-layers"))) {
+        FLAG_gpu = LLAMAFILE_GPU_DISABLE;
+    }
+
     if (!has_argument(argc, argv, "--cli") &&
         (has_argument(argc, argv, "--server") ||
          (!has_argument(argc, argv, "-p") &&