January 2025 Update #1036

Open · wants to merge 6 commits into base: master
6 changes: 2 additions & 4 deletions LLama.Examples/Program.cs
@@ -1,6 +1,5 @@
-using LLama.Native;
+using LLama.Native;
using Spectre.Console;
-using System.Runtime.InteropServices;

AnsiConsole.MarkupLineInterpolated(
$"""
@@ -31,8 +30,7 @@ __ __ ____ __
NativeLibraryConfig
.All
.WithCuda()
-    //.WithAutoDownload() // An experimental feature
-    .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary);
+    .WithVulkan();

// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();
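Note: with the DryRun experiment removed, the example now registers both GPU backends and loads eagerly. A minimal sketch of the resulting startup sequence, assuming the CUDA and Vulkan backend packages are installed:

using LLama.Native;

// Backend preferences must be configured before the first native call.
NativeLibraryConfig
    .All
    .WithCuda()
    .WithVulkan();

// Force the native library to load now rather than on first use.
NativeApi.llama_empty_call();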
4 changes: 1 addition & 3 deletions LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -31,11 +31,9 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)

var @params = new ModelParams(config.ModelPath)
{
-ContextSize = config.ContextSize ?? 2048,
+ContextSize = config.ContextSize,
GpuLayerCount = config.GpuLayerCount ?? 20,
Embeddings = true,
-MainGpu = config.MainGpu,
-SplitMode = config.SplitMode,
PoolingType = LLamaPoolingType.Mean,
};
_weights = LLamaWeights.LoadFromFile(@params);
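With the ?? 2048 fallback gone, an unset ContextSize now defers to the model's own context length. A caller-side sketch, assuming LLamaSharpConfig takes the model path in its constructor (the path is a placeholder):

using LLamaSharp.KernelMemory;

var config = new LLamaSharpConfig("models/example.gguf")
{
    // ContextSize left null: it now passes straight through to
    // ModelParams, so the model's trained context length applies.
    GpuLayerCount = 20,
};
var embedder = new LLamaSharpTextEmbeddingGenerator(config);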
2 changes: 0 additions & 2 deletions LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -33,8 +33,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
{
ContextSize = config.ContextSize ?? 2048,
GpuLayerCount = config.GpuLayerCount ?? 20,
-MainGpu = config.MainGpu,
-SplitMode = config.SplitMode
};
_weights = LLamaWeights.LoadFromFile(parameters);
_context = _weights.CreateContext(parameters);
LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
@@ -1,21 +1,15 @@
using LLama.Common;
using LLamaSharp.KernelMemory;
using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
using Xunit.Abstractions;

namespace LLama.Unittest.KernelMemory
{
-public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+public class LLamaSharpTextEmbeddingGeneratorTests
+    : ITextTokenizerTests, IDisposable
{
private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;

-public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper)
+    : base(testOutputHelper)
{
_embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);

18 changes: 4 additions & 14 deletions LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
@@ -1,25 +1,15 @@
using LLama.Common;
using LLamaSharp.KernelMemory;
using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.Linq;
-using System.Reflection.Emit;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
using Xunit.Abstractions;
-using Xunit.Sdk;
-using static System.Net.Mime.MediaTypeNames;

namespace LLama.Unittest.KernelMemory
{
-public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+public class LlamaSharpTextGeneratorTests
+    : ITextTokenizerTests, IDisposable
{
private readonly LlamaSharpTextGenerator _textGenerator;

-public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper)
+    : base(testOutputHelper)
{
_textGenerator = new LlamaSharpTextGenerator(_lsConfig);

6 changes: 1 addition & 5 deletions LLama.Unittest/SamplingTests.cs
@@ -167,11 +167,7 @@ private static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle co
var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());

chain.AddPenalties(
-    vocabSize: context.VocabCount,
-    eos: context.ModelHandle.Tokens.EOS,
-    newline: context.ModelHandle.Tokens.Newline ?? 0,
-    penaltyCount: 60, repeat: 1, freq: 0, presence: 0,
-    penalizeNewline: false, ignoreEOS: false
+    penaltyCount: 60, repeat: 1, freq: 0, presence: 0
);

if (logit_bias != null) { chain.AddLogitBias(context.VocabCount, logit_bias); }
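The penalties sampler no longer needs vocabulary, EOS, or newline data, and the penalize-newline / ignore-EOS switches are gone, which tracks the upstream llama.cpp sampler rework. A minimal sketch against the new signature (parameter values are illustrative):

using LLama.Native;

var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());

chain.AddPenalties(
    penaltyCount: 64, // how many recent tokens the penalties consider
    repeat: 1.1f,     // repetition penalty; 1 disables it
    freq: 0.0f,       // frequency penalty
    presence: 0.0f    // presence penalty
);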
5 changes: 4 additions & 1 deletion LLama.Web/Common/ModelOptions.cs
@@ -24,7 +24,7 @@ public class ModelOptions
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
-public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+public GPUSplitMode? SplitMode { get; set; }

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
@@ -59,6 +59,9 @@ public class ModelOptions
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();

+/// <inheritdoc />
+public bool CheckTensors { get; }

/// <inheritdoc />
public List<MetadataOverride> MetadataOverrides { get; } = new();

7 changes: 6 additions & 1 deletion LLama/Abstractions/IModelParams.cs
@@ -36,7 +36,7 @@ public interface IModelParams
/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
-GPUSplitMode SplitMode { get; }
+GPUSplitMode? SplitMode { get; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -68,6 +68,11 @@ public interface IModelParams
/// </summary>
bool VocabOnly { get; }

+/// <summary>
+/// Validate model tensor data before loading
+/// </summary>
+bool CheckTensors { get; }

/// <summary>
/// Override specific metadata items in the model
/// </summary>
5 changes: 4 additions & 1 deletion LLama/Common/ModelParams.cs
@@ -19,7 +19,7 @@ public record ModelParams
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
-public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+public GPUSplitMode? SplitMode { get; set; }

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
@@ -54,6 +54,9 @@ public record ModelParams
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();

+/// <inheritdoc />
+public bool CheckTensors { get; }

/// <inheritdoc />
public List<MetadataOverride> MetadataOverrides { get; set; } = new();

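A usage sketch of the updated record: SplitMode stays null unless a caller opts in, so llama.cpp's own default applies, and the get-only CheckTensors reads false by default (the model path is a placeholder):

using LLama;
using LLama.Common;
using LLama.Native;

var parameters = new ModelParams("models/example.gguf")
{
    GpuLayerCount = 20,
    SplitMode = GPUSplitMode.Layer, // omit to keep the native default
};
using var weights = LLamaWeights.LoadFromFile(parameters);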
7 changes: 5 additions & 2 deletions LLama/Extensions/IModelParamsExtensions.cs
@@ -1,4 +1,4 @@
-using System.IO;
+using System.IO;
using System;
using System.Text;
using LLama.Abstractions;
@@ -31,11 +31,14 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam
result = LLamaModelParams.Default();

result.main_gpu = @params.MainGpu;
-result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
+if (@params.SplitMode.HasValue)
+    result.split_mode = @params.SplitMode.Value;

result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
result.vocab_only = @params.VocabOnly;
+result.check_tensors = @params.CheckTensors;

unsafe
{
2 changes: 1 addition & 1 deletion LLama/Extensions/LLamaExecutorExtensions.cs
@@ -147,7 +147,7 @@ private string CreatePrompt(IList<ChatMessage> messages)
PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS,
PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline,
RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty,
-RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount,
+PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount,
Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar,
MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep,
MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP,
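The rename also changes the lookup key that callers pass through AdditionalProperties. A sketch, assuming the Microsoft.Extensions.AI ChatOptions type these extensions consume:

using LLama.Sampling;          // DefaultSamplingPipeline
using Microsoft.Extensions.AI; // ChatOptions, AdditionalPropertiesDictionary

var options = new ChatOptions
{
    AdditionalProperties = new AdditionalPropertiesDictionary
    {
        // Key renamed in this PR: previously "RepeatPenaltyCount".
        [nameof(DefaultSamplingPipeline.PenaltyCount)] = 64,
    },
};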
3 changes: 0 additions & 3 deletions LLama/LLamaQuantizer.cs
@@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype)
case LLamaFtype.MOSTLY_IQ3_S:
case LLamaFtype.MOSTLY_IQ3_M:

-case LLamaFtype.MOSTLY_Q4_0_4_4:
-case LLamaFtype.MOSTLY_Q4_0_4_8:
-case LLamaFtype.MOSTLY_Q4_0_8_8:
return true;

case LLamaFtype.GUESSED:
62 changes: 61 additions & 1 deletion LLama/LLamaSharp.Runtime.targets
@@ -12,6 +12,15 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/llama.dll</Link>
@@ -20,22 +29,49 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/llama.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/llama.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/cu11.7.1/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/cuda11/llama.dll</Link>
@@ -118,6 +154,14 @@
<Link>runtimes/linux-x64/native/vulkan/libggml.so</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-arm64/libggml-base.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-arm64/native/libggml-base.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-arm64/libggml-cpu.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-arm64/native/libggml-cpu.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-arm64/libggml.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-arm64/native/libggml.dylib</Link>
@@ -134,7 +178,15 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-arm64/native/ggml-metal.metal</Link>
</None>


<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64/libggml-base.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/libggml-base.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64/libggml-cpu.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/libggml-cpu.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64/libggml.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/libggml.dylib</Link>
@@ -148,6 +200,14 @@
<Link>runtimes/osx-x64/native/libllava_shared.dylib</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64-rosetta2/libggml-base.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/rosetta2/libggml-base.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64-rosetta2/libggml-cpu.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/rosetta2/libggml-cpu.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64-rosetta2/libggml.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/rosetta2/libggml.dylib</Link>
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
@@ -56,7 +56,7 @@
</ItemGroup>

<PropertyGroup>
-<BinaryReleaseId>958367bf530d943a90</BinaryReleaseId>
+<BinaryReleaseId>0827b2c1da</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
12 changes: 6 additions & 6 deletions LLama/LLamaStatelessExecutor.cs
@@ -23,14 +23,14 @@ public class StatelessExecutor
private readonly ILogger? _logger;
private readonly LLamaBatch _batch;

-// LLava Section
+/// <inheritdoc />
public bool IsMultiModal => false;

/// <inheritdoc />
-public LLavaWeights? ClipModel { get; }
+public LLavaWeights? ClipModel => default;

/// <inheritdoc />
-public List<byte[]> Images { get; set; }
+public List<byte[]> Images { get; }

/// <summary>
/// The context used by the executor when running the inference.
@@ -68,7 +68,7 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
Context = context;

// Reset the sampling pipeline (if there is one)
-inferenceParams?.SamplingPipeline?.Reset();
+inferenceParams?.SamplingPipeline.Reset();

// Sanity check inference params
inferenceParams ??= new InferenceParams();
@@ -134,8 +134,8 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
var n_left = n_past - tokensKeep;
var n_discard = n_left / 2;

-NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensKeep , tokensKeep + n_discard);
-NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensKeep + n_discard, n_past, -n_discard);
+NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard);
+NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);

n_past -= n_discard;
}
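The context-shift arithmetic is unchanged; only the (LLamaSeqId)0 casts became LLamaSeqId.Zero. A worked example of the discard computation with illustrative numbers:

// Suppose 4096 tokens are in context and the first 128 must be kept.
int tokensKeep = 128;
int n_past = 4096;

int n_left = n_past - tokensKeep; // 3968 tokens eligible for eviction
int n_discard = n_left / 2;       // 1984 tokens to drop

// llama_kv_cache_seq_rm removes positions [128, 2112) from sequence zero,
// then llama_kv_cache_seq_add shifts [2112, 4096) left by 1984 positions.
n_past -= n_discard;              // 2112 tokens remain in context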
2 changes: 1 addition & 1 deletion LLama/Native/GPUSplitMode.cs
@@ -17,7 +17,7 @@ public enum GPUSplitMode
Layer = 1,

/// <summary>
-/// split rows across GPUs
+/// split layers and KV across GPUs, use tensor parallelism if supported
/// </summary>
Row = 2,
}