January 2025 Update #1036

Open · wants to merge 6 commits into base: master
6 changes: 2 additions & 4 deletions LLama.Examples/Program.cs
@@ -1,6 +1,5 @@
-using LLama.Native;
+using LLama.Native;
using Spectre.Console;
-using System.Runtime.InteropServices;

AnsiConsole.MarkupLineInterpolated(
$"""
@@ -31,8 +30,7 @@ __ __ ____ __
NativeLibraryConfig
.All
.WithCuda()
-    //.WithAutoDownload() // An experimental feature
-    .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary);
+    .WithVulkan();

// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();
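Note: with the DryRun experiment removed, the example now registers both GPU backends and loads eagerly. A minimal sketch of the resulting startup sequence, assuming the CUDA and Vulkan backend packages are installed:

using LLama.Native;

// Backend preferences must be configured before the first native call.
NativeLibraryConfig
    .All
    .WithCuda()
    .WithVulkan();

// Force the native library to load now rather than on first use.
NativeApi.llama_empty_call();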
4 changes: 1 addition & 3 deletions LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -31,11 +31,9 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)

var @params = new ModelParams(config.ModelPath)
{
-ContextSize = config.ContextSize ?? 2048,
+ContextSize = config.ContextSize,
GpuLayerCount = config.GpuLayerCount ?? 20,
Embeddings = true,
-MainGpu = config.MainGpu,
-SplitMode = config.SplitMode,
PoolingType = LLamaPoolingType.Mean,
};
_weights = LLamaWeights.LoadFromFile(@params);
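With the ?? 2048 fallback gone, an unset ContextSize now defers to the model's own context length. A caller-side sketch, assuming LLamaSharpConfig takes the model path in its constructor (the path is a placeholder):

using LLamaSharp.KernelMemory;

var config = new LLamaSharpConfig("models/example.gguf")
{
    // ContextSize left null: it now passes straight through to
    // ModelParams, so the model's trained context length applies.
    GpuLayerCount = 20,
};
var embedder = new LLamaSharpTextEmbeddingGenerator(config);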
2 changes: 0 additions & 2 deletions LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -33,8 +33,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
{
ContextSize = config.ContextSize ?? 2048,
GpuLayerCount = config.GpuLayerCount ?? 20,
-MainGpu = config.MainGpu,
-SplitMode = config.SplitMode
};
_weights = LLamaWeights.LoadFromFile(parameters);
_context = _weights.CreateContext(parameters);
LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
@@ -1,21 +1,15 @@
using LLama.Common;
using LLamaSharp.KernelMemory;
using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
using Xunit.Abstractions;

namespace LLama.Unittest.KernelMemory
{
-public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+public class LLamaSharpTextEmbeddingGeneratorTests
+    : ITextTokenizerTests, IDisposable
{
private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;

-public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper)
+    : base(testOutputHelper)
{
_embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);

18 changes: 4 additions & 14 deletions LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
@@ -1,25 +1,15 @@
using LLama.Common;
using LLamaSharp.KernelMemory;
using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.Linq;
-using System.Reflection.Emit;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
using Xunit.Abstractions;
-using Xunit.Sdk;
-using static System.Net.Mime.MediaTypeNames;

namespace LLama.Unittest.KernelMemory
{
-public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+public class LlamaSharpTextGeneratorTests
+    : ITextTokenizerTests, IDisposable
{
private readonly LlamaSharpTextGenerator _textGenerator;

-public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper)
+    : base(testOutputHelper)
{
_textGenerator = new LlamaSharpTextGenerator(_lsConfig);

6 changes: 1 addition & 5 deletions LLama.Unittest/SamplingTests.cs
@@ -167,11 +167,7 @@ private static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle co
var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());

chain.AddPenalties(
-    vocabSize: context.VocabCount,
-    eos: context.ModelHandle.Tokens.EOS,
-    newline: context.ModelHandle.Tokens.Newline ?? 0,
-    penaltyCount: 60, repeat: 1, freq: 0, presence: 0,
-    penalizeNewline: false, ignoreEOS: false
+    penaltyCount: 60, repeat: 1, freq: 0, presence: 0
);

if (logit_bias != null) { chain.AddLogitBias(context.VocabCount, logit_bias); }
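The penalties sampler no longer needs vocabulary, EOS, or newline data, and the penalize-newline / ignore-EOS switches are gone, which tracks the upstream llama.cpp sampler rework. A minimal sketch against the new signature (parameter values are illustrative):

using LLama.Native;

var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());

chain.AddPenalties(
    penaltyCount: 64, // how many recent tokens the penalties consider
    repeat: 1.1f,     // repetition penalty; 1 disables it
    freq: 0.0f,       // frequency penalty
    presence: 0.0f    // presence penalty
);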
5 changes: 4 additions & 1 deletion LLama.Web/Common/ModelOptions.cs
@@ -24,7 +24,7 @@ public class ModelOptions
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
-public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+public GPUSplitMode? SplitMode { get; set; }

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
@@ -59,6 +59,9 @@ public class ModelOptions
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();

+/// <inheritdoc />
+public bool CheckTensors { get; }

/// <inheritdoc />
public List<MetadataOverride> MetadataOverrides { get; } = new();

7 changes: 6 additions & 1 deletion LLama/Abstractions/IModelParams.cs
@@ -36,7 +36,7 @@ public interface IModelParams
/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
-GPUSplitMode SplitMode { get; }
+GPUSplitMode? SplitMode { get; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -68,6 +68,11 @@ public interface IModelParams
/// </summary>
bool VocabOnly { get; }

+/// <summary>
+/// Validate model tensor data before loading
+/// </summary>
+bool CheckTensors { get; }

/// <summary>
/// Override specific metadata items in the model
/// </summary>
5 changes: 4 additions & 1 deletion LLama/Common/ModelParams.cs
@@ -19,7 +19,7 @@ public record ModelParams
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
-public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+public GPUSplitMode? SplitMode { get; set; }

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
@@ -54,6 +54,9 @@ public record ModelParams
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();

+/// <inheritdoc />
+public bool CheckTensors { get; }

/// <inheritdoc />
public List<MetadataOverride> MetadataOverrides { get; set; } = new();

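A usage sketch of the updated record: SplitMode stays null unless a caller opts in, so llama.cpp's own default applies, and the get-only CheckTensors reads false by default (the model path is a placeholder):

using LLama;
using LLama.Common;
using LLama.Native;

var parameters = new ModelParams("models/example.gguf")
{
    GpuLayerCount = 20,
    SplitMode = GPUSplitMode.Layer, // omit to keep the native default
};
using var weights = LLamaWeights.LoadFromFile(parameters);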
7 changes: 5 additions & 2 deletions LLama/Extensions/IModelParamsExtensions.cs
@@ -1,4 +1,4 @@
-using System.IO;
+using System.IO;
using System;
using System.Text;
using LLama.Abstractions;
@@ -31,11 +31,14 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam
result = LLamaModelParams.Default();

result.main_gpu = @params.MainGpu;
-result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
+if (@params.SplitMode.HasValue)
+    result.split_mode = @params.SplitMode.Value;

result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
result.vocab_only = @params.VocabOnly;
+result.check_tensors = @params.CheckTensors;

unsafe
{
2 changes: 1 addition & 1 deletion LLama/Extensions/LLamaExecutorExtensions.cs
@@ -147,7 +147,7 @@ private string CreatePrompt(IList<ChatMessage> messages)
PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS,
PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline,
RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty,
-RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount,
+PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount,
Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar,
MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep,
MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP,
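The rename also changes the lookup key that callers pass through AdditionalProperties. A sketch, assuming the Microsoft.Extensions.AI ChatOptions type these extensions consume:

using LLama.Sampling;          // DefaultSamplingPipeline
using Microsoft.Extensions.AI; // ChatOptions, AdditionalPropertiesDictionary

var options = new ChatOptions
{
    AdditionalProperties = new AdditionalPropertiesDictionary
    {
        // Key renamed in this PR: previously "RepeatPenaltyCount".
        [nameof(DefaultSamplingPipeline.PenaltyCount)] = 64,
    },
};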
3 changes: 0 additions & 3 deletions LLama/LLamaQuantizer.cs
@@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype)
case LLamaFtype.MOSTLY_IQ3_S:
case LLamaFtype.MOSTLY_IQ3_M:

-case LLamaFtype.MOSTLY_Q4_0_4_4:
-case LLamaFtype.MOSTLY_Q4_0_4_8:
-case LLamaFtype.MOSTLY_Q4_0_8_8:
return true;

case LLamaFtype.GUESSED:
62 changes: 61 additions & 1 deletion LLama/LLamaSharp.Runtime.targets
@@ -12,6 +12,15 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/llama.dll</Link>
@@ -20,22 +29,49 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/llama.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/llama.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/cu11.7.1/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/cuda11/llama.dll</Link>
@@ -118,6 +154,14 @@
<Link>runtimes/linux-x64/native/vulkan/libggml.so</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-arm64/libggml-base.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-arm64/native/libggml-base.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-arm64/libggml-cpu.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-arm64/native/libggml-cpu.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-arm64/libggml.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-arm64/native/libggml.dylib</Link>
@@ -134,7 +178,15 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-arm64/native/ggml-metal.metal</Link>
</None>


<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64/libggml-base.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/libggml-base.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64/libggml-cpu.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/libggml-cpu.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64/libggml.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/libggml.dylib</Link>
@@ -148,6 +200,14 @@
<Link>runtimes/osx-x64/native/libllava_shared.dylib</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64-rosetta2/libggml-base.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/rosetta2/libggml-base.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64-rosetta2/libggml-cpu.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/rosetta2/libggml-cpu.dylib</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/osx-x64-rosetta2/libggml.dylib">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/osx-x64/native/rosetta2/libggml.dylib</Link>
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
@@ -56,7 +56,7 @@
</ItemGroup>

<PropertyGroup>
-<BinaryReleaseId>958367bf530d943a90</BinaryReleaseId>
+<BinaryReleaseId>0827b2c1da</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
12 changes: 6 additions & 6 deletions LLama/LLamaStatelessExecutor.cs
@@ -23,14 +23,14 @@ public class StatelessExecutor
private readonly ILogger? _logger;
private readonly LLamaBatch _batch;

-// LLava Section
+/// <inheritdoc />
public bool IsMultiModal => false;

/// <inheritdoc />
-public LLavaWeights? ClipModel { get; }
+public LLavaWeights? ClipModel => default;

/// <inheritdoc />
-public List<byte[]> Images { get; set; }
+public List<byte[]> Images { get; }

/// <summary>
/// The context used by the executor when running the inference.
@@ -68,7 +68,7 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
Context = context;

// Reset the sampling pipeline (if there is one)
-inferenceParams?.SamplingPipeline?.Reset();
+inferenceParams?.SamplingPipeline.Reset();

// Sanity check inference params
inferenceParams ??= new InferenceParams();
@@ -134,8 +134,8 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
var n_left = n_past - tokensKeep;
var n_discard = n_left / 2;

-NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensKeep , tokensKeep + n_discard);
-NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensKeep + n_discard, n_past, -n_discard);
+NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard);
+NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);

n_past -= n_discard;
}
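The context-shift arithmetic is unchanged; only the (LLamaSeqId)0 casts became LLamaSeqId.Zero. A worked example of the discard computation with illustrative numbers:

// Suppose 4096 tokens are in context and the first 128 must be kept.
int tokensKeep = 128;
int n_past = 4096;

int n_left = n_past - tokensKeep; // 3968 tokens eligible for eviction
int n_discard = n_left / 2;       // 1984 tokens to drop

// llama_kv_cache_seq_rm removes positions [128, 2112) from sequence zero,
// then llama_kv_cache_seq_add shifts [2112, 4096) left by 1984 positions.
n_past -= n_discard;              // 2112 tokens remain in context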
2 changes: 1 addition & 1 deletion LLama/Native/GPUSplitMode.cs
@@ -17,7 +17,7 @@ public enum GPUSplitMode
Layer = 1,

/// <summary>
-/// split rows across GPUs
+/// split layers and KV across GPUs, use tensor parallelism if supported
/// </summary>
Row = 2,
}