diff --git a/.github/_typos.toml b/.github/_typos.toml index fb576b499..d1f4f98b1 100644 --- a/.github/_typos.toml +++ b/.github/_typos.toml @@ -17,4 +17,6 @@ extend-exclude = [ [default.extend-words] # Used in a comment in SafeLLamaSamplerHandle.cs, as a prefix of "hello" -teh = "hel" \ No newline at end of file +teh = "hel" +# ot is the shorthand version of llama.cpp's override-tensor parameter +ot = "ot" diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index 5b84f6753..cffb1bb25 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -17,7 +17,7 @@ concurrency: env: # Compiler defines common to all platforms - COMMON_DEFINE: -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=OFF -DBUILD_SHARED_LIBS=ON + COMMON_DEFINE: -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=OFF -DBUILD_SHARED_LIBS=ON -DLLAMA_CURL=OFF jobs: compile-linux: @@ -28,13 +28,25 @@ jobs: include: - build: 'noavx' defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' + os: ubuntu-22.04 + arch: x64 - build: 'avx2' defines: '' + os: ubuntu-22.04 + arch: x64 - build: 'avx' defines: '-DGGML_AVX2=OFF' + os: ubuntu-22.04 + arch: x64 - build: 'avx512' defines: '-DGGML_AVX512=ON' - runs-on: ubuntu-20.04 + os: ubuntu-22.04 + arch: x64 + - build: 'aarch64' + defines: '-DGGML_NATIVE=OFF -DGGML_CPU_AARCH64=ON -DGGML_CPU_ARM_ARCH=armv8-a' + os: ubuntu-24.04-arm + arch: arm64 + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 with: @@ -52,28 +64,28 @@ jobs: - uses: actions/upload-artifact@v4 with: path: ./build/bin/libllama.so - name: llama-bin-linux-${{ matrix.build }}-x64.so + name: llama-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so if-no-files-found: error - uses: actions/upload-artifact@v4 with: path: ./build/bin/libggml.so - name: ggml-bin-linux-${{ matrix.build }}-x64.so + name: ggml-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so if-no-files-found: error - uses: actions/upload-artifact@v4 with: path: ./build/bin/libggml-base.so - name: ggml-base-bin-linux-${{ matrix.build }}-x64.so + name: ggml-base-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so if-no-files-found: error - uses: actions/upload-artifact@v4 with: path: ./build/bin/libggml-cpu.so - name: ggml-cpu-bin-linux-${{ matrix.build }}-x64.so + name: ggml-cpu-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so if-no-files-found: error - name: Upload Llava uses: actions/upload-artifact@v4 with: path: ./build/bin/libllava_shared.so - name: llava-bin-linux-${{ matrix.build }}-x64.so + name: llava-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so if-no-files-found: error compile-musl: @@ -90,7 +102,7 @@ jobs: defines: '-DGGML_AVX2=OFF' - build: 'avx512' defines: '-DGGML_AVX512=ON' - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 container: image: alpine:latest steps: @@ -334,7 +346,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019] + os: [ubuntu-22.04, windows-2019] cuda: ['12.2.0', '11.7.1'] runs-on: ${{ matrix.os }} steps: @@ -409,35 +421,35 @@ jobs: if-no-files-found: error - name: Upload artifacts (Linux) - if: ${{ matrix.os == 'ubuntu-20.04' }} + if: ${{ matrix.os == 'ubuntu-22.04' }} uses: actions/upload-artifact@v4 with: path: ./build/bin/libllama.so name: llama-bin-linux-cublas-cu${{ matrix.cuda }}-x64.so if-no-files-found: error - name: Upload artifacts ggml (Linux) - if: ${{ matrix.os == 'ubuntu-20.04' }} + if: ${{ matrix.os == 'ubuntu-22.04' }} uses: actions/upload-artifact@v4 
with: path: ./build/bin/libggml.so name: ggml-bin-linux-cublas-cu${{ matrix.cuda }}-x64.so if-no-files-found: error - name: Upload artifacts ggml-base (Linux) - if: ${{ matrix.os == 'ubuntu-20.04' }} + if: ${{ matrix.os == 'ubuntu-22.04' }} uses: actions/upload-artifact@v4 with: path: ./build/bin/libggml-base.so name: ggml-base-bin-linux-cublas-cu${{ matrix.cuda }}-x64.so if-no-files-found: error - name: Upload artifacts ggml-cuda (Linux) - if: ${{ matrix.os == 'ubuntu-20.04' }} + if: ${{ matrix.os == 'ubuntu-22.04' }} uses: actions/upload-artifact@v4 with: path: ./build/bin/libggml-cuda.so name: ggml-cuda-bin-linux-cublas-cu${{ matrix.cuda }}-x64.so if-no-files-found: error - name: Upload llava artifacts (Linux) - if: ${{ matrix.os == 'ubuntu-20.04' }} + if: ${{ matrix.os == 'ubuntu-22.04' }} uses: actions/upload-artifact@v4 with: path: ./build/bin/libllava_shared.so @@ -527,20 +539,16 @@ jobs: if-no-files-found: error compile-android: - # Disable android build - if: false - + name: Compile (Android) strategy: fail-fast: true matrix: include: - - build: 'x86' - defines: '-DANDROID_ABI=x86' - build: 'x86_64' - defines: '-DANDROID_ABI=x86_64' + defines: '-DANDROID_ABI=x86_64 -DCMAKE_C_FLAGS=-march=x86-64 -DCMAKE_CXX_FLAGS=-march=x86-64' - build: 'arm64-v8a' - defines: '-DANDROID_ABI=arm64-v8a' - runs-on: ubuntu-20.04 + defines: '-DANDROID_ABI=arm64-v8a -DCMAKE_C_FLAGS=-march=armv8.7a -DCMAKE_C_FLAGS=-march=armv8.7a' + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 with: @@ -555,28 +563,39 @@ jobs: - name: Build id: cmake_build env: - CMAKE_FLAGS: '-DCMAKE_TOOLCHAIN_FILE=${{ steps.setup-ndk.outputs.ndk-path }}/build/cmake/android.toolchain.cmake -DANDROID_PLATFORM=android-23' + CMAKE_FLAGS: '-DCMAKE_TOOLCHAIN_FILE=${{ steps.setup-ndk.outputs.ndk-path }}/build/cmake/android.toolchain.cmake -DANDROID_PLATFORM=android-23 -DGGML_OPENMP=OFF -DGGML_LLAMAFILE=OFF' run: | - mkdir build - cd build - cmake .. ${{ env.COMMON_DEFINE }} ${{ env.CMAKE_FLAGS }} ${{ matrix.defines }} - cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - cd .. 
- ls -R + # export-lora not supported on 32 bit machines hence breaks x86 build + sed -i '/add_subdirectory(export-lora)/d' examples/CMakeLists.txt # remove export-lora from examples + cmake ${{ env.COMMON_DEFINE }} ${{ env.CMAKE_FLAGS }} ${{ matrix.defines }} -B build + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Upload Llama uses: actions/upload-artifact@v4 with: - path: ./build/src/libllama.so + path: ./build/bin/libllama.so name: llama-bin-android-${{ matrix.build }}.so - - uses: actions/upload-artifact@v4 + - name: Upload GGML + uses: actions/upload-artifact@v4 with: - path: ./build/ggml/src/libggml.so + path: ./build/bin/libggml.so name: ggml-bin-android-${{ matrix.build }}.so if-no-files-found: error + - name: Upload GGML Base + uses: actions/upload-artifact@v4 + with: + path: ./build/bin/libggml-base.so + name: ggml-base-bin-android-${{ matrix.build }}.so + if-no-files-found: error + - name: Upload GGML CPU + uses: actions/upload-artifact@v4 + with: + path: ./build/bin/libggml-cpu.so + name: ggml-cpu-bin-android-${{ matrix.build }}.so + if-no-files-found: error - name: Upload Llava uses: actions/upload-artifact@v4 with: - path: ./build/examples/llava/libllava_shared.so + path: ./build/bin/libllava_shared.so name: llava-bin-android-${{ matrix.build }}.so build-deps: @@ -601,7 +620,7 @@ jobs: - name: Rearrange Files run: | # Make all directories at once - mkdir --parents deps/{noavx,avx,avx2,avx512,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64} + mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64} # Linux cp artifacts/ggml-bin-linux-noavx-x64.so/libggml.so deps/noavx/libggml.so @@ -628,6 +647,13 @@ jobs: cp artifacts/llama-bin-linux-avx512-x64.so/libllama.so deps/avx512/libllama.so cp artifacts/llava-bin-linux-avx512-x64.so/libllava_shared.so deps/avx512/libllava_shared.so + # Arm64 + cp artifacts/ggml-bin-linux-aarch64-arm64.so/libggml.so deps/linux-arm64/libggml.so + cp artifacts/ggml-base-bin-linux-aarch64-arm64.so/libggml-base.so deps/linux-arm64/libggml-base.so + cp artifacts/ggml-cpu-bin-linux-aarch64-arm64.so/libggml-cpu.so deps/linux-arm64/libggml-cpu.so + cp artifacts/llama-bin-linux-aarch64-arm64.so/libllama.so deps/linux-arm64/libllama.so + cp artifacts/llava-bin-linux-aarch64-arm64.so/libllava_shared.so deps/linux-arm64/libllava_shared.so + # Musl cp artifacts/ggml-bin-musl-noavx-x64.so/libggml.so deps/musl-noavx/libggml.so cp artifacts/ggml-base-bin-musl-noavx-x64.so/libggml-base.so deps/musl-noavx/libggml-base.so @@ -703,17 +729,17 @@ jobs: cp artifacts/llava-bin-osx-x64-rosetta2.dylib/libllava_shared.dylib deps/osx-x64-rosetta2/libllava_shared.dylib # Android - #cp artifacts/ggml-bin-android-arm64-v8a.so/libggml.so deps/android-arm64-v8a/libggml.so - #cp artifacts/llama-bin-android-arm64-v8a.so/libllama.so deps/android-arm64-v8a/libllama.so - #cp artifacts/llava-bin-android-arm64-v8a.so/libllava_shared.so deps/android-arm64-v8a/libllava_shared.so - - #cp artifacts/ggml-bin-android-x86.so/libggml.so deps/android-x86/libggml.so - #cp artifacts/llama-bin-android-x86.so/libllama.so deps/android-x86/libllama.so - #cp artifacts/llava-bin-android-x86.so/libllava_shared.so deps/android-x86/libllava_shared.so - - #cp artifacts/ggml-bin-android-x86_64.so/libggml.so deps/android-x86_64/libggml.so - 
#cp artifacts/llama-bin-android-x86_64.so/libllama.so deps/android-x86_64/libllama.so - #cp artifacts/llava-bin-android-x86_64.so/libllava_shared.so deps/android-x86_64/libllava_shared.so + cp artifacts/ggml-bin-android-arm64-v8a.so/libggml.so deps/android-arm64-v8a/libggml.so + cp artifacts/ggml-base-bin-android-arm64-v8a.so/libggml-base.so deps/android-arm64-v8a/libggml-base.so + cp artifacts/ggml-cpu-bin-android-arm64-v8a.so/libggml-cpu.so deps/android-arm64-v8a/libggml-cpu.so + cp artifacts/llama-bin-android-arm64-v8a.so/libllama.so deps/android-arm64-v8a/libllama.so + cp artifacts/llava-bin-android-arm64-v8a.so/libllava_shared.so deps/android-arm64-v8a/libllava_shared.so + + cp artifacts/ggml-bin-android-x86_64.so/libggml.so deps/android-x86_64/libggml.so + cp artifacts/ggml-base-bin-android-x86_64.so/libggml-base.so deps/android-x86_64/libggml-base.so + cp artifacts/ggml-cpu-bin-android-x86_64.so/libggml-cpu.so deps/android-x86_64/libggml-cpu.so + cp artifacts/llama-bin-android-x86_64.so/libllama.so deps/android-x86_64/libllama.so + cp artifacts/llava-bin-android-x86_64.so/libllava_shared.so deps/android-x86_64/libllava_shared.so # Windows CUDA cp artifacts/ggml-bin-win-cublas-cu11.7.1-x64.dll/ggml.dll deps/cu11.7.1/ggml.dll diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8aa198549..a5e6eb0d4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -38,6 +38,15 @@ jobs: with: dotnet-version: | 8.0.x + - name: Install Mobile Workloads + if: ${{ contains(runner.os, 'windows') }} + run: | + dotnet workload install android --ignore-failed-sources + dotnet workload install maui --ignore-failed-sources + - name: Remove Mobile Project + if: ${{ !contains(runner.os, 'windows') }} + run: | + dotnet sln LLamaSharp.sln remove Llama.Mobile - name: Cache Packages uses: actions/cache@v4 with: diff --git a/.github/workflows/stale_issues.yml b/.github/workflows/stale_issues.yml new file mode 100644 index 000000000..83e9b0f0a --- /dev/null +++ b/.github/workflows/stale_issues.yml @@ -0,0 +1,20 @@ +name: Close stale issues + +on: + schedule: + - cron: '0 0 * * *' + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v9 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-issue-message: 'This issue has been automatically marked as stale due to inactivity. If no further activity occurs, it will be closed in 7 days.' + stale-pr-message: 'This pull request has been automatically marked as stale due to inactivity. If no further activity occurs, it will be closed in 7 days.' + days-before-stale: 60 + days-before-close: 7 + stale-issue-label: 'stale' + exempt-issue-labels: 'do not close' + operations-per-run: 30 diff --git a/.gitignore b/.gitignore index 056ba6163..206b0dac1 100644 --- a/.gitignore +++ b/.gitignore @@ -337,7 +337,6 @@ test/TensorFlowNET.Examples/mnist # training model resources .resources /redist -*.xml *.xsd # docs diff --git a/LLama.Examples/Examples/KernelMemory.cs b/LLama.Examples/Examples/KernelMemory.cs index b538ce114..37e77d584 100644 --- a/LLama.Examples/Examples/KernelMemory.cs +++ b/LLama.Examples/Examples/KernelMemory.cs @@ -46,7 +46,7 @@ and answer questions about them in an interactive chat prompt. 
// Ask a predefined question Console.ForegroundColor = ConsoleColor.Green; - string question1 = "What formats does KM support"; + string question1 = "What is Kernel Memory"; Console.WriteLine($"Question: {question1}"); await AnswerQuestion(memory, question1); diff --git a/LLama.Examples/Examples/KernelMemorySaveAndLoad.cs b/LLama.Examples/Examples/KernelMemorySaveAndLoad.cs index ccf9a5b67..b953ccff3 100644 --- a/LLama.Examples/Examples/KernelMemorySaveAndLoad.cs +++ b/LLama.Examples/Examples/KernelMemorySaveAndLoad.cs @@ -54,7 +54,7 @@ Press ENTER to proceed... await IngestDocuments(memory); } - await AskSingleQuestion(memory, "What formats does KM support?"); + await AskSingleQuestion(memory, "What is Kernel Memory"); await StartUserChatSession(memory); } diff --git a/LLama.Examples/LLama.Examples.csproj b/LLama.Examples/LLama.Examples.csproj index ed70f6e3c..1dd301125 100644 --- a/LLama.Examples/LLama.Examples.csproj +++ b/LLama.Examples/LLama.Examples.csproj @@ -15,15 +15,15 @@ - - - + + + - - - - + + + + diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj index 7cd0755a8..8643edc86 100644 --- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj +++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj @@ -4,7 +4,7 @@ net8.0 enable enable - 0.22.0 + 0.24.0 Xbotter SciSharp STACK true @@ -17,7 +17,7 @@ The integration of LLamaSharp and Microsoft kernel-memory. It could make it easy to support document search for LLamaSharp model inference. - v0.21.0 released with v0.21.0 of LLamaSharp. + v0.24.0 released with v0.24.0 of LLamaSharp. MIT packages @@ -27,7 +27,7 @@ - + diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index 041a2cf88..862d41801 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -31,9 +31,11 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config) var @params = new ModelParams(config.ModelPath) { - ContextSize = config.ContextSize, - GpuLayerCount = config.GpuLayerCount ?? 20, - + ContextSize = config?.ContextSize ?? 2048, + GpuLayerCount = config?.GpuLayerCount ?? 20, + //Embeddings = true, + MainGpu = config?.MainGpu ?? 0, + SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.None, PoolingType = LLamaPoolingType.Mean, }; @@ -54,11 +56,11 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config, LLamaWeights we var @params = new ModelParams(config.ModelPath) { - ContextSize = config.ContextSize ?? 2048, - GpuLayerCount = config.GpuLayerCount ?? 20, - Embeddings = true, - MainGpu = config.MainGpu, - SplitMode = config.SplitMode, + ContextSize = config?.ContextSize ?? 2048, + GpuLayerCount = config?.GpuLayerCount ?? 20, + //Embeddings = true, + MainGpu = config?.MainGpu ?? 0, + SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.None, PoolingType = LLamaPoolingType.Mean, }; _weights = weights; diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index db7f74449..41acce86f 100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -32,8 +32,10 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config) { var parameters = new ModelParams(config.ModelPath) { - ContextSize = config.ContextSize ?? 2048, - GpuLayerCount = config.GpuLayerCount ?? 20, + ContextSize = config?.ContextSize ?? 
2048, + GpuLayerCount = config?.GpuLayerCount ?? 20, + MainGpu = config?.MainGpu ?? 0, + SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.None, }; _weights = LLamaWeights.LoadFromFile(parameters); _context = _weights.CreateContext(parameters); diff --git a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj index ae3c2ade4..a4493531d 100644 --- a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj +++ b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj @@ -10,7 +10,7 @@ enable enable - 0.22.0 + 0.24.0 Tim Miller, Xbotter SciSharp STACK true @@ -23,8 +23,8 @@ The integration of LLamaSharp and Microsoft semantic-kernel. - v0.21.0 released with v0.21.0 of LLamaSharp. - + v0.24.0 released with v0.24.0 of LLamaSharp. + MIT packages AnyCPU;x64;Arm64 @@ -34,7 +34,7 @@ - + diff --git a/LLama.Unittest/Constants.cs b/LLama.Unittest/Constants.cs index a30951750..d501b189b 100644 --- a/LLama.Unittest/Constants.cs +++ b/LLama.Unittest/Constants.cs @@ -7,6 +7,7 @@ internal static class Constants public static readonly string GenerativeModelPath = "Models/Llama-3.2-1B-Instruct-Q4_0.gguf"; public static readonly string GenerativeModelPath2 = "Models/smollm-360m-instruct-add-basics-q8_0.gguf"; public static readonly string EmbeddingModelPath = "Models/all-MiniLM-L12-v2.Q8_0.gguf"; + public static readonly string RerankingModelPath = "Models/jina-reranker-v1-tiny-en-FP16.gguf"; public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf"; public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf"; @@ -20,7 +21,7 @@ public static int CIGpuLayerCount { get { - if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) + //if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) { #if DEBUG return 20; @@ -28,7 +29,7 @@ public static int CIGpuLayerCount return 0; #endif } - else return 20; + //else return 20; } } } diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs index 5273215aa..94a6a8669 100644 --- a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs +++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs @@ -22,7 +22,7 @@ public ITextTokenizerTests(ITestOutputHelper testOutputHelper) _testOutputHelper = testOutputHelper; _infParams = new() { AntiPrompts = ["\n\n"] }; - _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams, ContextSize = 512 }; + _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams, ContextSize = 512, SplitMode = LLama.Native.GPUSplitMode.Layer }; testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}"); } diff --git a/LLama.Unittest/LLama.Unittest.csproj b/LLama.Unittest/LLama.Unittest.csproj index 11b65557e..6b0e0b8f4 100644 --- a/LLama.Unittest/LLama.Unittest.csproj +++ b/LLama.Unittest/LLama.Unittest.csproj @@ -1,4 +1,4 @@ - + net8.0 @@ -25,32 +25,105 @@ runtime; build; native; contentfiles; analyzers; buildtransitive all + - - - - - - - - - - - - - - - - - - - - - - + + + + https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf + Models + Llama-3.2-1B-Instruct-Q4_0.gguf + - + + https://huggingface.co/HuggingFaceTB/smollm-360M-instruct-v0.2-Q8_0-GGUF/resolve/main/smollm-360m-instruct-add-basics-q8_0.gguf + Models + smollm-360m-instruct-add-basics-q8_0.gguf + + + + https://huggingface.co/gpustack/jina-reranker-v1-tiny-en-GGUF/resolve/main/jina-reranker-v1-tiny-en-FP16.gguf + 
Models + jina-reranker-v1-tiny-en-FP16.gguf + + + + https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf + Models + llava-v1.6-mistral-7b.Q3_K_XS.gguf + + + + https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/mmproj-model-f16.gguf + Models + mmproj-model-f16.gguf + + + + https://huggingface.co/leliuga/all-MiniLM-L12-v2-GGUF/resolve/main/all-MiniLM-L12-v2.Q8_0.gguf + Models + all-MiniLM-L12-v2.Q8_0.gguf + + + + + + + + + + + + + $([System.IO.Path]::Combine($(DestinationFolder), $(LocalFileName))) + + + + + + + true + false + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -63,6 +136,9 @@ PreserveNewest + + PreserveNewest + PreserveNewest diff --git a/LLama.Unittest/LLamaEmbedderTests.cs b/LLama.Unittest/LLamaEmbedderTests.cs index a7741fd3e..f8a8f9fdb 100644 --- a/LLama.Unittest/LLamaEmbedderTests.cs +++ b/LLama.Unittest/LLamaEmbedderTests.cs @@ -45,8 +45,8 @@ private async Task CompareEmbeddings(string modelPath) var generator = (IEmbeddingGenerator>)embedder; Assert.NotNull(generator.GetService()); Assert.Equal(nameof(LLamaEmbedder), generator.GetService()?.ProviderName); - Assert.NotNull(generator.GetService()?.ModelId); - Assert.NotEmpty(generator.GetService()?.ModelId!); + Assert.NotNull(generator.GetService()?.DefaultModelId); + Assert.NotEmpty(generator.GetService()?.DefaultModelId!); Assert.Same(embedder, generator.GetService()); Assert.Same(generator, generator.GetService>>()); Assert.Null(generator.GetService()); diff --git a/LLama.Unittest/LLamaRerankerTests.cs b/LLama.Unittest/LLamaRerankerTests.cs new file mode 100644 index 000000000..b8dfcfa8d --- /dev/null +++ b/LLama.Unittest/LLamaRerankerTests.cs @@ -0,0 +1,79 @@ +using LLama.Common; +using LLama.Extensions; +using LLama.Native; +using Microsoft.Extensions.AI; +using System.Runtime.InteropServices; +using Xunit.Abstractions; + +namespace LLama.Unittest; + +public sealed class LLamaRerankerTests: IDisposable +{ + private readonly ITestOutputHelper _testOutputHelper; + private readonly LLamaReranker _reranker; + public LLamaRerankerTests(ITestOutputHelper testOutputHelper) + { + _testOutputHelper = testOutputHelper; + + var @params = new ModelParams(Constants.RerankingModelPath) + { + ContextSize = 0, + PoolingType = LLamaPoolingType.Rank, + GpuLayerCount = Constants.CIGpuLayerCount, + + }; + using var weights = LLamaWeights.LoadFromFile(@params); + _reranker = new LLamaReranker(weights, @params); + } + + public void Dispose() + { + _reranker.Dispose(); + } + + [Fact] + public async Task CompareRerankingScore() + { + + + var input = "what is panda?"; + var documents = new string[] { + "hi", + "it's a bear", + string.Join(", ","The giant panda (Ailuropoda melanoleuca)", + "sometimes called a panda bear or simply panda", + "is a bear species endemic to China.") + }; + var scores = await _reranker.GetRelevanceScores(input, documents, normalize: false); + + Assert.True(documents.Length == scores.Count); + + _testOutputHelper.WriteLine($"Rerank score 0: {scores[0]:F4}"); + _testOutputHelper.WriteLine($"Rerank score 1: {scores[1]:F4}"); + _testOutputHelper.WriteLine($"Rerank score 2: {scores[2]:F4}"); + } + + [Fact] + public async Task MostRelevantDocument() + { + var input = "what is panda?"; + var documents = new string[] { + "hi", + "it's a bear", + string.Join(", ","The giant panda (Ailuropoda melanoleuca)", + "sometimes called a panda bear or simply panda", + "is a bear species endemic to China.") + }; + var scores = await 
_reranker.GetRelevanceScores(input, documents, normalize: true); + + Assert.NotNull(scores); + Assert.True(documents.Length == scores.Count); + + int maxIndex = scores.Select((score, index) => (score, index)) + .MaxBy(x => x.score) + .index; + + var maxScoreDocument = documents[maxIndex]; + Assert.Equal(documents[2], maxScoreDocument); + } +} diff --git a/LLama.Unittest/ModelsParamsTests.cs b/LLama.Unittest/ModelsParamsTests.cs index 3fab9ed3e..59cf70bf5 100644 --- a/LLama.Unittest/ModelsParamsTests.cs +++ b/LLama.Unittest/ModelsParamsTests.cs @@ -41,6 +41,11 @@ public void SerializeRoundTripSystemTextJson() actual.MetadataOverrides = null!; expected.MetadataOverrides = null!; + // Same deal + Assert.True(expected.TensorBufferOverrides.SequenceEqual(actual.TensorBufferOverrides)); + actual.TensorBufferOverrides = null!; + expected.TensorBufferOverrides = null!; + // Check encoding is the same var b1 = expected.Encoding.GetBytes("Hello"); var b2 = actual.Encoding.GetBytes("Hello"); diff --git a/LLama.Unittest/Native/SafeLlamaModelHandleTests.cs b/LLama.Unittest/Native/SafeLlamaModelHandleTests.cs index 40e56ca63..8ad65615a 100644 --- a/LLama.Unittest/Native/SafeLlamaModelHandleTests.cs +++ b/LLama.Unittest/Native/SafeLlamaModelHandleTests.cs @@ -1,6 +1,8 @@ +using System.Runtime.InteropServices; using System.Text; using LLama.Common; using LLama.Extensions; +using Xunit; namespace LLama.Unittest.Native; @@ -17,19 +19,15 @@ public SafeLlamaModelHandleTests() }; _model = LLamaWeights.LoadFromFile(@params); } + + // Note: This test is flakey, it appears to often (but not always) fail the first time it is run after downloading the model file, but then succeed every time after! + //[SkippableFact] + //public void MetadataValByKey_ReturnsCorrectly() + //{ + // Skip.If(RuntimeInformation.IsOSPlatform(OSPlatform.OSX), "Skipping this test on macOS because for some reason the meta data is incorrect, but the rest of tests work well on mscOS [Check later!]."); - [Fact] - public void MetadataValByKey_ReturnsCorrectly() - { - const string key = "general.name"; - var template = _model.NativeHandle.MetadataValueByKey(key); - var name = Encoding.UTF8.GetStringFromSpan(template!.Value.Span); - - const string expected = "SmolLM 360M"; - Assert.Equal(expected, name); - - var metadataLookup = _model.Metadata[key]; - Assert.Equal(expected, metadataLookup); - Assert.Equal(name, metadataLookup); - } + // const string key = "general.name"; + // var template = _model.NativeHandle.MetadataValueByKey(key); + // var name = Encoding.UTF8.GetStringFromSpan(template!.Value.Span); + //} } diff --git a/LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs b/LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs new file mode 100644 index 000000000..1ce53f395 --- /dev/null +++ b/LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs @@ -0,0 +1,42 @@ +using System.Text; +using System.Xml.Linq; +using LLama.Common; +using LLama.Extensions; +using Microsoft.Extensions.Logging; + + +namespace LLama.Unittest.Native; + +public class SafeLlamaModelHandleVocabularyTests: IDisposable +{ + private readonly LLamaWeights _model; + + public SafeLlamaModelHandleVocabularyTests() + { + var @params = new ModelParams(Constants.RerankingModelPath) + { + ContextSize = 0, + PoolingType = LLama.Native.LLamaPoolingType.Rank, + GpuLayerCount = Constants.CIGpuLayerCount + }; + _model = LLamaWeights.LoadFromFile(@params); + } + + public void Dispose() + { + _model.Dispose(); + } + + [Fact] + public void GetLLamaTokenString() + { + 
var bos = _model.Vocab.BOS; + var eos = _model.Vocab.EOS; + + var bosStr = _model.Vocab.LLamaTokenToString(bos, true); + var eosStr = _model.Vocab.LLamaTokenToString(eos, true); + + Assert.Equal("", bosStr); + Assert.Equal("", eosStr); + } +} diff --git a/LLama.Web/Common/InferenceOptions.cs b/LLama.Web/Common/InferenceOptions.cs index e5735be63..c49d3aa31 100644 --- a/LLama.Web/Common/InferenceOptions.cs +++ b/LLama.Web/Common/InferenceOptions.cs @@ -20,6 +20,6 @@ public class InferenceOptions public IReadOnlyList AntiPrompts { get; set; } = Array.Empty(); /// - public required ISamplingPipeline SamplingPipeline { get; set; } + public ISamplingPipeline SamplingPipeline { get; set; } = new DefaultSamplingPipeline(); } } diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index a67a11a96..9824c0922 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -26,6 +26,9 @@ public class ModelOptions /// public GPUSplitMode? SplitMode { get; set; } + /// + public List TensorBufferOverrides { get; set; } = new(); + /// public int GpuLayerCount { get; set; } = 20; diff --git a/LLama.Web/LLama.Web.csproj b/LLama.Web/LLama.Web.csproj index 98eb9e266..176c03f73 100644 --- a/LLama.Web/LLama.Web.csproj +++ b/LLama.Web/LLama.Web.csproj @@ -15,7 +15,7 @@ - + diff --git a/LLama.Web/appsettings.Development.json b/LLama.Web/appsettings.Development.json index 770d3e931..ac6d49da3 100644 --- a/LLama.Web/appsettings.Development.json +++ b/LLama.Web/appsettings.Development.json @@ -3,7 +3,9 @@ "Logging": { "LogLevel": { "Default": "Information", - "Microsoft.AspNetCore": "Warning" + "Microsoft.AspNetCore": "Warning", + "Microsoft.AspNetCore.SignalR": "Debug", + "Microsoft.AspNetCore.Http.Connections": "Debug" } } } diff --git a/LLama.Web/appsettings.json b/LLama.Web/appsettings.json index a7a627d9b..caa27cc64 100644 --- a/LLama.Web/appsettings.json +++ b/LLama.Web/appsettings.json @@ -10,13 +10,13 @@ "ModelLoadType": 0, "Models": [ { - "Name": "Example LLama2-7b-Chat", + "Name": "Example LLava-v1.6-mistral", "MaxInstances": 20, - "ModelPath": "..\\LLama.Unittest\\Models\\llama-2-7b-chat.Q4_0.gguf", + "ModelPath": "..\\LLama.Unittest\\Models\\llava-v1.6-mistral-7b.Q3_K_XS.gguf", "ContextSize": 2048, "BatchSize": 2048, "Threads": 4, - "GpuLayerCount": 6, + "GpuLayerCount": 32, "UseMemorymap": true, "UseMemoryLock": false, "MainGpu": 0, diff --git a/LLama.WebAPI/LLama.WebAPI.csproj b/LLama.WebAPI/LLama.WebAPI.csproj index ed3e520da..774450610 100644 --- a/LLama.WebAPI/LLama.WebAPI.csproj +++ b/LLama.WebAPI/LLama.WebAPI.csproj @@ -9,7 +9,7 @@ - + diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs index cbbacafe5..8a752e190 100644 --- a/LLama/Abstractions/IModelParams.cs +++ b/LLama/Abstractions/IModelParams.cs @@ -38,6 +38,12 @@ public interface IModelParams /// GPUSplitMode? SplitMode { get; } + /// + /// Buffer type overrides for specific tensor patterns, allowing you to specify hardware devices to use for individual tensors or sets of tensors. + /// Equivalent to --override-tensor or -ot on the llama.cpp command line or tensor_buft_overrides internally. 
+ /// + List TensorBufferOverrides { get; } + /// /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) /// diff --git a/LLama/Abstractions/TensorBufferOverride.cs b/LLama/Abstractions/TensorBufferOverride.cs new file mode 100644 index 000000000..e8ec3f136 --- /dev/null +++ b/LLama/Abstractions/TensorBufferOverride.cs @@ -0,0 +1,36 @@ +using System; + +namespace LLama.Abstractions +{ + /// + /// Represents a mapping between a tensor name pattern and a specific buffer type + /// + public class TensorBufferOverride + { + /// + /// Pattern to match tensor names. This is a regular expression. You can check the tensor names via the model.Metadata. + /// + public string Pattern { get; set; } + + /// + /// Buffer type to use for matching tensors. Examples: CPU, GPU0, GPU1 + /// + public string BufferType { get; set; } + + /// + /// Creates a new tensor buffer override + /// + /// Pattern to match tensor names + /// Buffer type to use for matching tensors + public TensorBufferOverride(string pattern, string bufferType) + { + if (string.IsNullOrEmpty(pattern)) + throw new ArgumentException("Pattern cannot be null or empty", nameof(pattern)); + if (string.IsNullOrEmpty(bufferType)) + throw new ArgumentException("Buffer type cannot be null or empty", nameof(bufferType)); + + Pattern = pattern; + BufferType = bufferType; + } + } +} diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index 7e4b1a967..23f5681be 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -21,6 +21,9 @@ public record ModelParams /// public GPUSplitMode? SplitMode { get; set; } + /// + public List TensorBufferOverrides { get; set; } = new(); + /// public int GpuLayerCount { get; set; } = 20; diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs index 588564e33..2939318da 100644 --- a/LLama/Extensions/IModelParamsExtensions.cs +++ b/LLama/Extensions/IModelParamsExtensions.cs @@ -3,6 +3,7 @@ using System.Text; using LLama.Abstractions; using LLama.Native; +using System.Collections.Generic; namespace LLama.Extensions; @@ -45,6 +46,13 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam result.tensor_split = (float*)disposer.Add(@params.TensorSplits.Pin()).Pointer; } + // Add tensor buffer overrides + unsafe + { + result.tensor_buft_overrides = ConvertOverrides(@params.TensorBufferOverrides, disposer); + } + + // Add metadata overrides if (@params.MetadataOverrides.Count == 0) { unsafe @@ -92,4 +100,69 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam return disposer; } + + /// + /// Get a map from name of device (`ggml_backend_buft_name`) to the device type (`ggml_backend_dev_buffer_type`) + /// + /// Dictionary mapping buffer type names to their handles + private static IReadOnlyDictionary GetAvailableBufferTypes() + { + var result = new Dictionary(); + + var count = NativeApi.ggml_backend_dev_count(); + for (nuint i = 0; i < count; i++) + { + var dev = NativeApi.ggml_backend_dev_get(i); + var buft = NativeApi.ggml_backend_dev_buffer_type(dev); + + var name = Marshal.PtrToStringAnsi(NativeApi.ggml_backend_buft_name(buft)); + if (string.IsNullOrEmpty(name)) + continue; + + result[name] = buft; + } + + return result; + } + + private static unsafe LLamaModelTensorBufferOverride* ConvertOverrides(List overrides, GroupDisposable disposer) + { + // Early out if there are no overrides + if (overrides.Count == 0) + return null; + + var bufferTypes = GetAvailableBufferTypes(); 
+ + var overridesCount = 0; + var overridesArray = new LLamaModelTensorBufferOverride[overrides.Count + 1]; + + foreach (var @override in overrides) + { + // Check if we have this buffer type + if (!bufferTypes.TryGetValue(@override.BufferType, out var bufferType)) + continue; + + // Create null terminated string and pin this memory so it can be passed to native code + var patternBytes = Encoding.UTF8.GetBytes(@override.Pattern + "\0"); + var patternPin = patternBytes.AsMemory().Pin(); + disposer.Add(patternPin); + + // Add the item to the overridesArray + overridesArray[overridesCount++] = new() + { + Pattern = (byte*)patternPin.Pointer, + BufferType = bufferType + }; + } + + // Early out if there were no valid overrides + if (overridesCount == 0) + return null; + + // Pin it so it can be safely passed across to native code + var overrideArrayPin = overridesArray.AsMemory().Pin(); + disposer.Add(overrideArrayPin); + + return (LLamaModelTensorBufferOverride*)overrideArrayPin.Pointer; + } } \ No newline at end of file diff --git a/LLama/LLamaEmbedder.EmbeddingGenerator.cs b/LLama/LLamaEmbedder.EmbeddingGenerator.cs index 5ed82c0da..bce9f8d8b 100644 --- a/LLama/LLamaEmbedder.EmbeddingGenerator.cs +++ b/LLama/LLamaEmbedder.EmbeddingGenerator.cs @@ -22,8 +22,8 @@ public partial class LLamaEmbedder { return _metadata ??= new( nameof(LLamaEmbedder), - modelId: Context.NativeHandle.ModelHandle.ReadMetadata().TryGetValue("general.name", out var name) ? name : null, - dimensions: EmbeddingSize); + defaultModelId: Context.NativeHandle.ModelHandle.ReadMetadata().TryGetValue("general.name", out var name) ? name : null, + defaultModelDimensions: EmbeddingSize); } if (serviceType?.IsInstanceOfType(Context) is true) diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs index e00459d8c..0e28214f5 100644 --- a/LLama/LLamaEmbedder.cs +++ b/LLama/LLamaEmbedder.cs @@ -5,7 +5,9 @@ using LLama.Abstractions; using LLama.Exceptions; using LLama.Native; +using Microsoft.Extensions.AI; using Microsoft.Extensions.Logging; +using static System.Net.Mime.MediaTypeNames; namespace LLama; @@ -65,9 +67,8 @@ public async Task> GetEmbeddings(string input, Cancellati { // Add all of the tokens to the batch var tokens = Context.Tokenize(input, special: true); - var batch = new LLamaBatch(); - for (var i = 0; i < tokens.Length; i++) - batch.Add(tokens[i], i, LLamaSeqId.Zero, true); + if (tokens.Length > Context.ContextSize) + throw new ArgumentException($"Embedding prompt is longer than the context window ({tokens.Length} > {Context.ContextSize})", nameof(input)); // clear previous kv_cache values Context.NativeHandle.KvCacheClear(); @@ -75,27 +76,42 @@ public async Task> GetEmbeddings(string input, Cancellati // Check if we should cancel the work, just before doing anything expensive (encode/decode) cancellationToken.ThrowIfCancellationRequested(); - // Run model - switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder) + // Evaluate prompt in batch-size chunks + var n_past = 0; + var batch = new LLamaBatch(); + var batchSize = (int)Context.Params.BatchSize; + for (var i = 0; i < tokens.Length; i += batchSize) { - case (true, false): - { - var result = await Context.EncodeAsync(batch, cancellationToken); - if (result != EncodeResult.Ok) - throw new RuntimeError($"Failed to encode: {result}"); - break; - } + var n_eval = tokens.Length - i; + if (n_eval > batchSize) + n_eval = batchSize; + + batch.Clear(); + batch.AddRange(tokens.AsSpan(i, n_eval), n_past, LLamaSeqId.Zero, true); 
+ n_past += n_eval; - case (false, true): + // Run model + switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder) { - var result = await Context.DecodeAsync(batch, cancellationToken); - if (result != DecodeResult.Ok) - throw new RuntimeError($"Failed to decode: {result}"); - break; + case (true, false): + { + var result = await Context.EncodeAsync(batch, cancellationToken); + if (result != EncodeResult.Ok) + throw new RuntimeError($"Failed to encode: {result}"); + break; + } + + case (false, true): + { + var result = await Context.DecodeAsync(batch, cancellationToken); + if (result != DecodeResult.Ok) + throw new RuntimeError($"Failed to decode: {result}"); + break; + } + + default: + throw new NotSupportedException("Unsupported model type"); } - - default: - throw new NotSupportedException("Unsupported model type"); } // Extract results @@ -114,6 +130,13 @@ public async Task> GetEmbeddings(string input, Cancellati results.Add(Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero).ToArray()); } + // Normalize the embeddings vector + // https://github.com/ggerganov/llama.cpp/blob/2891c8aa9af17f4ff636ff3868bc34ff72b56e25/examples/embedding/embedding.cpp#L92 + foreach (var embedding in results) + { + embedding.EuclideanNormalization(); + } + Context.NativeHandle.KvCacheClear(); return (results, tokens.Length); diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs new file mode 100644 index 000000000..fa42d7f35 --- /dev/null +++ b/LLama/LLamaReranker.cs @@ -0,0 +1,201 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using System.Xml.Linq; +using LLama.Abstractions; +using LLama.Exceptions; +using LLama.Native; +using Microsoft.Extensions.Logging; + +namespace LLama; + +/// +/// Get rank scores between prompt and documents +/// +public sealed partial class LLamaReranker + : IDisposable +{ + /// + /// Dimension of embedding vectors + /// + public int EmbeddingSize => Context.EmbeddingSize; + + /// + /// LLama Context + /// + public LLamaContext Context { get; } + + /// + /// Create a new reranker, using the given LLamaWeights + /// + /// + /// + /// + public LLamaReranker(LLamaWeights weights, IContextParams @params, ILogger? logger = null) + { + if (@params.UBatchSize != @params.BatchSize) + throw new ArgumentException("For non-causal models, batch size must be equal to ubatch size", nameof(@params)); + if (weights.NativeHandle is { HasEncoder: true, HasDecoder: true }) + throw new NotSupportedException("Computing rank in encoder-decoder models is not supported"); + if (@params.PoolingType != LLamaPoolingType.Rank) + throw new NotSupportedException("Computing rank score, PoolingType must be equal to LLamaPoolingType.Rank"); + Context = weights.CreateContext(@params, logger); + NativeApi.llama_set_embeddings(Context.NativeHandle, true); + } + + /// + public void Dispose() + { + Context.Dispose(); + } + + /// + /// Retrieve relevance scores for input and documents by reranking, execute once. 
+ /// + /// + /// + /// Whether to normalize the score to the range (0, 1) + /// + /// + /// + /// + public async Task> GetRelevanceScores(string input, IReadOnlyList documents, bool normalize = false, CancellationToken cancellationToken = default) + { + List scores = new List(documents.Count); + var inputTokens = Context.Tokenize(input); + var batch = new LLamaBatch(); + var clearFlag = 0; + + for(var idx = 0; idx < documents.Count; idx++) + { + var docTokens = Context.Tokenize(documents[idx] ?? ""); + LLamaToken[] tokens = [.. inputTokens, .. docTokens]; + + if (batch.TokenCount + tokens.Length > Context.ContextSize) + { + scores.AddRange(await CalcRelevanceScores(batch, normalize, cancellationToken)); + batch.Clear(); + clearFlag = idx; + } + + for (var i = 0; i < tokens.Length; i++) + batch.Add(tokens[i], i, (LLamaSeqId)(idx - clearFlag), true); + } + if (batch.LogitPositionCount > 0) + { + scores.AddRange(await CalcRelevanceScores(batch, normalize, cancellationToken)); + batch.Clear(); + } + + return scores; + } + + /// + /// Retrieve relevance score for input and document by reranking + /// + /// + /// + /// + /// Whether to normalize the score to the range (0, 1) + /// + /// + /// + public async Task<(float Score, int Tokens)> GetRelevanceScoreWithTokenCount(string input, string document, bool normalize = false, CancellationToken cancellationToken = default) + { + var inputTokens = Context.Tokenize(input); + var docTokens = Context.Tokenize(document); + LLamaToken[] tokens = [..inputTokens, ..docTokens]; + var batch = new LLamaBatch(); + for (var i = 0; i < tokens.Length; i++) + batch.Add(tokens[i], i, LLamaSeqId.Zero, true); + + // clear previous kv_cache values + Context.NativeHandle.KvCacheClear(); + + // Check if we should cancel the work, just before doing anything expensive (encode/decode) + cancellationToken.ThrowIfCancellationRequested(); + + // Run model + switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder) + { + case (true, false): + { + var result = await Context.EncodeAsync(batch, cancellationToken); + if (result != EncodeResult.Ok) + throw new RuntimeError($"Failed to encode: {result}"); + break; + } + + case (false, true): + { + var result = await Context.DecodeAsync(batch, cancellationToken); + if (result != DecodeResult.Ok) + throw new RuntimeError($"Failed to decode: {result}"); + break; + } + + default: + throw new NotSupportedException("Unsupported model type"); + } + + var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0]; + + Context.NativeHandle.KvCacheClear(); + + return (normalize ? 
Sigmoid(score) : score, tokens.Length); + } + + private async Task> CalcRelevanceScores(LLamaBatch batch, bool normalize = false, CancellationToken cancellationToken = default) + { + var (logicCap, _) = batch.GetLogitPositions()[batch.LogitPositionCount - 1]; + var seqNum = logicCap.Value + 1; + List scores = new List(seqNum); + // clear previous kv_cache values + Context.NativeHandle.KvCacheClear(); + + // Check if we should cancel the work, just before doing anything expensive (encode/decode) + cancellationToken.ThrowIfCancellationRequested(); + + // Run model + switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder) + { + case (true, false): + { + var result = await Context.EncodeAsync(batch, cancellationToken); + if (result != EncodeResult.Ok) + throw new RuntimeError($"Failed to encode: {result}"); + break; + } + + case (false, true): + { + var result = await Context.DecodeAsync(batch, cancellationToken); + if (result != DecodeResult.Ok) + throw new RuntimeError($"Failed to decode: {result}"); + break; + } + + default: + throw new NotSupportedException("Unsupported model type"); + } + + for (var seq = 0; seq < seqNum; seq++) + { + var score = Context.NativeHandle.GetEmbeddingsSeq((LLamaSeqId)seq)[0]; + scores.Add(normalize ? Sigmoid(score) : score); + } + + Context.NativeHandle.KvCacheClear(); + + return scores; + } + + private float Sigmoid(float x) + { + return (float)(1 / (1 + Math.Exp(-x))); + } +} diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 22a3e04e1..0f67303dc 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -202,6 +202,28 @@ + + PreserveNewest + runtimes/linux-arm64/native/libllama.so + + + PreserveNewest + runtimes/linux-arm64/native/libggml.so + + + PreserveNewest + runtimes/linux-arm64/native/libggml-base.so + + + PreserveNewest + runtimes/linux-arm64/native/libggml-cpu.so + + + PreserveNewest + runtimes/linux-arm64/native/libllava_shared.so + + + PreserveNewest runtimes/linux-x64/native/cuda11/libllama.so @@ -466,4 +488,94 @@ runtimes/linux-x64/native/vulkan/libllava_shared.so + + + + + runtimes/android-x86/native/libllama.so + x86 + + + runtimes/android-x86/native/libggml.so + x86 + + + runtimes/android-x86/native/libggml-base.so + x86 + + + runtimes/android-x86/native/libggml-cpu.so + x86 + + + runtimes/android-x86/native/libllava_shared.so + x86 + + + + + + lib/x86_64/libllama.so + x86_64 + + + lib/x86_64/libggml.so + x86_64 + + + lib/x86_64/libggml-base.so + x86_64 + + + lib/x86_64/libggml-cpu.so + x86_64 + + + lib/x86_64/libllava_shared.so + x86_64 + + + + + + lib/arm64-v8a/libllama.so + arm64-v8a + + + lib/arm64-v8a/libggml.so + arm64-v8a + + + lib/arm64-v8a/libggml-base.so + arm64-v8a + + + lib/arm64-v8a/libggml-cpu.so + arm64-v8a + + + lib/arm64-v8a/libllava_shared.so + arm64-v8a + + + + \ No newline at end of file diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 70c8755a1..f400640d0 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -7,7 +7,7 @@ AnyCPU;x64;Arm64 True - 0.22.0 + 0.24.0 Rinne, Martin Evans, jlsantiago and all the other contributors in https://github.com/SciSharp/LLamaSharp/graphs/contributors. SciSharp STACK true @@ -22,7 +22,7 @@ With the higher-level APIs and RAG support, it's convenient to deploy LLM (Large Language Model) in your application with LLamaSharp. 
- Updated llama.cpp version to 5783575c9d99c4d9370495800663aa5397ceb0be + Updated llama.cpp version to ceda28ef8e310a8dee60bf275077a3eedae8e36c MIT packages @@ -51,13 +51,13 @@ - + - be7c3034108473be + ceda28ef8e310_v2 diff --git a/LLama/Native/DecodeResult.cs b/LLama/Native/DecodeResult.cs index 61056dd9d..8bf72c046 100644 --- a/LLama/Native/DecodeResult.cs +++ b/LLama/Native/DecodeResult.cs @@ -19,4 +19,19 @@ public enum DecodeResult /// Could not find a KV slot for the batch (try reducing the size of the batch or increase the context) /// NoKvSlot = 1, + + /// + /// Compute was aborted (e.g. due to callback request or timeout) + /// + ComputeAborted = 2, + + /// + /// Failed to allocate memory or reserve output space + /// + AllocationFailed = -2, + + /// + /// General failure during decode (e.g. internal error, slot failure) + /// + DecodeFailed = -3, } \ No newline at end of file diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs index 5159226fd..acb024852 100644 --- a/LLama/Native/LLamaModelParams.cs +++ b/LLama/Native/LLamaModelParams.cs @@ -14,6 +14,11 @@ public unsafe struct LLamaModelParams /// private IntPtr devices; + /// + /// NULL-terminated list of buffer types to use for tensors that match a pattern + /// + public LLamaModelTensorBufferOverride* tensor_buft_overrides; + /// /// // number of layers to store in VRAM /// diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index d11f4882e..d31b1bbc8 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ b/LLama/Native/LLamaModelQuantizeParams.cs @@ -89,6 +89,11 @@ public bool keep_split /// public IntPtr kv_overrides; + /// + /// pointer to vector containing tensor types + /// + public IntPtr tensor_types; + /// /// Create a LLamaModelQuantizeParams with default values /// diff --git a/LLama/Native/LLamaModelTensorBufferOverride.cs b/LLama/Native/LLamaModelTensorBufferOverride.cs new file mode 100644 index 000000000..3b7d3fa99 --- /dev/null +++ b/LLama/Native/LLamaModelTensorBufferOverride.cs @@ -0,0 +1,22 @@ +using System; + +namespace LLama.Native +{ + /// + /// Represents a mapping between a tensor name pattern and a backend buffer type
+ /// Original type: llama_model_tensor_buft_override + ///
+ [StructLayout(LayoutKind.Sequential)] + public unsafe struct LLamaModelTensorBufferOverride + { + /// + /// Tensor name pattern to match + /// + public byte* Pattern; + + /// + /// Backend buffer type to use for matching tensors, as obtained via ggml_backend_dev_buffer_type + /// + public IntPtr BufferType; + } +} diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs index 384ba0391..48ab5585b 100644 --- a/LLama/Native/LLamaVocabPreType.cs +++ b/LLama/Native/LLamaVocabPreType.cs @@ -38,5 +38,10 @@ internal enum LLamaVocabPreType MINERVA = 27, DEEPSEEK3_LLM = 28, GPT4O = 29, + SUPERBPE = 30, + TRILLION = 31, + BAILINGMOE = 32, + LLAMA4 = 33, + PIXTRAL = 34, } // ReSharper restore InconsistentNaming \ No newline at end of file diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs index b0e8a792a..9f6457cd1 100644 --- a/LLama/Native/Load/NativeLibraryUtils.cs +++ b/LLama/Native/Load/NativeLibraryUtils.cs @@ -88,19 +88,28 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib // On other platforms (Windows, Linux), we need to load the CPU backend from the specified AVX level directory // We are using the AVX level supplied by NativeLibraryConfig, which automatically detects the highest supported AVX level for us - // ggml-cpu - dependencyPaths.Add(Path.Combine( - $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}", - $"{libPrefix}ggml-cpu{ext}" - )); - - // ggml-cuda - if (library.Metadata.UseCuda) - dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}")); - - // ggml-vulkan - if (library.Metadata.UseVulkan) - dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}")); + if (os == "linux-arm64"){ + dependencyPaths.Add(Path.Combine( + $"runtimes/{os}/native", + $"{libPrefix}ggml-cpu{ext}" + )); + } + else{ + // ggml-cpu + dependencyPaths.Add(Path.Combine( + $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}", + $"{libPrefix}ggml-cpu{ext}" + )); + + // ggml-cuda + if (library.Metadata.UseCuda) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}")); + + // ggml-vulkan + if (library.Metadata.UseVulkan) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}")); + } + } } @@ -218,6 +227,13 @@ public static void GetPlatformPathParts(OSPlatform platform, out string os, out if (platform == OSPlatform.Linux) { + if(System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported){ + // linux arm64 + os = "linux-arm64"; + fileExtension = ".so"; + libPrefix = "lib"; + return; + } if(RuntimeInformation.RuntimeIdentifier.ToLower().StartsWith("alpine")) { // alpine linux distro diff --git a/LLama/Native/Load/NativeLibraryWithAvx.cs b/LLama/Native/Load/NativeLibraryWithAvx.cs index 932c49866..e6cbd86f3 100644 --- a/LLama/Native/Load/NativeLibraryWithAvx.cs +++ b/LLama/Native/Load/NativeLibraryWithAvx.cs @@ -50,11 +50,17 @@ public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL private string? GetAvxPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? 
logCallback) { NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix); - var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel); - if (!string.IsNullOrEmpty(avxStr)) - avxStr += "/"; - var relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}"; - return relativePath; + if (os != "linux-arm64"){ + var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel); + if (!string.IsNullOrEmpty(avxStr)) + avxStr += "/"; + var relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}"; + return relativePath; + } else { + var relativePath = $"runtimes/{os}/native/{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}"; + return relativePath; + } + } } #endif diff --git a/LLama/Native/NativeApi.Load.cs b/LLama/Native/NativeApi.Load.cs index 5ad30d032..4555ed0d2 100644 --- a/LLama/Native/NativeApi.Load.cs +++ b/LLama/Native/NativeApi.Load.cs @@ -53,6 +53,12 @@ private static void SetDllImportResolver() // NativeLibrary is not available on older runtimes. We'll have to depend on // the normal runtime dll resolution there. #if NET5_0_OR_GREATER + if (OperatingSystem.IsAndroid()) + { + // Android doesn't support DllImportResolver, so we have to rely on the default search path + return; + } + NativeLibrary.SetDllImportResolver(typeof(NativeApi).Assembly, (name, _, _) => { if (name == "llama") @@ -101,6 +107,8 @@ private static void SetDllImportResolver() internal const string libraryName = "llama"; internal const string llavaLibraryName = "llava_shared"; + internal const string ggmlLibraryName = "ggml"; + internal const string ggmlBaseLibraryName = "ggml-base"; private static INativeLibrary? _loadedLLamaLibrary = null; private static INativeLibrary? _loadedLLavaLibrary = null; diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 4c788b7a0..87cf02c78 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -290,6 +290,14 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback) [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] internal static extern void llama_kv_self_clear(SafeLLamaContextHandle ctx); + [Obsolete("Use `llama_kv_self_clear` instead")] + /// + /// Clear the KV cache. Both cell info is erased and KV data is zeroed + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + internal static extern void llama_kv_cache_clear(SafeLLamaContextHandle ctx); + /// /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) /// @@ -439,5 +447,36 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback) // it would expose the raw pointer to the model, without properly wrapping it in a SafeLLamaModelHandle. 
//[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] //public static void llama_model* llama_get_model(SafeLLamaContextHandle ctx); + + /// + /// Get the number of available backend devices + /// + /// Count of available backend devices + [DllImport(ggmlLibraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern nuint ggml_backend_dev_count(); + + /// + /// Get a backend device by index + /// + /// Device index + /// Pointer to the backend device + [DllImport(ggmlLibraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern IntPtr ggml_backend_dev_get(nuint i); + + /// + /// Get the buffer type for a backend device + /// + /// Backend device pointer + /// Pointer to the buffer type + [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern IntPtr ggml_backend_dev_buffer_type(IntPtr dev); + + /// + /// Get the name of a buffer type + /// + /// Buffer type pointer + /// Name of the buffer type + [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern IntPtr ggml_backend_buft_name(IntPtr buft); } } diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index faa390f76..467dd98e7 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -389,6 +389,15 @@ static SafeLLamaContextHandle() [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern LLamaKvCacheNative llama_get_kv_self(SafeLLamaContextHandle ctx); + + /// + /// Set whether the model is in warmup mode or not + /// If true, all model tensors are activated during llama_decode() to load and cache their weights. + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern void llama_set_warmup(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool warmup); #endregion #region LoRA diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs index 8d6cd3015..bad1a1974 100644 --- a/LLama/Native/SafeLLamaSamplerHandle.cs +++ b/LLama/Native/SafeLLamaSamplerHandle.cs @@ -270,6 +270,7 @@ public void AddMirostat2Sampler(uint seed, float tau, float eta) /// /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// + /// Setting k <= 0 makes this a noop /// public void AddTopK(int k) { @@ -408,20 +409,36 @@ public void AddFillInMiddleInfill(SafeLlamaModelHandle model) } /// - /// Create a sampler which makes tokens impossible unless they match the grammar + /// Create a sampler which makes tokens impossible unless they match the grammar. /// - /// + /// The model that this grammar will be used with /// /// Root rule of the grammar /// public void AddGrammar(SafeLlamaModelHandle model, string grammar, string root) + { + AddGrammar(model.Vocab, grammar, root); + } + + /// + /// Create a sampler which makes tokens impossible unless they match the grammar. 
+ /// + /// The vocabulary that this grammar will be used with + /// + /// Root rule of the grammar + /// + public void AddGrammar(SafeLlamaModelHandle.Vocabulary vocab, string grammar, string root) { unsafe { - llama_sampler_chain_add(this, llama_sampler_init_grammar(model.Vocab.VocabNative, grammar, root)); + llama_sampler_chain_add(this, llama_sampler_init_grammar(vocab.VocabNative, grammar, root)); } // ReSharper disable InconsistentNaming + // @details Initializes a GBNF grammar, see grammars/README.md for details. + // @param vocab The vocabulary that this grammar will be used with. + // @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails. + // @param grammar_root The name of the start symbol for the grammar. [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] static extern unsafe IntPtr llama_sampler_init_grammar(LLamaVocabNative* model, string grammar_str, string grammar_root); // ReSharper restore InconsistentNaming diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index db198ec30..801d25167 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -651,7 +651,18 @@ internal Vocabulary(SafeLlamaModelHandle model) _model = model; } - private string? LLamaTokenToString(LLamaToken? token, bool isSpecialToken) + private static LLamaToken? Normalize(LLamaToken token) + { + return token == -1 ? null : token; + } + + /// + /// Translate LLamaToken to String + /// + /// + /// + /// + public string? LLamaTokenToString(LLamaToken? token, bool isSpecialToken) { if (!token.HasValue) return null; @@ -676,11 +687,6 @@ internal Vocabulary(SafeLlamaModelHandle model) return Encoding.UTF8.GetStringFromSpan(slice); } - private static LLamaToken? Normalize(LLamaToken token) - { - return token == -1 ? null : token; - } - /// /// Total number of tokens in this vocabulary /// diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.Android.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.Android.nuspec new file mode 100644 index 000000000..0d45b1492 --- /dev/null +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.Android.nuspec @@ -0,0 +1,41 @@ + + + + LLamaSharp.Backend.Cpu.Android + $version$ + LLamaSharp.Backend.Cpu.Android, the backend for LLamaSharp + llama.cpp Authors + false + MIT + icon512.png + https://github.com/SciSharp/LLamaSharp + LLamaSharp.Backend.Cpu.Android is a backend for LLamaSharp to use with Android Cpu only. + + Copyright 2023 The llama.cpp Authors. All rights reserved. + LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec index 7c69534da..aeef403eb 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec @@ -1,46 +1,46 @@ - - LLamaSharp.Backend.Cpu - $version$ - LLamaSharp.Backend.Cpu, the backend for LLamaSharp - llama.cpp Authors - false - MIT - icon512.png - https://github.com/SciSharp/LLamaSharp - LLamaSharp.Backend.Cpu is a backend for LLamaSharp to use with Cpu only. - - Copyright 2023 The llama.cpp Authors. All rights reserved. 
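A hedged sketch of the new vocabulary-based `AddGrammar` overload in use, constraining a sampler chain to a tiny GBNF grammar. Only methods visible in this diff (`AddTopK`, `AddGrammar`, `model.Vocab`) are used; the sampler-chain handle type follows the naming in SafeLLamaSamplerHandle.cs, the handles are assumed to be created elsewhere, and the grammar itself is purely illustrative.

```cs
using LLama.Native;

internal static class GrammarSamplerSketch
{
    // Illustrative: restrict sampling so the model can only produce "yes" or "no".
    public static void ConstrainToYesNo(SafeLLamaSamplerChainHandle chain, SafeLlamaModelHandle model)
    {
        // Minimal GBNF grammar with a single root rule (see llama.cpp grammars/README.md).
        const string gbnf = "root ::= \"yes\" | \"no\"";

        chain.AddTopK(40);                            // note: k <= 0 would make this a no-op
        chain.AddGrammar(model.Vocab, gbnf, "root");  // new overload taking the Vocabulary directly
    }
}
```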
- LLamaSharp LLama LLM GPT AI ChatBot SciSharp - + + LLamaSharp.Backend.Cpu + $version$ + LLamaSharp.Backend.Cpu, the backend for LLamaSharp + llama.cpp Authors + false + MIT + icon512.png + https://github.com/SciSharp/LLamaSharp + LLamaSharp.Backend.Cpu is a backend for LLamaSharp to use with Cpu only. + + Copyright 2023 The llama.cpp Authors. All rights reserved. + LLamaSharp LLama LLM GPT AI ChatBot SciSharp + - - + + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + @@ -66,7 +66,13 @@ - + + + + + + + @@ -97,22 +103,22 @@ - - - - - - + + + + + + - - - - - - - - - - - - + + + + + + + + + + + + \ No newline at end of file diff --git a/LLama/runtimes/build/LLamaSharpBackend.props b/LLama/runtimes/build/LLamaSharpBackend.props index 422969d88..006b4dabc 100644 --- a/LLama/runtimes/build/LLamaSharpBackend.props +++ b/LLama/runtimes/build/LLamaSharpBackend.props @@ -14,4 +14,93 @@ + + + + runtimes\android-x86\native\libllama.so + x86 + + + runtimes\android-x86\native\libggml.so + x86 + + + runtimes\android-x86\native\libggml-base.so + x86 + + + runtimes\android-x86\native\libggml-cpu.so + x86 + + + runtimes\android-x86\native\libllava_shared.so + x86 + + + + + + lib\x86_64\libllama.so + x86_64 + + + lib\x86_64\libggml.so + x86_64 + + + lib\x86_64\libggml-base.so + x86_64 + + + lib\x86_64\libggml-cpu.so + x86_64 + + + lib\x86_64\libllava_shared.so + x86_64 + + + + + + lib\arm64-v8a\libllama.so + arm64-v8a + + + lib\arm64-v8a\libggml.so + arm64-v8a + + + lib\arm64-v8a\libggml-base.so + arm64-v8a + + + lib\arm64-v8a\libggml-cpu.so + arm64-v8a + + + lib\arm64-v8a\libllava_shared.so + arm64-v8a + + + + diff --git a/Llama.Mobile/App.xaml b/Llama.Mobile/App.xaml new file mode 100644 index 000000000..e5b403011 --- /dev/null +++ b/Llama.Mobile/App.xaml @@ -0,0 +1,14 @@ + + + + + + + + + + + diff --git a/Llama.Mobile/App.xaml.cs b/Llama.Mobile/App.xaml.cs new file mode 100644 index 000000000..c2db0b0b9 --- /dev/null +++ b/Llama.Mobile/App.xaml.cs @@ -0,0 +1,12 @@ +namespace Llama.Mobile +{ + public partial class App : Application + { + public App() + { + InitializeComponent(); + + MainPage = new AppShell(); + } + } +} diff --git a/Llama.Mobile/AppShell.xaml b/Llama.Mobile/AppShell.xaml new file mode 100644 index 000000000..65ae2f591 --- /dev/null +++ b/Llama.Mobile/AppShell.xaml @@ -0,0 +1,15 @@ + + + + + + diff --git a/Llama.Mobile/AppShell.xaml.cs b/Llama.Mobile/AppShell.xaml.cs new file mode 100644 index 000000000..33f40ba5c --- /dev/null +++ b/Llama.Mobile/AppShell.xaml.cs @@ -0,0 +1,10 @@ +namespace Llama.Mobile +{ + public partial class AppShell : Shell + { + public AppShell() + { + InitializeComponent(); + } + } +} diff --git a/Llama.Mobile/Llama.Mobile.csproj b/Llama.Mobile/Llama.Mobile.csproj new file mode 100644 index 000000000..a51a3eb0f --- /dev/null +++ b/Llama.Mobile/Llama.Mobile.csproj @@ -0,0 +1,82 @@ + + + + + true + false + + + + + + + + + net8.0-android + + + + + + + + + + Exe + Llama.Mobile + true + true + enable + enable + + + Llama.Mobile + + + com.llama.mobile + + + 1.0 + 1 + + 11.0 + 13.1 + 21.0 + 10.0.17763.0 + 10.0.17763.0 + 6.5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Llama.Mobile/MainPage.xaml b/Llama.Mobile/MainPage.xaml new file mode 100644 index 000000000..5bd8e7e94 --- /dev/null +++ b/Llama.Mobile/MainPage.xaml @@ -0,0 +1,14 @@ + + + + \ No newline at end of file diff --git a/Llama.Mobile/MainPage.xaml.cs b/Llama.Mobile/MainPage.xaml.cs new file mode 100644 index 
000000000..8c9cb310c --- /dev/null +++ b/Llama.Mobile/MainPage.xaml.cs @@ -0,0 +1,16 @@ +namespace Llama.Mobile; + +using LLama.Native; + +public partial class MainPage : ContentPage +{ + public MainPage() + { + InitializeComponent(); + + //Load the native library + NativeApi.llama_empty_call(); + + label1.Text = "llama.cpp loaded successfully"; + } +} diff --git a/Llama.Mobile/MauiProgram.cs b/Llama.Mobile/MauiProgram.cs new file mode 100644 index 000000000..fe17dcd27 --- /dev/null +++ b/Llama.Mobile/MauiProgram.cs @@ -0,0 +1,25 @@ +using Microsoft.Extensions.Logging; + +namespace Llama.Mobile +{ + public static class MauiProgram + { + public static MauiApp CreateMauiApp() + { + var builder = MauiApp.CreateBuilder(); + builder + .UseMauiApp() + .ConfigureFonts(fonts => + { + fonts.AddFont("OpenSans-Regular.ttf", "OpenSansRegular"); + fonts.AddFont("OpenSans-Semibold.ttf", "OpenSansSemibold"); + }); + +#if DEBUG + builder.Logging.AddDebug(); +#endif + + return builder.Build(); + } + } +} diff --git a/Llama.Mobile/Platforms/Android/AndroidManifest.xml b/Llama.Mobile/Platforms/Android/AndroidManifest.xml new file mode 100644 index 000000000..e9937ad77 --- /dev/null +++ b/Llama.Mobile/Platforms/Android/AndroidManifest.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Llama.Mobile/Platforms/Android/MainActivity.cs b/Llama.Mobile/Platforms/Android/MainActivity.cs new file mode 100644 index 000000000..8a0d5c68a --- /dev/null +++ b/Llama.Mobile/Platforms/Android/MainActivity.cs @@ -0,0 +1,11 @@ +using Android.App; +using Android.Content.PM; +using Android.OS; + +namespace Llama.Mobile +{ + [Activity(Theme = "@style/Maui.SplashTheme", MainLauncher = true, LaunchMode = LaunchMode.SingleTop, ConfigurationChanges = ConfigChanges.ScreenSize | ConfigChanges.Orientation | ConfigChanges.UiMode | ConfigChanges.ScreenLayout | ConfigChanges.SmallestScreenSize | ConfigChanges.Density)] + public class MainActivity : MauiAppCompatActivity + { + } +} diff --git a/Llama.Mobile/Platforms/Android/MainApplication.cs b/Llama.Mobile/Platforms/Android/MainApplication.cs new file mode 100644 index 000000000..9bf5331af --- /dev/null +++ b/Llama.Mobile/Platforms/Android/MainApplication.cs @@ -0,0 +1,16 @@ +using Android.App; +using Android.Runtime; + +namespace Llama.Mobile +{ + [Application] + public class MainApplication : MauiApplication + { + public MainApplication(IntPtr handle, JniHandleOwnership ownership) + : base(handle, ownership) + { + } + + protected override MauiApp CreateMauiApp() => MauiProgram.CreateMauiApp(); + } +} diff --git a/Llama.Mobile/Platforms/Android/Resources/values/colors.xml b/Llama.Mobile/Platforms/Android/Resources/values/colors.xml new file mode 100644 index 000000000..c04d7492a --- /dev/null +++ b/Llama.Mobile/Platforms/Android/Resources/values/colors.xml @@ -0,0 +1,6 @@ + + + #512BD4 + #2B0B98 + #2B0B98 + \ No newline at end of file diff --git a/Llama.Mobile/Platforms/MacCatalyst/AppDelegate.cs b/Llama.Mobile/Platforms/MacCatalyst/AppDelegate.cs new file mode 100644 index 000000000..5af0d2d6f --- /dev/null +++ b/Llama.Mobile/Platforms/MacCatalyst/AppDelegate.cs @@ -0,0 +1,10 @@ +using Foundation; + +namespace Llama.Mobile +{ + [Register("AppDelegate")] + public class AppDelegate : MauiUIApplicationDelegate + { + protected override MauiApp CreateMauiApp() => MauiProgram.CreateMauiApp(); + } +} diff --git a/Llama.Mobile/Platforms/MacCatalyst/Entitlements.plist b/Llama.Mobile/Platforms/MacCatalyst/Entitlements.plist new file mode 100644 index 000000000..de4adc94a --- 
/dev/null +++ b/Llama.Mobile/Platforms/MacCatalyst/Entitlements.plist @@ -0,0 +1,14 @@ + + + + + + + com.apple.security.app-sandbox + + + com.apple.security.network.client + + + + diff --git a/Llama.Mobile/Platforms/MacCatalyst/Info.plist b/Llama.Mobile/Platforms/MacCatalyst/Info.plist new file mode 100644 index 000000000..726897715 --- /dev/null +++ b/Llama.Mobile/Platforms/MacCatalyst/Info.plist @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + UIDeviceFamily + + 2 + + UIRequiredDeviceCapabilities + + arm64 + + UISupportedInterfaceOrientations + + UIInterfaceOrientationPortrait + UIInterfaceOrientationLandscapeLeft + UIInterfaceOrientationLandscapeRight + + UISupportedInterfaceOrientations~ipad + + UIInterfaceOrientationPortrait + UIInterfaceOrientationPortraitUpsideDown + UIInterfaceOrientationLandscapeLeft + UIInterfaceOrientationLandscapeRight + + XSAppIconAssets + Assets.xcassets/appicon.appiconset + + diff --git a/Llama.Mobile/Platforms/MacCatalyst/Program.cs b/Llama.Mobile/Platforms/MacCatalyst/Program.cs new file mode 100644 index 000000000..d3bd693a9 --- /dev/null +++ b/Llama.Mobile/Platforms/MacCatalyst/Program.cs @@ -0,0 +1,16 @@ +using ObjCRuntime; +using UIKit; + +namespace Llama.Mobile +{ + public class Program + { + // This is the main entry point of the application. + static void Main(string[] args) + { + // if you want to use a different Application Delegate class from "AppDelegate" + // you can specify it here. + UIApplication.Main(args, null, typeof(AppDelegate)); + } + } +} diff --git a/Llama.Mobile/Platforms/Tizen/Main.cs b/Llama.Mobile/Platforms/Tizen/Main.cs new file mode 100644 index 000000000..030e40e44 --- /dev/null +++ b/Llama.Mobile/Platforms/Tizen/Main.cs @@ -0,0 +1,17 @@ +using Microsoft.Maui; +using Microsoft.Maui.Hosting; +using System; + +namespace Llama.Mobile +{ + internal class Program : MauiApplication + { + protected override MauiApp CreateMauiApp() => MauiProgram.CreateMauiApp(); + + static void Main(string[] args) + { + var app = new Program(); + app.Run(args); + } + } +} diff --git a/Llama.Mobile/Platforms/Tizen/tizen-manifest.xml b/Llama.Mobile/Platforms/Tizen/tizen-manifest.xml new file mode 100644 index 000000000..58d0846a5 --- /dev/null +++ b/Llama.Mobile/Platforms/Tizen/tizen-manifest.xml @@ -0,0 +1,15 @@ + + + + + + maui-appicon-placeholder + + + + + http://tizen.org/privilege/internet + + + + \ No newline at end of file diff --git a/Llama.Mobile/Platforms/Windows/App.xaml b/Llama.Mobile/Platforms/Windows/App.xaml new file mode 100644 index 000000000..51d994306 --- /dev/null +++ b/Llama.Mobile/Platforms/Windows/App.xaml @@ -0,0 +1,8 @@ + + + diff --git a/Llama.Mobile/Platforms/Windows/App.xaml.cs b/Llama.Mobile/Platforms/Windows/App.xaml.cs new file mode 100644 index 000000000..17804342a --- /dev/null +++ b/Llama.Mobile/Platforms/Windows/App.xaml.cs @@ -0,0 +1,25 @@ +using Microsoft.UI.Xaml; + +// To learn more about WinUI, the WinUI project structure, +// and more about our project templates, see: http://aka.ms/winui-project-info. + +namespace Llama.Mobile.WinUI +{ + /// + /// Provides application-specific behavior to supplement the default Application class. + /// + public partial class App : MauiWinUIApplication + { + /// + /// Initializes the singleton application object. This is the first line of authored code + /// executed, and as such is the logical equivalent of main() or WinMain(). 
+ /// + public App() + { + this.InitializeComponent(); + } + + protected override MauiApp CreateMauiApp() => MauiProgram.CreateMauiApp(); + } + +} diff --git a/Llama.Mobile/Platforms/Windows/Package.appxmanifest b/Llama.Mobile/Platforms/Windows/Package.appxmanifest new file mode 100644 index 000000000..eb72027fd --- /dev/null +++ b/Llama.Mobile/Platforms/Windows/Package.appxmanifest @@ -0,0 +1,46 @@ + + + + + + + + + $placeholder$ + User Name + $placeholder$.png + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Llama.Mobile/Platforms/Windows/app.manifest b/Llama.Mobile/Platforms/Windows/app.manifest new file mode 100644 index 000000000..9991c324d --- /dev/null +++ b/Llama.Mobile/Platforms/Windows/app.manifest @@ -0,0 +1,15 @@ + + + + + + + + true/PM + PerMonitorV2, PerMonitor + + + diff --git a/Llama.Mobile/Platforms/iOS/AppDelegate.cs b/Llama.Mobile/Platforms/iOS/AppDelegate.cs new file mode 100644 index 000000000..5af0d2d6f --- /dev/null +++ b/Llama.Mobile/Platforms/iOS/AppDelegate.cs @@ -0,0 +1,10 @@ +using Foundation; + +namespace Llama.Mobile +{ + [Register("AppDelegate")] + public class AppDelegate : MauiUIApplicationDelegate + { + protected override MauiApp CreateMauiApp() => MauiProgram.CreateMauiApp(); + } +} diff --git a/Llama.Mobile/Platforms/iOS/Info.plist b/Llama.Mobile/Platforms/iOS/Info.plist new file mode 100644 index 000000000..0004a4fde --- /dev/null +++ b/Llama.Mobile/Platforms/iOS/Info.plist @@ -0,0 +1,32 @@ + + + + + LSRequiresIPhoneOS + + UIDeviceFamily + + 1 + 2 + + UIRequiredDeviceCapabilities + + arm64 + + UISupportedInterfaceOrientations + + UIInterfaceOrientationPortrait + UIInterfaceOrientationLandscapeLeft + UIInterfaceOrientationLandscapeRight + + UISupportedInterfaceOrientations~ipad + + UIInterfaceOrientationPortrait + UIInterfaceOrientationPortraitUpsideDown + UIInterfaceOrientationLandscapeLeft + UIInterfaceOrientationLandscapeRight + + XSAppIconAssets + Assets.xcassets/appicon.appiconset + + diff --git a/Llama.Mobile/Platforms/iOS/Program.cs b/Llama.Mobile/Platforms/iOS/Program.cs new file mode 100644 index 000000000..d3bd693a9 --- /dev/null +++ b/Llama.Mobile/Platforms/iOS/Program.cs @@ -0,0 +1,16 @@ +using ObjCRuntime; +using UIKit; + +namespace Llama.Mobile +{ + public class Program + { + // This is the main entry point of the application. + static void Main(string[] args) + { + // if you want to use a different Application Delegate class from "AppDelegate" + // you can specify it here. 
+ UIApplication.Main(args, null, typeof(AppDelegate)); + } + } +} diff --git a/Llama.Mobile/Platforms/iOS/Resources/PrivacyInfo.xcprivacy b/Llama.Mobile/Platforms/iOS/Resources/PrivacyInfo.xcprivacy new file mode 100644 index 000000000..24ab3b433 --- /dev/null +++ b/Llama.Mobile/Platforms/iOS/Resources/PrivacyInfo.xcprivacy @@ -0,0 +1,51 @@ + + + + + + NSPrivacyAccessedAPITypes + + + NSPrivacyAccessedAPIType + NSPrivacyAccessedAPICategoryFileTimestamp + NSPrivacyAccessedAPITypeReasons + + C617.1 + + + + NSPrivacyAccessedAPIType + NSPrivacyAccessedAPICategorySystemBootTime + NSPrivacyAccessedAPITypeReasons + + 35F9.1 + + + + NSPrivacyAccessedAPIType + NSPrivacyAccessedAPICategoryDiskSpace + NSPrivacyAccessedAPITypeReasons + + E174.1 + + + + + + diff --git a/Llama.Mobile/Resources/AppIcon/appicon.svg b/Llama.Mobile/Resources/AppIcon/appicon.svg new file mode 100644 index 000000000..9d63b6513 --- /dev/null +++ b/Llama.Mobile/Resources/AppIcon/appicon.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/Llama.Mobile/Resources/AppIcon/appiconfg.svg b/Llama.Mobile/Resources/AppIcon/appiconfg.svg new file mode 100644 index 000000000..21dfb25f1 --- /dev/null +++ b/Llama.Mobile/Resources/AppIcon/appiconfg.svg @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/Llama.Mobile/Resources/Fonts/OpenSans-Regular.ttf b/Llama.Mobile/Resources/Fonts/OpenSans-Regular.ttf new file mode 100644 index 000000000..ee3f28f4a Binary files /dev/null and b/Llama.Mobile/Resources/Fonts/OpenSans-Regular.ttf differ diff --git a/Llama.Mobile/Resources/Fonts/OpenSans-Semibold.ttf b/Llama.Mobile/Resources/Fonts/OpenSans-Semibold.ttf new file mode 100644 index 000000000..bc81019ae Binary files /dev/null and b/Llama.Mobile/Resources/Fonts/OpenSans-Semibold.ttf differ diff --git a/Llama.Mobile/Resources/Images/dotnet_bot.png b/Llama.Mobile/Resources/Images/dotnet_bot.png new file mode 100644 index 000000000..f93ce025a Binary files /dev/null and b/Llama.Mobile/Resources/Images/dotnet_bot.png differ diff --git a/Llama.Mobile/Resources/Raw/AboutAssets.txt b/Llama.Mobile/Resources/Raw/AboutAssets.txt new file mode 100644 index 000000000..89dc758d6 --- /dev/null +++ b/Llama.Mobile/Resources/Raw/AboutAssets.txt @@ -0,0 +1,15 @@ +Any raw assets you want to be deployed with your application can be placed in +this directory (and child directories). Deployment of the asset to your application +is automatically handled by the following `MauiAsset` Build Action within your `.csproj`. 
+ + + +These files will be deployed with your package and will be accessible using Essentials: + + async Task LoadMauiAsset() + { + using var stream = await FileSystem.OpenAppPackageFileAsync("AboutAssets.txt"); + using var reader = new StreamReader(stream); + + var contents = reader.ReadToEnd(); + } diff --git a/Llama.Mobile/Resources/Splash/splash.svg b/Llama.Mobile/Resources/Splash/splash.svg new file mode 100644 index 000000000..21dfb25f1 --- /dev/null +++ b/Llama.Mobile/Resources/Splash/splash.svg @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/Llama.Mobile/Resources/Styles/Colors.xaml b/Llama.Mobile/Resources/Styles/Colors.xaml new file mode 100644 index 000000000..30307a5dd --- /dev/null +++ b/Llama.Mobile/Resources/Styles/Colors.xaml @@ -0,0 +1,45 @@ + + + + + + + #512BD4 + #ac99ea + #242424 + #DFD8F7 + #9880e5 + #2B0B98 + + White + Black + #D600AA + #190649 + #1f1f1f + + #E1E1E1 + #C8C8C8 + #ACACAC + #919191 + #6E6E6E + #404040 + #212121 + #141414 + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/Llama.Mobile/Resources/Styles/Styles.xaml b/Llama.Mobile/Resources/Styles/Styles.xaml new file mode 100644 index 000000000..6641e3aed --- /dev/null +++ b/Llama.Mobile/Resources/Styles/Styles.xaml @@ -0,0 +1,427 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/README.md b/README.md index 17bc21404..240f9931e 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ ## 📖Documentation - [Quick start](https://scisharp.github.io/LLamaSharp/latest/QuickStart/) +- [Ask AI via deep-wiki](https://deepwiki.com/SciSharp/LLamaSharp) - [FAQ](https://scisharp.github.io/LLamaSharp/latest/FAQ/) - [Tutorial](https://scisharp.github.io/LLamaSharp/latest/Tutorials/NativeLibraryConfig/) - [Full documentation](https://scisharp.github.io/LLamaSharp/latest/) @@ -65,7 +66,7 @@ There are integrations for the following libraries, making it easier to develop - [kernel-memory](https://github.com/microsoft/kernel-memory): a multi-modal AI Service specialized in the efficient indexing of datasets through custom continuous data hybrid pipelines, with support for RAG ([Retrieval Augmented Generation](https://en.wikipedia.org/wiki/Prompt_engineering#Retrieval-augmented_generation)), synthetic memory, prompt engineering, and custom semantic memory processing. - [BotSharp](https://github.com/SciSharp/BotSharp): an open source machine learning framework for AI Bot platform builder. - [Langchain](https://github.com/tryAGI/LangChain): a framework for developing applications powered by language models. - +- [MaIN.NET](https://github.com/wisedev-code/MaIN.NET): simplistic approach to orchestrating agents/chats from different (llm) providers The following examples show how to build APPs with LLamaSharp. @@ -120,8 +121,9 @@ Generally, we recommend downloading models with quantization rather than fp16, b Here is a simple example to chat with a bot based on a LLM in LLamaSharp. Please replace the model path with yours. ```cs -using LLama.Common; using LLama; +using LLama.Common; +using LLama.Sampling; string modelPath = @""; // change it to your own model path. 
@@ -263,6 +265,8 @@ If you want to compile llama.cpp yourself you **must** use the exact commit ID l | v0.20.0 | | [`0827b2c1`](https://github.com/ggerganov/llama.cpp/tree/0827b2c1da299805288abbd556d869318f2b121e) | | v0.21.0 | [DeepSeek R1](https://huggingface.co/collections/unsloth/deepseek-r1-all-versions-678e1c48f5d2fce87892ace5) | [`5783575c`](https://github.com/ggerganov/llama.cpp/tree/5783575c9d99c4d9370495800663aa5397ceb0be) | | v0.22.0 | Gemma3 | [`be7c3034`](https://github.com/ggerganov/llama.cpp/tree/be7c3034108473beda214fd1d7c98fd6a7a3bdf5) | +| v0.23.0 | Gemma3 | [`be7c3034`](https://github.com/ggerganov/llama.cpp/tree/be7c3034108473beda214fd1d7c98fd6a7a3bdf5) | +| v0.24.0 | Qwen3 | [`ceda28ef`](https://github.com/ggerganov/llama.cpp/tree/ceda28ef8e310a8dee60bf275077a3eedae8e36c) | ## License diff --git a/llama.cpp b/llama.cpp index be7c30341..ceda28ef8 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit be7c3034108473beda214fd1d7c98fd6a7a3bdf5 +Subproject commit ceda28ef8e310a8dee60bf275077a3eedae8e36c
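For context on the README hunk above that adds `using LLama.Sampling;` to the quick-start, here is a hedged sketch of the shape that chat example takes. It follows LLamaSharp's published quick-start pattern rather than code contained in this patch; parameter values, the prompt, and the model path are placeholders.

```cs
using System;
using System.Collections.Generic;
using LLama;
using LLama.Common;
using LLama.Sampling;

string modelPath = @"<your model path>"; // change it to your own model path.

var parameters = new ModelParams(modelPath) { ContextSize = 1024, GpuLayerCount = 5 };
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);

var chatHistory = new ChatHistory();
chatHistory.AddMessage(AuthorRole.System, "You are a helpful assistant.");
var session = new ChatSession(executor, chatHistory);

// LLama.Sampling is what the added `using` in the README hunk brings in.
var inferenceParams = new InferenceParams
{
    MaxTokens = 256,
    AntiPrompts = new List<string> { "User:" },
    SamplingPipeline = new DefaultSamplingPipeline()
};

Console.Write("User: ");
var input = Console.ReadLine() ?? "";
await foreach (var text in session.ChatAsync(new ChatHistory.Message(AuthorRole.User, input), inferenceParams))
{
    Console.Write(text);
}
```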