diff --git a/examples/server.ps1 b/examples/server.ps1
index 60d8168..11e6107 100644
--- a/examples/server.ps1
+++ b/examples/server.ps1
@@ -297,39 +297,44 @@
 if ($numberOfGPULayers -lt 0) {
     $numberOfGPULayers = 0
 }
 
-# We are automatically using the self extending context window
-# on models that have a trained context window < context size.
-# https://arxiv.org/abs/2401.01325
-# https://github.com/ggerganov/llama.cpp/issues/4886#issuecomment-1890465266
-$groupAttentionFactor = 1
-$groupAttentionWidth = 512
+Write-Host "Starting llama.cpp server with custom options at http://127.0.0.1:${port}..." -ForegroundColor "Yellow"
 
-if ($contextSize -gt $modelContextLength) {
+$commandBinary = "${llamaCppPath}\build\bin\Release\llama-server"
+
+$commandArguments = @(
+    "--n-predict 1024",
+    "--port '${port}'",
+    "--model '${model}'",
+    "--alias '${alias}'",
+    "--ctx-size '${contextSize}'",
+    "--threads '${numberOfPhysicalCores}'",
+    "--n-gpu-layers '${numberOfGPULayers}'",
+    "--parallel '${parallel}'",
+    "--cache-type-k '${kvCacheDataType}'",
+    "--cache-type-v '${kvCacheDataType}'"
+)
 
-    Write-Host "Self extending context window from ${modelContextLength} to ${contextSize}..." -ForegroundColor "Yellow"
+if ($chatTemplate) {
+    $commandArguments += "--chat-template '${chatTemplate}'"
+}
 
-    $groupAttentionFactor = $contextSize / $modelContextLength
-    $groupAttentionWidth = $modelContextLength / 2
+if ($enableFlashAttention) {
+    $commandArguments += "--flash-attn"
 }
 
-Write-Host "Starting llama.cpp server with custom options at http://127.0.0.1:${port}..." -ForegroundColor "Yellow"
+if ($verbose) {
+    $commandArguments += "--verbose"
+}
+
+$commandArguments = $commandArguments | ForEach-Object {
+    if ($commandArguments.IndexOf($_) -ne $commandArguments.Count - 1) {
+        " $_ ```n"
+    } else {
+        " $_ `n"
+    }
+}
 
-$command = "${llamaCppPath}\build\bin\Release\llama-server ``
-    --n-predict 1024 ``
-    --port '${port}' ``
-    --model '${model}' ``
-    --alias '${alias}' ``
-    --ctx-size '${contextSize}' ``
-    --threads '${numberOfPhysicalCores}' ``
-    --n-gpu-layers '${numberOfGPULayers}' ``
-    --parallel '${parallel}' ``
-    --grp-attn-n '${groupAttentionFactor}' ``
-    --grp-attn-w '${groupAttentionWidth}' ``
-    --cache-type-k '${kvCacheDataType}' ``
-    --cache-type-v '${kvCacheDataType}' ``
-    $(if ($enableFlashAttention) {"--flash-attn"}) ``
-    $(if ($chatTemplate) {"--chat-template '${chatTemplate}'"}) ``
-    $(if ($verbose) {"--verbose"})"
+$command = $commandBinary + " ```n " + $commandArguments
 
 Write-Host $command -ForegroundColor "Green"
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 95bc82f..a89f75e 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 95bc82fbc0df6d48cf66c857a4dda3d044f45ca2
+Subproject commit a89f75e1b7b90cb2d4d4c52ca53ef9e9b466aa45