From fe1501918778f4ce3a43d3c48e0f053a0bb9f189 Mon Sep 17 00:00:00 2001
From: Leon Nissen <50104433+LeonNissen@users.noreply.github.com>
Date: Mon, 3 Feb 2025 06:41:35 -0800
Subject: [PATCH] Bump mlx to version 2.21.2 (#94)

# Bumps mlx to version 2.21.2

## :recycle: Current situation & Problem

The MLX library is currently pinned to version 1.18.1 and can be bumped to 2.x to support newer models.

## :gear: Release Notes

Updates the MLX library to 2.21.2 and migrates the code for breaking changes.

## :pencil: Code of Conduct & Contributing Guidelines

By submitting this pull request, you agree to follow our [Code of Conduct](https://github.com/StanfordSpezi/.github/blob/main/CODE_OF_CONDUCT.md) and [Contributing Guidelines](https://github.com/StanfordSpezi/.github/blob/main/CONTRIBUTING.md):
- [X] I agree to follow the [Code of Conduct](https://github.com/StanfordSpezi/.github/blob/main/CODE_OF_CONDUCT.md) and [Contributing Guidelines](https://github.com/StanfordSpezi/.github/blob/main/CONTRIBUTING.md).

---------

Co-authored-by: Leon Nissen <>
Co-authored-by: Vishnu Ravi
Co-authored-by: Paul Schmiedmayer
---
 .linkspector.yml                                  |  14 ++
 Package.swift                                     |  11 +-
 README.md                                         |   4 +-
 Sources/SpeziLLMLocal/LLMLocalSchema.swift        |   1 +
 .../LLMLocalSession+Generate.swift                | 166 ++++++++++--------
 .../SpeziLLMLocal/LLMLocalSession+Setup.swift     |   7 +-
 Sources/SpeziLLMLocal/LLMLocalSession.swift       |   3 +-
 .../SpeziLLMLocal.docc/SpeziLLMLocal.md           |   2 +-
 8 files changed, 125 insertions(+), 83 deletions(-)
 create mode 100644 .linkspector.yml

diff --git a/.linkspector.yml b/.linkspector.yml
new file mode 100644
index 00000000..450bcbaf
--- /dev/null
+++ b/.linkspector.yml
@@ -0,0 +1,14 @@
+#
+# This source file is part of the Stanford Spezi open source project
+#
+# SPDX-FileCopyrightText: 2025 Stanford University and the project authors (see CONTRIBUTORS.md)
+#
+# SPDX-License-Identifier: MIT
+#
+dirs:
+  - .
+useGitIgnore: true
+ignorePatterns:
+  - pattern: '^https://platform.openai.com/docs/guides/.*$' # Causes false positives
+  - pattern: '^doc:.*$'
+  - pattern: '^http://localhost.*$'
\ No newline at end of file
diff --git a/Package.swift b/Package.swift
index bd3afa8b..8736ce24 100644
--- a/Package.swift
+++ b/Package.swift
@@ -28,14 +28,14 @@ let package = Package(
     ],
     dependencies: [
         .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.21.2")),
-        .package(url: "https://github.com/ml-explore/mlx-swift-examples", exact: "1.18.1"), // Pin MLX Swift Examples as it doesn't follow semantic versioning
+        .package(url: "https://github.com/ml-explore/mlx-swift-examples", exact: "2.21.2"), // Pin MLX Swift Examples as it doesn't follow semantic versioning
         .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.14")),
         .package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.9")),
         .package(url: "https://github.com/StanfordSpezi/Spezi", from: "1.2.1"),
         .package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "2.0.0"),
         .package(url: "https://github.com/StanfordSpezi/SpeziStorage", from: "1.0.2"),
         .package(url: "https://github.com/StanfordSpezi/SpeziOnboarding", from: "1.1.1"),
-        .package(url: "https://github.com/StanfordSpezi/SpeziChat", .upToNextMinor(from: "0.2.1")),
+        .package(url: "https://github.com/StanfordSpezi/SpeziChat", .upToNextMinor(from: "0.2.3")),
         .package(url: "https://github.com/StanfordSpezi/SpeziViews", from: "1.3.1")
     ],
     targets: [
@@ -54,12 +54,9 @@ let package = Package(
             .product(name: "SpeziFoundation", package: "SpeziFoundation"),
             .product(name: "Spezi", package: "Spezi"),
             .product(name: "MLX", package: "mlx-swift"),
-            .product(name: "MLXFast", package: "mlx-swift"),
-            .product(name: "MLXNN", package: "mlx-swift"),
-            .product(name: "MLXOptimizers", package: "mlx-swift"),
             .product(name: "MLXRandom", package: "mlx-swift"),
             .product(name: "Transformers", package: "swift-transformers"),
-            .product(name: "LLM", package: "mlx-swift-examples")
+            .product(name: "MLXLLM", package: "mlx-swift-examples")
         ]
     ),
     .target(
@@ -68,7 +65,7 @@ let package = Package(
             .product(name: "SpeziOnboarding", package: "SpeziOnboarding"),
             .product(name: "SpeziViews", package: "SpeziViews"),
             .target(name: "SpeziLLMLocal"),
-            .product(name: "LLM", package: "mlx-swift-examples")
+            .product(name: "MLXLLM", package: "mlx-swift-examples")
         ]
     ),
     .target(
diff --git a/README.md b/README.md
index 2de78184..12c7ab0d 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ The target enables developers to easily execute medium-size Language Models (LLM
 > Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that.
 
 > [!IMPORTANT]
-> Important: To use the LLM local target, some LLMs require adding the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to the project.
+> Important: To use the LLM local target, some LLMs require adding the *Increase Memory Limit* entitlement to the project.
 
 #### Setup
 
@@ -147,7 +147,7 @@ class LLMOpenAIAppDelegate: SpeziAppDelegate {
 ```
 
 > [!IMPORTANT]
-> If using `SpeziLLMOpenAI` on macOS, ensure to add the [`Keychain Access Groups` entitlement](https://developer.apple.com/documentation/bundleresources/entitlements/keychain-access-groups) to the enclosing Xcode project via *PROJECT_NAME > Signing&Capabilities > + Capability*. The array of keychain groups can be left empty, only the base entitlement is required.
+> If using `SpeziLLMOpenAI` on macOS, ensure to add the *`Keychain Access Groups` entitlement* to the enclosing Xcode project via *PROJECT_NAME > Signing&Capabilities > + Capability*. The array of keychain groups can be left empty, only the base entitlement is required.
 
 #### Usage
 
diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift
index a17b2605..32b50c2d 100644
--- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift
+++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift
@@ -8,6 +8,7 @@
 
 import Foundation
 import MLXLLM
+import MLXLMCommon
 import SpeziChat
 import SpeziLLM
diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift
index 4182637e..d6358ce7 100644
--- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift
+++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift
@@ -9,6 +9,7 @@
 import Foundation
 import MLX
 import MLXLLM
+import MLXLMCommon
 import MLXRandom
 import os
 import SpeziChat
@@ -16,16 +17,24 @@ import SpeziLLM
 
 
 extension LLMLocalSession {
-    // swiftlint:disable:next identifier_name function_body_length
+    private var generationParameters: GenerateParameters {
+        .init(
+            temperature: schema.samplingParameters.temperature,
+            topP: schema.samplingParameters.topP,
+            repetitionPenalty: schema.samplingParameters.penaltyRepeat,
+            repetitionContextSize: schema.samplingParameters.repetitionContextSize
+        )
+    }
+    
+    // swiftlint:disable:next identifier_name
     internal func _generate(continuation: AsyncThrowingStream<String, any Error>.Continuation) async {
 #if targetEnvironment(simulator)
-        // swiftlint:disable:next return_value_from_void_function
-        return await _mockGenerate(continuation: continuation)
+        await _mockGenerate(continuation: continuation)
+        return
 #endif
 
         guard let modelContainer = await self.modelContainer else {
-            Self.logger.error("SpeziLLMLocal: Failed to load `modelContainer`")
-            await finishGenerationWithError(LLMLocalError.modelNotFound, on: continuation)
+            await handleError("Failed to load `modelContainer`", error: .modelNotFound, continuation: continuation)
             return
         }
 
@@ -35,15 +44,8 @@ extension LLMLocalSession {
             await self.context.formattedChat
         }
 
-        guard let promptTokens = try? await modelContainer.perform({ _, tokenizer in
-            if let chatTempalte = self.schema.parameters.chatTemplate {
-                return try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTempalte)
-            } else {
-                return try tokenizer.applyChatTemplate(messages: messages)
-            }
-        }) else {
-            Self.logger.error("SpeziLLMLocal: Failed to format chat with given context")
-            await finishGenerationWithError(LLMLocalError.illegalContext, on: continuation)
+        guard let modelInput: LMInput = try? await prepareModelInput(messages: messages, modelContainer: modelContainer) else {
+            await handleError("Failed to format chat with given context", error: .illegalContext, continuation: continuation)
             return
         }
 
@@ -53,78 +55,104 @@ extension LLMLocalSession {
             return
         }
 
-        let parameters: GenerateParameters = .init(
-            temperature: schema.samplingParameters.temperature,
-            topP: schema.samplingParameters.topP,
-            repetitionPenalty: schema.samplingParameters.penaltyRepeat,
-            repetitionContextSize: schema.samplingParameters.repetitionContextSize
-        )
-        
-        // swiftlint:disable:next closure_body_length
-        let result = await modelContainer.perform { model, tokenizer in
-            let result = MLXLLM.generate(
-                promptTokens: promptTokens,
-                parameters: parameters,
-                model: model,
-                tokenizer: tokenizer,
-                extraEOSTokens: schema.parameters.extraEOSTokens
-            ) { tokens in
-                if Task.isCancelled {
-                    return .stop
-                }
-                
-                if tokens.count >= self.schema.parameters.maxOutputLength {
-                    Self.logger.debug("SpeziLLMLocal: Max output length exceeded.")
-                    return .stop
+        do {
+            let result = try await modelContainer.perform { modelContext in
+                let result = try MLXLMCommon.generate(
+                    input: modelInput,
+                    parameters: generationParameters,
+                    context: modelContext
+                ) { tokens in
+                    processTokens(tokens, modelContext: modelContext, continuation: continuation)
                 }
 
-                if tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) {
-                    let lastTokens = Array(tokens.suffix(schema.parameters.displayEveryNTokens))
-                    let text = tokenizer.decode(tokens: lastTokens)
-                    
-                    Self.logger.debug("SpeziLLMLocal: Yielded token: \(text, privacy: .public)")
-                    continuation.yield(text)
-                    
-                    if schema.injectIntoContext {
-                        Task { @MainActor in
-                            context.append(assistantOutput: text)
-                        }
-                    }
-                }
-                
-                return .more
+                processRemainingTokens(result: result, modelContext: modelContext, continuation: continuation)
+                return result
             }
-            // Yielding every Nth token may result in missing the final tokens.
-            let reaminingTokens = result.tokens.count % schema.parameters.displayEveryNTokens
-            let lastTokens = Array(result.tokens.suffix(reaminingTokens))
-            let text = tokenizer.decode(tokens: lastTokens)
+            Self.logger.debug(
+                """
+                SpeziLLMLocal:
+                Prompt Tokens per second: \(result.promptTokensPerSecond, privacy: .public)
+                Generation tokens per second: \(result.tokensPerSecond, privacy: .public)
+                """
+            )
+            
+            await MainActor.run {
+                continuation.finish()
+                state = .ready
+            }
+        } catch {
+            await handleError("Generation ended with error: \(error)", error: .generationError, continuation: continuation)
+            return
+        }
+    }
+    
+    private func prepareModelInput(messages: [[String: String]], modelContainer: ModelContainer) async throws -> LMInput {
+        try await modelContainer.perform { modelContext in
+            if let chatTemplate = self.schema.parameters.chatTemplate {
+                let tokens = try modelContext.tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate)
+                return LMInput(text: .init(tokens: MLXArray(tokens)))
+            } else {
+                return try await modelContext.processor.prepare(input: .init(messages: messages))
+            }
+        }
+    }
+    
+    private func processTokens(
+        _ tokens: [Int],
+        modelContext: ModelContext,
+        continuation: AsyncThrowingStream<String, any Error>.Continuation
+    ) -> GenerateDisposition {
+        if Task.isCancelled {
+            return .stop
+        }
+        
+        if tokens.count >= self.schema.parameters.maxOutputLength {
+            Self.logger.debug("SpeziLLMLocal: Max output length exceeded.")
+            return .stop
+        }
+        
+        if tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) {
+            let lastTokens = Array(tokens.suffix(schema.parameters.displayEveryNTokens))
+            let text = modelContext.tokenizer.decode(tokens: lastTokens)
+            
+            Self.logger.debug("SpeziLLMLocal: Yielded token: \(text, privacy: .public)")
             continuation.yield(text)
 
             if schema.injectIntoContext {
                 Task { @MainActor in
                     context.append(assistantOutput: text)
-                    context.completeAssistantStreaming()
                 }
             }
-            
-            return result
         }
-        Self.logger.debug(
-            """
-            SpeziLLMLocal:
-            Prompt Tokens per second: \(result.promptTokensPerSecond, privacy: .public)
-            Generation tokens per second: \(result.tokensPerSecond, privacy: .public)
-            """
-        )
+        return .more
+    }
+    
+    private func processRemainingTokens(
+        result: GenerateResult,
+        modelContext: ModelContext,
+        continuation: AsyncThrowingStream<String, any Error>.Continuation
+    ) {
+        // Yielding every Nth token may result in missing the final tokens.
+        let remainingTokens = result.tokens.count % schema.parameters.displayEveryNTokens
+        let lastTokens = Array(result.tokens.suffix(remainingTokens))
+        let text = modelContext.tokenizer.decode(tokens: lastTokens)
         continuation.yield(text)
 
-        await MainActor.run {
-            continuation.finish()
-            state = .ready
+        if schema.injectIntoContext {
+            Task { @MainActor in
+                context.append(assistantOutput: text)
+                context.completeAssistantStreaming()
+            }
         }
     }
 
+    private func handleError(_ message: String, error: LLMLocalError, continuation: AsyncThrowingStream<String, any Error>.Continuation) async {
+        Self.logger.error("SpeziLLMLocal: \(message)")
+        await finishGenerationWithError(error, on: continuation)
+    }
+    
     private func _mockGenerate(continuation: AsyncThrowingStream<String, any Error>.Continuation) async {
         let tokens = [
             "Mock ", "Message ", "from ", "SpeziLLM! ",
diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift
index 91369905..ce35a15d 100644
--- a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift
+++ b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift
@@ -9,6 +9,7 @@
 import Foundation
 import Hub
 import MLXLLM
+import MLXLMCommon
 
 
 extension LLMLocalSession {
@@ -46,10 +47,10 @@ extension LLMLocalSession {
         }
 
         do {
-            let modelContainer = try await loadModelContainer(configuration: self.schema.configuration)
+            let modelContainer = try await LLMModelFactory.shared.loadContainer(configuration: self.schema.configuration)
 
-            let numParams = await modelContainer.perform { [] model, _ in
-                model.numParameters()
+            let numParams = await modelContainer.perform { modelContext in
+                modelContext.model.numParameters()
             }
 
             await MainActor.run {
diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift
index 93bab019..a314dcc3 100644
--- a/Sources/SpeziLLMLocal/LLMLocalSession.swift
+++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift
@@ -10,6 +10,7 @@
 import Foundation
 import MLX
 import MLXLLM
+import MLXLMCommon
 import MLXRandom
 import os
 import SpeziChat
@@ -83,7 +84,7 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable {
     @MainActor public var customContext: [[String: String]] = []
 
     @MainActor public var numParameters: Int?
-    @MainActor public var modelConfiguration: ModelConfiguration?
+    @MainActor public var modelConfiguration: ModelRegistry?
     @MainActor public var modelContainer: ModelContainer?
 
diff --git a/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md b/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md
index 641f8b3f..8e09c4e3 100644
--- a/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md
+++ b/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md
@@ -29,7 +29,7 @@ You need to add the SpeziLLM Swift package to
 
 > Important: Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that.
 
-> Important: To use the LLM local target, some LLMs require adding the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to the project.
+> Important: To use the LLM local target, some LLMs require adding the *Increase Memory Limit* entitlement to the project.
 
 ## Spezi LLM Local Components