From fe1501918778f4ce3a43d3c48e0f053a0bb9f189 Mon Sep 17 00:00:00 2001
From: Leon Nissen <50104433+LeonNissen@users.noreply.github.com>
Date: Mon, 3 Feb 2025 06:41:35 -0800
Subject: [PATCH] Bump mlx to version 2.21.2 (#94)

# Bumps mlx to version 2.21.2

## :recycle: Current situation & Problem

The MLX library is currently pinned to version 1.18.1 and can be bumped to 2.x to support newer models.

## :gear: Release Notes

Updates the MLX library to 2.21.2 and migrates the code for breaking changes.

## :pencil: Code of Conduct & Contributing Guidelines

By submitting this pull request, you agree to follow our [Code of Conduct](https://github.com/StanfordSpezi/.github/blob/main/CODE_OF_CONDUCT.md) and [Contributing Guidelines](https://github.com/StanfordSpezi/.github/blob/main/CONTRIBUTING.md):
- [X] I agree to follow the [Code of Conduct](https://github.com/StanfordSpezi/.github/blob/main/CODE_OF_CONDUCT.md) and [Contributing Guidelines](https://github.com/StanfordSpezi/.github/blob/main/CONTRIBUTING.md).

---------

Co-authored-by: Leon Nissen <>
Co-authored-by: Vishnu Ravi
Co-authored-by: Paul Schmiedmayer
---
 .linkspector.yml                                  |  14 ++
 Package.swift                                     |  11 +-
 README.md                                         |   4 +-
 Sources/SpeziLLMLocal/LLMLocalSchema.swift        |   1 +
 .../LLMLocalSession+Generate.swift                | 166 ++++++++++--------
 .../SpeziLLMLocal/LLMLocalSession+Setup.swift     |   7 +-
 Sources/SpeziLLMLocal/LLMLocalSession.swift       |   3 +-
 .../SpeziLLMLocal.docc/SpeziLLMLocal.md           |   2 +-
 8 files changed, 125 insertions(+), 83 deletions(-)
 create mode 100644 .linkspector.yml

diff --git a/.linkspector.yml b/.linkspector.yml
new file mode 100644
index 00000000..450bcbaf
--- /dev/null
+++ b/.linkspector.yml
@@ -0,0 +1,14 @@
+#
+# This source file is part of the Stanford Spezi open source project
+#
+# SPDX-FileCopyrightText: 2025 Stanford University and the project authors (see CONTRIBUTORS.md)
+#
+# SPDX-License-Identifier: MIT
+#
+dirs:
+  - .
+useGitIgnore: true
+ignorePatterns:
+  - pattern: '^https://platform.openai.com/docs/guides/.*$' # Causes false positives
+  - pattern: '^doc:.*$'
+  - pattern: '^http://localhost.*$'
\ No newline at end of file
diff --git a/Package.swift b/Package.swift
index bd3afa8b..8736ce24 100644
--- a/Package.swift
+++ b/Package.swift
@@ -28,14 +28,14 @@ let package = Package(
     ],
     dependencies: [
         .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.21.2")),
-        .package(url: "https://github.com/ml-explore/mlx-swift-examples", exact: "1.18.1"), // Pin MLX Swift Examples as it doesn't follow semantic versioning
+        .package(url: "https://github.com/ml-explore/mlx-swift-examples", exact: "2.21.2"), // Pin MLX Swift Examples as it doesn't follow semantic versioning
         .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.14")),
         .package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.9")),
         .package(url: "https://github.com/StanfordSpezi/Spezi", from: "1.2.1"),
         .package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "2.0.0"),
         .package(url: "https://github.com/StanfordSpezi/SpeziStorage", from: "1.0.2"),
         .package(url: "https://github.com/StanfordSpezi/SpeziOnboarding", from: "1.1.1"),
-        .package(url: "https://github.com/StanfordSpezi/SpeziChat", .upToNextMinor(from: "0.2.1")),
+        .package(url: "https://github.com/StanfordSpezi/SpeziChat", .upToNextMinor(from: "0.2.3")),
         .package(url: "https://github.com/StanfordSpezi/SpeziViews", from: "1.3.1")
     ],
     targets: [
@@ -54,12 +54,9 @@ let package = Package(
             .product(name: "SpeziFoundation", package: "SpeziFoundation"),
             .product(name: "Spezi", package: "Spezi"),
             .product(name: "MLX", package: "mlx-swift"),
-            .product(name: "MLXFast", package: "mlx-swift"),
-            .product(name: "MLXNN", package: "mlx-swift"),
-            .product(name: "MLXOptimizers", package: "mlx-swift"),
             .product(name: "MLXRandom", package: "mlx-swift"),
             .product(name: "Transformers", package: "swift-transformers"),
-            .product(name: "LLM", package: "mlx-swift-examples")
+            .product(name: "MLXLLM", package: "mlx-swift-examples")
         ]
     ),
     .target(
@@ -68,7 +65,7 @@ let package = Package(
             .product(name: "SpeziOnboarding", package: "SpeziOnboarding"),
             .product(name: "SpeziViews", package: "SpeziViews"),
             .target(name: "SpeziLLMLocal"),
-            .product(name: "LLM", package: "mlx-swift-examples")
+            .product(name: "MLXLLM", package: "mlx-swift-examples")
         ]
     ),
     .target(
diff --git a/README.md b/README.md
index 2de78184..12c7ab0d 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ The target enables developers to easily execute medium-size Language Models (LLM
 > Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that.
 
 > [!IMPORTANT]
-> Important: To use the LLM local target, some LLMs require adding the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to the project.
+> Important: To use the LLM local target, some LLMs require adding the *Increase Memory Limit* entitlement to the project.
 
 #### Setup
 
@@ -147,7 +147,7 @@ class LLMOpenAIAppDelegate: SpeziAppDelegate {
 ```
 
 > [!IMPORTANT]
-> If using `SpeziLLMOpenAI` on macOS, ensure to add the [`Keychain Access Groups` entitlement](https://developer.apple.com/documentation/bundleresources/entitlements/keychain-access-groups) to the enclosing Xcode project via *PROJECT_NAME > Signing&Capabilities > + Capability*. The array of keychain groups can be left empty, only the base entitlement is required.
+> If using `SpeziLLMOpenAI` on macOS, ensure to add the *`Keychain Access Groups` entitlement* to the enclosing Xcode project via *PROJECT_NAME > Signing&Capabilities > + Capability*. The array of keychain groups can be left empty, only the base entitlement is required.
 
 #### Usage
 
diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift
index a17b2605..32b50c2d 100644
--- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift
+++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift
@@ -8,6 +8,7 @@
 
 import Foundation
 import MLXLLM
+import MLXLMCommon
 import SpeziChat
 import SpeziLLM
diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift
index 4182637e..d6358ce7 100644
--- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift
+++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift
@@ -9,6 +9,7 @@
 import Foundation
 import MLX
 import MLXLLM
+import MLXLMCommon
 import MLXRandom
 import os
 import SpeziChat
@@ -16,16 +17,24 @@ import SpeziLLM
 
 
 extension LLMLocalSession {
-    // swiftlint:disable:next identifier_name function_body_length
+    private var generationParameters: GenerateParameters {
+        .init(
+            temperature: schema.samplingParameters.temperature,
+            topP: schema.samplingParameters.topP,
+            repetitionPenalty: schema.samplingParameters.penaltyRepeat,
+            repetitionContextSize: schema.samplingParameters.repetitionContextSize
+        )
+    }
+    
+    // swiftlint:disable:next identifier_name
     internal func _generate(continuation: AsyncThrowingStream<String, any Error>.Continuation) async {
 #if targetEnvironment(simulator)
-        // swiftlint:disable:next return_value_from_void_function
-        return await _mockGenerate(continuation: continuation)
+        await _mockGenerate(continuation: continuation)
+        return
 #endif
 
         guard let modelContainer = await self.modelContainer else {
-            Self.logger.error("SpeziLLMLocal: Failed to load `modelContainer`")
-            await finishGenerationWithError(LLMLocalError.modelNotFound, on: continuation)
+            await handleError("Failed to load `modelContainer`", error: .modelNotFound, continuation: continuation)
             return
         }
 
@@ -35,15 +44,8 @@ extension LLMLocalSession {
             await self.context.formattedChat
         }
 
-        guard let promptTokens = try? await modelContainer.perform({ _, tokenizer in
-            if let chatTempalte = self.schema.parameters.chatTemplate {
-                return try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTempalte)
-            } else {
-                return try tokenizer.applyChatTemplate(messages: messages)
-            }
-        }) else {
-            Self.logger.error("SpeziLLMLocal: Failed to format chat with given context")
-            await finishGenerationWithError(LLMLocalError.illegalContext, on: continuation)
+        guard let modelInput: LMInput = try? await prepareModelInput(messages: messages, modelContainer: modelContainer) else {
+            await handleError("Failed to format chat with given context", error: .illegalContext, continuation: continuation)
             return
         }
 
@@ -53,78 +55,104 @@ extension LLMLocalSession {
             return
         }
 
-        let parameters: GenerateParameters = .init(
-            temperature: schema.samplingParameters.temperature,
-            topP: schema.samplingParameters.topP,
-            repetitionPenalty: schema.samplingParameters.penaltyRepeat,
-            repetitionContextSize: schema.samplingParameters.repetitionContextSize
-        )
-        
-        // swiftlint:disable:next closure_body_length
-        let result = await modelContainer.perform { model, tokenizer in
-            let result = MLXLLM.generate(
-                promptTokens: promptTokens,
-                parameters: parameters,
-                model: model,
-                tokenizer: tokenizer,
-                extraEOSTokens: schema.parameters.extraEOSTokens
-            ) { tokens in
-                if Task.isCancelled {
-                    return .stop
-                }
-                
-                if tokens.count >= self.schema.parameters.maxOutputLength {
-                    Self.logger.debug("SpeziLLMLocal: Max output length exceeded.")
-                    return .stop
+        do {
+            let result = try await modelContainer.perform { modelContext in
+                let result = try MLXLMCommon.generate(
+                    input: modelInput,
+                    parameters: generationParameters,
+                    context: modelContext
+                ) { tokens in
+                    processTokens(tokens, modelContext: modelContext, continuation: continuation)
                 }
 
-                if tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) {
-                    let lastTokens = Array(tokens.suffix(schema.parameters.displayEveryNTokens))
-                    let text = tokenizer.decode(tokens: lastTokens)
-                    
-                    Self.logger.debug("SpeziLLMLocal: Yielded token: \(text, privacy: .public)")
-                    continuation.yield(text)
-                    
-                    if schema.injectIntoContext {
-                        Task { @MainActor in
-                            context.append(assistantOutput: text)
-                        }
-                    }
-                }
-                
-                return .more
+                processRemainingTokens(result: result, modelContext: modelContext, continuation: continuation)
+                return result
             }
-            // Yielding every Nth token may result in missing the final tokens.
-            let reaminingTokens = result.tokens.count % schema.parameters.displayEveryNTokens
-            let lastTokens = Array(result.tokens.suffix(reaminingTokens))
-            let text = tokenizer.decode(tokens: lastTokens)
+            Self.logger.debug(
+                """
+                SpeziLLMLocal:
+                Prompt Tokens per second: \(result.promptTokensPerSecond, privacy: .public)
+                Generation tokens per second: \(result.tokensPerSecond, privacy: .public)
+                """
+            )
+            
+            await MainActor.run {
+                continuation.finish()
+                state = .ready
+            }
+        } catch {
+            await handleError("Generation ended with error: \(error)", error: .generationError, continuation: continuation)
+            return
+        }
+    }
+    
+    private func prepareModelInput(messages: [[String: String]], modelContainer: ModelContainer) async throws -> LMInput {
+        try await modelContainer.perform { modelContext in
+            if let chatTemplate = self.schema.parameters.chatTemplate {
+                let tokens = try modelContext.tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate)
+                return LMInput(text: .init(tokens: MLXArray(tokens)))
+            } else {
+                return try await modelContext.processor.prepare(input: .init(messages: messages))
+            }
+        }
+    }
+    
+    private func processTokens(
+        _ tokens: [Int],
+        modelContext: ModelContext,
+        continuation: AsyncThrowingStream<String, any Error>.Continuation
+    ) -> GenerateDisposition {
+        if Task.isCancelled {
+            return .stop
+        }
+        
+        if tokens.count >= self.schema.parameters.maxOutputLength {
+            Self.logger.debug("SpeziLLMLocal: Max output length exceeded.")
+            return .stop
+        }
+        
+        if tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) {
+            let lastTokens = Array(tokens.suffix(schema.parameters.displayEveryNTokens))
+            let text = modelContext.tokenizer.decode(tokens: lastTokens)
+            
+            Self.logger.debug("SpeziLLMLocal: Yielded token: \(text, privacy: .public)")
             continuation.yield(text)
 
             if schema.injectIntoContext {
                 Task { @MainActor in
                     context.append(assistantOutput: text)
-                    context.completeAssistantStreaming()
                 }
             }
-            
-            return result
         }
-        Self.logger.debug(
-            """
-            SpeziLLMLocal:
-            Prompt Tokens per second: \(result.promptTokensPerSecond, privacy: .public)
-            Generation tokens per second: \(result.tokensPerSecond, privacy: .public)
-            """
-        )
+        return .more
+    }
+    
+    private func processRemainingTokens(
+        result: GenerateResult,
+        modelContext: ModelContext,
+        continuation: AsyncThrowingStream<String, any Error>.Continuation
+    ) {
+        // Yielding every Nth token may result in missing the final tokens.
+        let remainingTokens = result.tokens.count % schema.parameters.displayEveryNTokens
+        let lastTokens = Array(result.tokens.suffix(remainingTokens))
+        let text = modelContext.tokenizer.decode(tokens: lastTokens)
         continuation.yield(text)
 
-        await MainActor.run {
-            continuation.finish()
-            state = .ready
+        if schema.injectIntoContext {
+            Task { @MainActor in
+                context.append(assistantOutput: text)
+                context.completeAssistantStreaming()
+            }
         }
     }
 
+    private func handleError(_ message: String, error: LLMLocalError, continuation: AsyncThrowingStream<String, any Error>.Continuation) async {
+        Self.logger.error("SpeziLLMLocal: \(message)")
+        await finishGenerationWithError(error, on: continuation)
+    }
+    
     private func _mockGenerate(continuation: AsyncThrowingStream<String, any Error>.Continuation) async {
         let tokens = [
             "Mock ", "Message ", "from ", "SpeziLLM! ",
diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift
index 91369905..ce35a15d 100644
--- a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift
+++ b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift
@@ -9,6 +9,7 @@
 import Foundation
 import Hub
 import MLXLLM
+import MLXLMCommon
 
 
 extension LLMLocalSession {
@@ -46,10 +47,10 @@ extension LLMLocalSession {
         }
 
         do {
-            let modelContainer = try await loadModelContainer(configuration: self.schema.configuration)
+            let modelContainer = try await LLMModelFactory.shared.loadContainer(configuration: self.schema.configuration)
 
-            let numParams = await modelContainer.perform { [] model, _ in
-                model.numParameters()
+            let numParams = await modelContainer.perform { modelContext in
+                modelContext.model.numParameters()
             }
 
             await MainActor.run {
diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift
index 93bab019..a314dcc3 100644
--- a/Sources/SpeziLLMLocal/LLMLocalSession.swift
+++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift
@@ -10,6 +10,7 @@
 import Foundation
 import MLX
 import MLXLLM
+import MLXLMCommon
 import MLXRandom
 import os
 import SpeziChat
@@ -83,7 +84,7 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable {
     @MainActor public var customContext: [[String: String]] = []
 
     @MainActor public var numParameters: Int?
-    @MainActor public var modelConfiguration: ModelConfiguration?
+    @MainActor public var modelConfiguration: ModelRegistry?
     @MainActor public var modelContainer: ModelContainer?
 
diff --git a/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md b/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md
index 641f8b3f..8e09c4e3 100644
--- a/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md
+++ b/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md
@@ -29,7 +29,7 @@ You need to add the SpeziLLM Swift package to
 
 > Important: Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that.
 
-> Important: To use the LLM local target, some LLMs require adding the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to the project.
+> Important: To use the LLM local target, some LLMs require adding the *Increase Memory Limit* entitlement to the project.
 
 ## Spezi LLM Local Components