Feat: Create Speech [tts-1, tts-1-hd]

MacPaw · Nov 13, 2023 · dc2f24e · dc2f24e
1 parent 41b3515
commit dc2f24e
Show file tree

Hide file tree

Showing 4 changed files with 186 additions and 2 deletions.
diff --git a/Sources/OpenAI/OpenAI.swift b/Sources/OpenAI/OpenAI.swift
@@ -111,13 +111,20 @@ final public class OpenAI: OpenAIProtocol {
     /// Sends an audio translation query as a multipart form-data request to the audio translations endpoint.
     ///
     /// - Parameters:
     ///   - query: The translation query used as the multipart request body.
     ///   - completion: Receives the `AudioTranslationResult` on success or an `Error` on failure.
     public func audioTranslations(query: AudioTranslationQuery, completion: @escaping (Result<AudioTranslationResult, Error>) -> Void) {
         performRequest(request: MultipartFormDataRequest<AudioTranslationResult>(body: query, url: buildURL(path: .audioTranslations)), completion: completion)
     }
+
+    /// Requests generated speech audio for the given query.
+    ///
+    /// The query is sent as a JSON request to the audio speech endpoint and handled by `performSpeechRequest`.
+    ///
+    /// - Parameters:
+    ///   - query: The text-to-speech query used as the JSON request body.
+    ///   - completion: Receives the `AudioSpeechResult` on success or an `Error` on failure.
+    public func audioCreateSpeech(query: AudioSpeechQuery, completion: @escaping (Result<AudioSpeechResult, Error>) -> Void) {
+        let speechRequest = JSONRequest<AudioSpeechResult>(body: query, url: buildURL(path: .audioSpeech))
+        performSpeechRequest(request: speechRequest, completion: completion)
+    }
+
 }
 
 extension OpenAI {
 
     func performRequest<ResultType: Codable>(request: any URLRequestBuildable, completion: @escaping (Result<ResultType, Error>) -> Void) {
         do {
-            let request = try request.build(token: configuration.token, organizationIdentifier: configuration.organizationIdentifier, timeoutInterval: configuration.timeoutInterval)
+            let request = try request.build(token: configuration.token, 
+                                            organizationIdentifier: configuration.organizationIdentifier,
+                                            timeoutInterval: configuration.timeoutInterval)
             let task = session.dataTask(with: request) { data, _, error in
                 if let error = error {
                     completion(.failure(error))
@@ -153,7 +160,9 @@ extension OpenAI {
 
     func performSteamingRequest<ResultType: Codable>(request: any URLRequestBuildable, onResult: @escaping (Result<ResultType, Error>) -> Void, completion: ((Error?) -> Void)?) {
         do {
-            let request = try request.build(token: configuration.token, organizationIdentifier: configuration.organizationIdentifier, timeoutInterval: configuration.timeoutInterval)
+            let request = try request.build(token: configuration.token, 
+                                            organizationIdentifier: configuration.organizationIdentifier,
+                                            timeoutInterval: configuration.timeoutInterval)
             let session = StreamingSession<ResultType>(urlRequest: request)
             session.onReceiveContent = {_, object in
                 onResult(.success(object))
@@ -171,6 +180,40 @@ extension OpenAI {
             completion?(error)
         }
     }
+
+    /// Performs a text-to-speech request and delivers the response body as raw audio data.
+    ///
+    /// The response body is first checked for a JSON-encoded `APIErrorResponse`; if one decodes,
+    /// it is reported as a failure. Otherwise the bytes are wrapped, undecoded, in an `AudioSpeechResult`.
+    ///
+    /// - Parameters:
+    ///   - request: Builder for the URL request to send.
+    ///   - completion: Called once per request with the audio result, or with the transport error,
+    ///     `OpenAIError.emptyData`, the decoded API error, or the error thrown while building the request.
+    func performSpeechRequest(request: any URLRequestBuildable, completion: @escaping (Result<AudioSpeechResult, Error>) -> Void) {
+        do {
+            let request = try request.build(token: configuration.token,
+                                            organizationIdentifier: configuration.organizationIdentifier,
+                                            timeoutInterval: configuration.timeoutInterval)
+
+            let task = session.dataTask(with: request) { data, _, error in
+                if let error = error {
+                    completion(.failure(error))
+                    return
+                }
+                guard let data = data else {
+                    completion(.failure(OpenAIError.emptyData))
+                    return
+                }
+
+                // A failed call returns a JSON error payload instead of audio. Binary audio is not
+                // valid JSON, so a successful decode here reliably identifies an API error, and a
+                // decoding failure is the expected outcome for a good response.
+                if let apiError = try? JSONDecoder().decode(APIErrorResponse.self, from: data) {
+                    completion(.failure(apiError))
+                    return
+                }
+
+                completion(.success(AudioSpeechResult(audioData: data)))
+            }
+            task.resume()
+        } catch {
+            completion(.failure(error))
+        }
+    }
 }
 
 extension OpenAI {
@@ -194,6 +237,7 @@ extension APIPath {
     static let models = "/v1/models"
     static let moderations = "/v1/moderations"
 
+    static let audioSpeech = "/v1/audio/speech"
     static let audioTranscriptions = "/v1/audio/transcriptions"
     static let audioTranslations = "/v1/audio/translations"
 

diff --git a/Sources/OpenAI/Public/Models/AudioSpeechQuery.swift b/Sources/OpenAI/Public/Models/AudioSpeechQuery.swift
@@ -0,0 +1,78 @@
+//
+//  AudioSpeechQuery.swift
+//  
+//
+//  Created by Ihor Makhnyk on 13.11.2023.
+//
+
+import Foundation
+
+/// A query describing the text, voice, format and speed for a text-to-speech request.
+///
+/// Learn more: [OpenAI Speech – Documentation](https://platform.openai.com/docs/api-reference/audio/createSpeech)
+public struct AudioSpeechQuery: Codable, Equatable {
+
+    /// Encapsulates the voices available for audio generation.
+    ///
+    /// To get acquainted with each of the voices and listen to the samples visit:
+    /// [OpenAI Text-to-Speech – Voice Options](https://platform.openai.com/docs/guides/text-to-speech/voice-options)
+    public enum AudioSpeechVoice: String, Codable {
+        case alloy,
+             echo,
+             fable,
+             onyx,
+             nova,
+             shimmer
+    }
+
+    /// Encapsulates the response formats available for audio data.
+    ///
+    /// **Formats:**
+    /// -  mp3
+    /// -  opus
+    /// -  aac
+    /// -  flac
+    public enum AudioSpeechResponseFormat: String, Codable {
+        case mp3,
+             opus,
+             aac,
+             flac
+    }
+
+    /// One of the available TTS models: tts-1 or tts-1-hd
+    public let model: Model
+    /// The text to generate audio for. The maximum length is 4096 characters.
+    public let input: String?
+    /// The voice to use when generating the audio. Supported voices are alloy, echo, fable, onyx, nova, and shimmer.
+    public let voice: AudioSpeechVoice
+    /// The format of the generated audio. Supported formats are mp3, opus, aac, and flac.
+    public let response_format: AudioSpeechResponseFormat
+    /// The speed of the generated audio, stored as a string. A value between **0.25** and **4.0**. Default: **1.0**
+    public let speed: String?
+
+    /// Creates a speech query, normalizing the model and speed to supported values.
+    ///
+    /// - Parameters:
+    ///   - model: `tts-1` or `tts-1-hd`. `nil` or any other model falls back to `tts-1`.
+    ///   - input: The text to generate audio for.
+    ///   - voice: The voice to use when generating the audio.
+    ///   - response_format: The audio format to request. Defaults to `.mp3`.
+    ///   - speed: Playback speed between **0.25** and **4.0**. `nil` (or NaN) becomes **1.0**;
+    ///     out-of-range values are clamped to the closest bound.
+    public init(model: Model?,
+                input: String,
+                voice: AudioSpeechVoice,
+                response_format: AudioSpeechResponseFormat = .mp3,
+                speed: Double?) {
+
+        self.model = {
+            guard let model else { return .tts_1 }
+            let isSupportedModel = model == .tts_1 || model == .tts_1_hd
+            guard isSupportedModel else {
+                NSLog("[AudioSpeech] 'AudioSpeechQuery' must have a valid Text-To-Speech model, 'tts-1' or 'tts-1-hd'. Setting model to 'tts-1'.")
+                return .tts_1
+            }
+            return model
+        }()
+        self.input = input
+        self.voice = voice
+        self.speed = {
+            // NaN compares false against every bound, so treat it like a missing value.
+            guard let speed, !speed.isNaN else { return "1.0" }
+            let isSpeedOutOfBounds = speed < 0.25 || speed > 4.0
+            guard !isSpeedOutOfBounds else {
+                NSLog("[AudioSpeech] Speed value must be between 0.25 and 4.0. Setting value to closest valid.")
+                return speed < 0.25 ? "0.25" : "4.0"
+            }
+            return "\(speed)"
+        }()
+        self.response_format = response_format
+    }
+}
diff --git a/Sources/OpenAI/Public/Models/AudioSpeechResult.swift b/Sources/OpenAI/Public/Models/AudioSpeechResult.swift
@@ -0,0 +1,55 @@
+//
+//  AudioSpeechResult.swift
+//
+//
+//  Created by Ihor Makhnyk on 13.11.2023.
+//
+
+import Foundation
+import AVFoundation
+
+/// The result of a text-to-speech request, holding the generated audio bytes.
+public struct AudioSpeechResult {
+
+    /// Audio data for one of the following formats :`mp3`, `opus`, `aac`, `flac`
+    public let audioData: Data?
+
+    /// Saves the audio data to a file at a specified file path.
+    ///
+    /// - Parameters:
+    ///     - name: The name for the file.
+    ///     - format: The format of the audio data, as defined in **`AudioSpeechQuery.AudioSpeechResponseFormat`**.  For example: **`.mp3`**
+    ///     - path: The destination file path as an URL.
+    /// - Throws: Throws an NSError if there is no audio data, or the underlying error if writing the data to the specified file path fails.
+    public func saveAs(_ name: String, format: AudioSpeechQuery.AudioSpeechResponseFormat, to path: URL) throws {
+        guard let data = audioData else {
+            throw NSError(
+                // `bundleIdentifier` is nil for hosts without a bundle identifier (e.g. command-line
+                // tools), so fall back to a fixed domain instead of force-unwrapping and crashing.
+                domain: Bundle.main.bundleIdentifier ?? "OpenAI.AudioSpeechResult",
+                code: 1,
+                userInfo: [NSLocalizedDescriptionKey: "No audio data"]
+            )
+        }
+        let filename = "\(name).\(format.rawValue)"
+        let fileURL = path.appendingPathComponent(filename)
+        try data.write(to: fileURL)
+    }
+
+    /// Gets an `AVAudioPlayer` instance configured with the audio data.
+    ///
+    /// - Returns: An `AVAudioPlayer` instance or nil if there is no audio data or if there is issue initializing an `AVAudioPlayer`.
+    /// - Note: Import **AVFoundation**
+    public func getAudioPlayer() -> AVAudioPlayer? {
+        guard let data = audioData else {
+            NSLog("No audio data")
+            return nil
+        }
+
+        do {
+            let audioPlayer = try AVAudioPlayer(data: data)
+            return audioPlayer
+        } catch {
+            NSLog("Error initializing audio player: \(error)")
+            return nil
+        }
+    }
+
+}
diff --git a/Sources/OpenAI/Public/Models/Models/Models.swift b/Sources/OpenAI/Public/Models/Models/Models.swift
@@ -55,6 +55,13 @@ public extension Model {
     static let textDavinci_001 = "text-davinci-001"
     static let codeDavinciEdit_001 = "code-davinci-edit-001"
 
+    // Speech
+
+    /// The latest text to speech model, optimized for speed.
+    static let tts_1 = "tts-1"
+    /// The latest text to speech model, optimized for quality.
+    static let tts_1_hd = "tts-1-hd"
+
     // Transcriptions / Translations
 
     static let whisper_1 = "whisper-1"