From 9b17032a6564c8bda4ab1749989a9797c670919c Mon Sep 17 00:00:00 2001 From: Dawood Khan Date: Tue, 17 Dec 2024 17:15:16 -0500 Subject: [PATCH] Add Microphone Input to MultimodalTextbox (#10186) * microphone * add changeset * undo css changes * notebook * css fix * fixes * add changeset * fixes * pr fixes * guides * format * ally ignore * type fix --------- Co-authored-by: gradio-pr-bot Co-authored-by: Abubakar Abid --- .changeset/fluffy-pots-clap.md | 7 + demo/chatbot_multimodal/run.ipynb | 2 +- demo/chatbot_multimodal/run.py | 1 + gradio/components/multimodal_textbox.py | 20 ++ .../05_chatbots/01_creating-a-chatbot-fast.md | 4 +- ...4_creating-a-custom-chatbot-with-blocks.md | 4 +- js/audio/interactive/InteractiveAudio.svelte | 29 +- js/multimodaltextbox/Index.svelte | 62 +++- .../MultimodalTextbox.stories.svelte | 10 + .../shared/MultimodalTextbox.svelte | 313 +++++++++++------- 10 files changed, 322 insertions(+), 130 deletions(-) create mode 100644 .changeset/fluffy-pots-clap.md diff --git a/.changeset/fluffy-pots-clap.md b/.changeset/fluffy-pots-clap.md new file mode 100644 index 0000000000000..5762915f2301a --- /dev/null +++ b/.changeset/fluffy-pots-clap.md @@ -0,0 +1,7 @@ +--- +"@gradio/audio": minor +"@gradio/multimodaltextbox": minor +"gradio": minor +--- + +feat:Add Microphone Input to MultimodalTextbox diff --git a/demo/chatbot_multimodal/run.ipynb b/demo/chatbot_multimodal/run.ipynb index 0299e0799d9e9..eca1ad4f41b0c 100644 --- a/demo/chatbot_multimodal/run.ipynb +++ b/demo/chatbot_multimodal/run.ipynb @@ -1 +1 @@ -{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). 
Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", " for x in message[\"files\"]:\n", " history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", " if message[\"text\"] is not None:\n", " history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", " return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", " response = \"**That's cool!**\"\n", " history.append({\"role\": \"assistant\", \"content\": \"\"})\n", " for character in response:\n", " history[-1][\"content\"] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", " chat_input = gr.MultimodalTextbox(\n", " interactive=True,\n", " file_count=\"multiple\",\n", " placeholder=\"Enter message or upload file...\",\n", " show_label=False,\n", " )\n", "\n", " chat_msg = chat_input.submit(\n", " add_message, [chatbot, chat_input], [chatbot, chat_input]\n", " )\n", " bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", " bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", " chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5} \ No newline at end of file +{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). 
Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", " for x in message[\"files\"]:\n", " history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", " if message[\"text\"] is not None:\n", " history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", " return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", " response = \"**That's cool!**\"\n", " history.append({\"role\": \"assistant\", \"content\": \"\"})\n", " for character in response:\n", " history[-1][\"content\"] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", " chat_input = gr.MultimodalTextbox(\n", " interactive=True,\n", " file_count=\"multiple\",\n", " placeholder=\"Enter message or upload file...\",\n", " show_label=False,\n", " sources=[\"microphone\", \"upload\"],\n", " )\n", "\n", " chat_msg = chat_input.submit(\n", " add_message, [chatbot, chat_input], [chatbot, chat_input]\n", " )\n", " bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", " bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", " chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5} \ No newline at end of file diff --git a/demo/chatbot_multimodal/run.py b/demo/chatbot_multimodal/run.py index ac0cb18805246..1d72e0e0cf1fb 100644 --- a/demo/chatbot_multimodal/run.py +++ b/demo/chatbot_multimodal/run.py @@ -33,6 +33,7 @@ def bot(history: list): file_count="multiple", placeholder="Enter message or upload file...", show_label=False, + sources=["microphone", "upload"], ) chat_msg = chat_input.submit( diff --git a/gradio/components/multimodal_textbox.py b/gradio/components/multimodal_textbox.py index 3eac76cd17862..bc5659be4b233 100644 --- a/gradio/components/multimodal_textbox.py +++ b/gradio/components/multimodal_textbox.py @@ -61,6 +61,9 @@ def __init__( self, value: str | dict[str, str | list] | Callable | None = None, *, + sources: list[Literal["upload", "microphone"]] + | Literal["upload", "microphone"] + | None = None, file_types: list[str] | None = None, file_count: Literal["single", "multiple", "directory"] = "single", lines: int = 1, @@ -91,6 +94,7 @@ def __init__( """ Parameters: value: Default value to show in MultimodalTextbox. A string value, or a dictionary of the form {"text": "sample text", "files": [{path: "files/file.jpg", orig_name: "file.jpg", url: "http://image_url.jpg", size: 100}]}. If callable, the function will be called whenever the app loads to set the initial value of the component. + sources: A list of sources permitted. "upload" creates a button where users can click to upload or drop files, "microphone" creates a microphone input. If None, defaults to ["upload"]. file_count: if single, allows user to upload one file. If "multiple", user uploads multiple files. If "directory", user uploads all files in selected directory. Return type will be list for each file in case of "multiple" or "directory". file_types: List of file extensions or types of files to be uploaded (e.g. ['image', '.json', '.mp4']). 
"file" allows any file to be uploaded, "image" allows only image files to be uploaded, "audio" allows only audio files to be uploaded, "video" allows only video files to be uploaded, "text" allows only text files to be uploaded. lines: minimum number of line rows to provide in textarea. @@ -118,6 +122,22 @@ def __init__( stop_btn: If True, will show a stop button (useful for streaming demos). If a string, will use that string as the stop button text. max_plain_text_length: Maximum length of plain text in the textbox. If the text exceeds this length, the text will be pasted as a file. Default is 1000. """ + valid_sources: list[Literal["upload", "microphone"]] = ["upload", "microphone"] + if sources is None: + self.sources = ["upload"] + elif isinstance(sources, str) and sources in valid_sources: + self.sources = [sources] + elif isinstance(sources, list): + self.sources = sources + else: + raise ValueError( + f"`sources` must be a list consisting of elements in {valid_sources}" + ) + for source in self.sources: + if source not in valid_sources: + raise ValueError( + f"`sources` must a list consisting of elements in {valid_sources}" + ) self.file_types = file_types self.file_count = file_count if file_types is not None and not isinstance(file_types, list): diff --git a/guides/05_chatbots/01_creating-a-chatbot-fast.md b/guides/05_chatbots/01_creating-a-chatbot-fast.md index e64d6a710458f..df663fe679a04 100644 --- a/guides/05_chatbots/01_creating-a-chatbot-fast.md +++ b/guides/05_chatbots/01_creating-a-chatbot-fast.md @@ -194,7 +194,7 @@ This second parameter of your chat function, `history`, will be in the same open The return type of your chat function does *not change* when setting `multimodal=True` (i.e. in the simplest case, you should still return a string value). We discuss more complex cases, e.g. returning files [below](#returning-complex-responses). -If you are customizing a multimodal chat interface, you should pass in an instance of `gr.MultimodalTextbox` to the `textbox` parameter. Here's an example that illustrates how to set up and customize and multimodal chat interface: +If you are customizing a multimodal chat interface, you should pass in an instance of `gr.MultimodalTextbox` to the `textbox` parameter. You can customize the `MultimodalTextbox` further by passing in the `sources` parameter, which is a list of sources to enable. Here's an example that illustrates how to set up and customize and multimodal chat interface: ```python @@ -215,7 +215,7 @@ demo = gr.ChatInterface( {"text": "No files", "files": []} ], multimodal=True, - textbox=gr.MultimodalTextbox(file_count="multiple", file_types=["image"]) + textbox=gr.MultimodalTextbox(file_count="multiple", file_types=["image"], sources=["upload", "microphone"]) ) demo.launch() diff --git a/guides/05_chatbots/04_creating-a-custom-chatbot-with-blocks.md b/guides/05_chatbots/04_creating-a-custom-chatbot-with-blocks.md index e8dff122eb110..07f0eeaa3f906 100644 --- a/guides/05_chatbots/04_creating-a-custom-chatbot-with-blocks.md +++ b/guides/05_chatbots/04_creating-a-custom-chatbot-with-blocks.md @@ -70,7 +70,7 @@ def bot(history): return history ``` -In addition, it can handle media files, such as images, audio, and video. You can use the `MultimodalTextbox` component to easily upload all types of media files to your chatbot. To pass in a media file, we must pass in the file a dictionary with a `path` key pointing to a local file and an `alt_text` key. 
The `alt_text` is optional, so you can also just pass in a tuple with a single element `{"path": "filepath"}`, like this:
+In addition, it can handle media files, such as images, audio, and video. You can use the `MultimodalTextbox` component to easily upload all types of media files to your chatbot. You can customize the `MultimodalTextbox` further by passing in the `sources` parameter, which is a list of sources to enable. To pass in a media file, we must pass in the file as a dictionary with a `path` key pointing to a local file and an `alt_text` key. The `alt_text` is optional, so you can also just pass in a dictionary with a single `path` key, `{"path": "filepath"}`, like this:
```python
def add_message(history, message):
@@ -78,7 +78,7 @@ def add_message(history, message):
history.append({"role": "user", "content": {"path": x}})
if message["text"] is not None:
history.append({"role": "user", "content": message["text"]})
- return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"])
+ return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"], sources=["upload", "microphone"])
```
Putting this together, we can create a _multimodal_ chatbot with a multimodal textbox for a user to submit text and media files. The rest of the code looks pretty much the same as before:
diff --git a/js/audio/interactive/InteractiveAudio.svelte b/js/audio/interactive/InteractiveAudio.svelte
index 4b37e26bed9e5..2dfcf2ca9c8b6 100644
--- a/js/audio/interactive/InteractiveAudio.svelte
+++ b/js/audio/interactive/InteractiveAudio.svelte
@@ -41,6 +41,7 @@
export let stream_every: number;
export let uploading = false;
export let recording = false;
+ export let class_name = "";
let time_limit: number | null = null;
let stream_state: "open" | "waiting" | "closed" = "closed";
@@ -246,7 +247,7 @@
float={active_source === "upload" && value === null}
label={label || i18n("audio.audio")}
/>
-
+
{#if value === null || streaming} {#if active_source === "microphone"} @@ -329,4 +330,30 @@ flex-direction: column; justify-content: space-between; } + + .audio-container.compact-audio { + margin-top: calc(var(--size-8) * -1); + height: auto; + padding: 0px; + gap: var(--size-2); + min-height: var(--size-5); + } + + .compact-audio :global(.audio-player) { + padding: 0px; + } + + .compact-audio :global(.controls) { + gap: 0px; + padding: 0px; + } + + .compact-audio :global(.waveform-container) { + height: var(--size-12) !important; + } + + .compact-audio :global(.player-container) { + min-height: unset; + height: auto; + } diff --git a/js/multimodaltextbox/Index.svelte b/js/multimodaltextbox/Index.svelte index b7aadede1d001..123132b67df36 100644 --- a/js/multimodaltextbox/Index.svelte +++ b/js/multimodaltextbox/Index.svelte @@ -12,6 +12,8 @@ import { StatusTracker } from "@gradio/statustracker"; import type { LoadingStatus } from "@gradio/statustracker"; import type { FileData } from "@gradio/client"; + import { onMount } from "svelte"; + import type { WaveformOptions } from "../audio/shared/types"; export let gradio: Gradio<{ change: typeof value; @@ -23,6 +25,11 @@ focus: never; error: string; clear_status: LoadingStatus; + start_recording: never; + pause_recording: never; + stop_recording: never; + upload: FileData[] | FileData; + clear: undefined; }>; export let elem_id = ""; export let elem_classes: string[] = []; @@ -38,7 +45,6 @@ export let info: string | undefined = undefined; export let show_label: boolean; export let max_lines: number; - export let container = true; export let scale: number | null = null; export let min_width: number | undefined = undefined; export let submit_btn: string | boolean | null = null; @@ -53,8 +59,52 @@ export let root: string; export let file_count: "single" | "multiple" | "directory"; export let max_plain_text_length: number; + export let sources: ["microphone" | "upload"] = ["upload"]; + export let waveform_options: WaveformOptions = {}; let dragging: boolean; + let active_source: "microphone" | null = null; + let waveform_settings: Record; + let color_accent = "darkorange"; + + onMount(() => { + color_accent = getComputedStyle(document?.documentElement).getPropertyValue( + "--color-accent" + ); + set_trim_region_colour(); + waveform_settings.waveColor = waveform_options.waveform_color || "#9ca3af"; + waveform_settings.progressColor = + waveform_options.waveform_progress_color || color_accent; + waveform_settings.mediaControls = waveform_options.show_controls; + waveform_settings.sampleRate = waveform_options.sample_rate || 44100; + }); + + $: waveform_settings = { + height: 50, + + barWidth: 2, + barGap: 3, + cursorWidth: 2, + cursorColor: "#ddd5e9", + autoplay: false, + barRadius: 10, + dragToSeek: true, + normalize: true, + minPxPerSec: 20 + }; + + const trim_region_settings = { + color: waveform_options.trim_region_color, + drag: true, + resize: true + }; + + function set_trim_region_colour(): void { + document.documentElement.style.setProperty( + "--trim-region-color", + trim_region_settings.color || color_accent + ); + } gradio.dispatch("change", value)} on:input={() => gradio.dispatch("input")} @@ -107,6 +160,11 @@ on:error={({ detail }) => { gradio.dispatch("error", detail); }} + on:start_recording={() => gradio.dispatch("start_recording")} + on:pause_recording={() => gradio.dispatch("pause_recording")} + on:stop_recording={() => gradio.dispatch("stop_recording")} + on:upload={(e) => gradio.dispatch("upload", e.detail)} + on:clear={() => 
gradio.dispatch("clear")} disabled={!interactive} upload={(...args) => gradio.client.upload(...args)} stream_handler={(...args) => gradio.client.stream(...args)} diff --git a/js/multimodaltextbox/MultimodalTextbox.stories.svelte b/js/multimodaltextbox/MultimodalTextbox.stories.svelte index d0739e566f3b8..e094cd5475a6b 100644 --- a/js/multimodaltextbox/MultimodalTextbox.stories.svelte +++ b/js/multimodaltextbox/MultimodalTextbox.stories.svelte @@ -42,6 +42,12 @@ description: "Whether to render right-to-left", control: { type: "boolean" }, defaultValue: false + }, + sources: { + options: ["upload", "microphone"], + description: "The sources to enable", + control: { type: "select" }, + defaultValue: ["upload", "microphone"] } }} /> @@ -87,3 +93,7 @@ } }} /> + diff --git a/js/multimodaltextbox/shared/MultimodalTextbox.svelte b/js/multimodaltextbox/shared/MultimodalTextbox.svelte index f3eb52648b746..1bf1066006b3c 100644 --- a/js/multimodaltextbox/shared/MultimodalTextbox.svelte +++ b/js/multimodaltextbox/shared/MultimodalTextbox.svelte @@ -10,7 +10,9 @@ import { BlockTitle } from "@gradio/atoms"; import { Upload } from "@gradio/upload"; import { Image } from "@gradio/image/shared"; + import type { I18nFormatter } from "js/core/src/gradio_helper"; import type { FileData, Client } from "@gradio/client"; + import type { WaveformOptions } from "../../audio/shared/types"; import { Clear, File, @@ -18,9 +20,11 @@ Paperclip, Video, Send, - Square + Square, + Microphone } from "@gradio/icons"; import type { SelectData } from "@gradio/utils"; + import InteractiveAudio from "../../audio/interactive/InteractiveAudio.svelte"; export let value: { text: string; files: FileData[] } = { text: "", @@ -29,12 +33,12 @@ export let value_is_output = false; export let lines = 1; + export let i18n: I18nFormatter; export let placeholder = "Type here..."; export let disabled = false; export let label: string; export let info: string | undefined = undefined; export let show_label = true; - export let container = true; export let max_lines: number; export let submit_btn: string | boolean | null = null; export let stop_btn: string | boolean | null = null; @@ -49,7 +53,10 @@ export let stream_handler: Client["stream"]; export let file_count: "single" | "multiple" | "directory" = "multiple"; export let max_plain_text_length = 1000; - + export let waveform_settings: Record; + export let waveform_options: WaveformOptions = {}; + export let sources: ["microphone" | "upload"] = ["upload"]; + export let active_source: "microphone" | null = null; let upload_component: Upload; let hidden_upload: HTMLInputElement; let el: HTMLTextAreaElement | HTMLInputElement; @@ -59,7 +66,9 @@ export let dragging = false; let uploading = false; let oldValue = value.text; + let recording = false; $: dispatch("drag", dragging); + let mic_audio: FileData | null = null; let full_container: HTMLDivElement; @@ -84,6 +93,9 @@ clear: undefined; load: FileData[] | FileData; error: string; + start_recording: undefined; + pause_recording: undefined; + stop_recording: undefined; }>(); beforeUpdate(() => { @@ -141,6 +153,11 @@ ) { e.preventDefault(); dispatch("submit"); + active_source = null; + if (mic_audio) { + value.files.push(mic_audio); + value = value; + } } } @@ -161,7 +178,7 @@ async function handle_upload({ detail - }: CustomEvent): Promise { + }: CustomEvent): Promise { handle_change(); if (Array.isArray(detail)) { for (let file of detail) { @@ -197,6 +214,11 @@ function handle_submit(): void { dispatch("submit"); + active_source = null; + if 
(mic_audio) { + value.files.push(mic_audio); + value = value; + } } async function handle_paste(event: ClipboardEvent): Promise { @@ -289,127 +311,167 @@ role="group" aria-label="Multimedia input field" > - -
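As a closing note, here is a short illustrative sketch (not part of the diff) of how the new Python-side `sources` argument in `gradio/components/multimodal_textbox.py` normalizes its input, based on the validation added above:

```python
import gradio as gr

gr.MultimodalTextbox()                                   # sources=None defaults to ["upload"]
gr.MultimodalTextbox(sources="microphone")               # a bare valid string is wrapped into a list
gr.MultimodalTextbox(sources=["microphone", "upload"])   # an explicit list is used as-is

# Any element outside {"upload", "microphone"} is rejected at construction time:
try:
    gr.MultimodalTextbox(sources=["webcam"])
except ValueError as err:
    print(err)  # `sources` must be a list consisting of elements in ['upload', 'microphone']
```

Because validation happens in `__init__`, a typo in `sources` fails fast instead of silently rendering a textbox without the microphone button.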