Add Microphone Input to MultimodalTextbox (#10186)
* microphone

* add changeset

* undo css changes

* notebook

* css fix

* fixes

* add changeset

* fixes

* pr fixes

* guides

* format

* ally ignore

* type fix

---------

Co-authored-by: gradio-pr-bot <[email protected]>
Co-authored-by: Abubakar Abid <[email protected]>
3 people authored Dec 17, 2024
1 parent a95f8ef commit 9b17032
Showing 10 changed files with 322 additions and 130 deletions.
7 changes: 7 additions & 0 deletions .changeset/fluffy-pots-clap.md
@@ -0,0 +1,7 @@
---
"@gradio/audio": minor
"@gradio/multimodaltextbox": minor
"gradio": minor
---

feat:Add Microphone Input to MultimodalTextbox
2 changes: 1 addition & 1 deletion demo/chatbot_multimodal/run.ipynb
@@ -1 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", " for x in message[\"files\"]:\n", " history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", " if message[\"text\"] is not None:\n", " history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", " return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", " response = \"**That's cool!**\"\n", " history.append({\"role\": \"assistant\", \"content\": \"\"})\n", " for character in response:\n", " history[-1][\"content\"] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", " chat_input = gr.MultimodalTextbox(\n", " interactive=True,\n", " file_count=\"multiple\",\n", " placeholder=\"Enter message or upload file...\",\n", " show_label=False,\n", " )\n", "\n", " chat_msg = chat_input.submit(\n", " add_message, [chatbot, chat_input], [chatbot, chat_input]\n", " )\n", " bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", " bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", " chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", " for x in message[\"files\"]:\n", " history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", " if message[\"text\"] is not None:\n", " history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", " return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", " response = \"**That's cool!**\"\n", " history.append({\"role\": \"assistant\", \"content\": \"\"})\n", " for character in response:\n", " history[-1][\"content\"] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", " chat_input = gr.MultimodalTextbox(\n", " interactive=True,\n", " file_count=\"multiple\",\n", " placeholder=\"Enter message or upload file...\",\n", " show_label=False,\n", " sources=[\"microphone\", \"upload\"],\n", " )\n", "\n", " chat_msg = chat_input.submit(\n", " add_message, [chatbot, chat_input], [chatbot, chat_input]\n", " )\n", " bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", " bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", " chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
1 change: 1 addition & 0 deletions demo/chatbot_multimodal/run.py
@@ -33,6 +33,7 @@ def bot(history: list):
file_count="multiple",
placeholder="Enter message or upload file...",
show_label=False,
sources=["microphone", "upload"],
)

chat_msg = chat_input.submit(
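For orientation, here is a minimal sketch (not part of the commit) that uses the new `sources` option on its own, outside a chatbot. It assumes that a finished microphone recording is delivered as a file path inside `value["files"]`, the same way an uploaded file is; the function name `echo` is illustrative only.

```python
import gradio as gr

def echo(message):
    # message is a dict of the form {"text": str, "files": [list of file paths]}
    files = ", ".join(message["files"]) or "no files"
    return f"text: {message['text']!r}; files: {files}"

with gr.Blocks() as demo:
    box = gr.MultimodalTextbox(
        sources=["microphone", "upload"],  # new in this commit; defaults to ["upload"]
        file_count="multiple",
        placeholder="Type, upload, or record...",
    )
    out = gr.Textbox(label="Received")
    box.submit(echo, box, out)

if __name__ == "__main__":
    demo.launch()
```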
20 changes: 20 additions & 0 deletions gradio/components/multimodal_textbox.py
@@ -61,6 +61,9 @@ def __init__(
self,
value: str | dict[str, str | list] | Callable | None = None,
*,
sources: list[Literal["upload", "microphone"]]
| Literal["upload", "microphone"]
| None = None,
file_types: list[str] | None = None,
file_count: Literal["single", "multiple", "directory"] = "single",
lines: int = 1,
@@ -91,6 +94,7 @@ def __init__(
"""
Parameters:
value: Default value to show in MultimodalTextbox. A string value, or a dictionary of the form {"text": "sample text", "files": [{path: "files/file.jpg", orig_name: "file.jpg", url: "http://image_url.jpg", size: 100}]}. If callable, the function will be called whenever the app loads to set the initial value of the component.
sources: A list of sources permitted. "upload" creates a button where users can click to upload or drop files, "microphone" creates a microphone input. If None, defaults to ["upload"].
file_count: if "single", allows user to upload one file. If "multiple", user uploads multiple files. If "directory", user uploads all files in selected directory. Return type will be list for each file in case of "multiple" or "directory".
file_types: List of file extensions or types of files to be uploaded (e.g. ['image', '.json', '.mp4']). "file" allows any file to be uploaded, "image" allows only image files to be uploaded, "audio" allows only audio files to be uploaded, "video" allows only video files to be uploaded, "text" allows only text files to be uploaded.
lines: minimum number of line rows to provide in textarea.
@@ -118,6 +122,22 @@ def __init__(
stop_btn: If True, will show a stop button (useful for streaming demos). If a string, will use that string as the stop button text.
max_plain_text_length: Maximum length of plain text in the textbox. If the text exceeds this length, the text will be pasted as a file. Default is 1000.
"""
valid_sources: list[Literal["upload", "microphone"]] = ["upload", "microphone"]
if sources is None:
self.sources = ["upload"]
elif isinstance(sources, str) and sources in valid_sources:
self.sources = [sources]
elif isinstance(sources, list):
self.sources = sources
else:
raise ValueError(
f"`sources` must be a list consisting of elements in {valid_sources}"
)
for source in self.sources:
if source not in valid_sources:
raise ValueError(
f"`sources` must a list consisting of elements in {valid_sources}"
)
self.file_types = file_types
self.file_count = file_count
if file_types is not None and not isinstance(file_types, list):
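Reading the validation above: `sources` may be omitted, given as a single string, or given as a list, and anything outside `["upload", "microphone"]` raises a `ValueError`. A short illustrative sketch of accepted and rejected calls (constructor defaults assumed for everything else):

```python
import gradio as gr

gr.MultimodalTextbox()                                  # sources defaults to ["upload"]
gr.MultimodalTextbox(sources="microphone")              # a bare string is wrapped into ["microphone"]
gr.MultimodalTextbox(sources=["upload", "microphone"])  # any combination of the two valid sources
# gr.MultimodalTextbox(sources=["webcam"])              # would raise ValueError: not a valid source
```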
4 changes: 2 additions & 2 deletions guides/05_chatbots/01_creating-a-chatbot-fast.md
@@ -194,7 +194,7 @@ This second parameter of your chat function, `history`, will be in the same open

The return type of your chat function does *not change* when setting `multimodal=True` (i.e. in the simplest case, you should still return a string value). We discuss more complex cases, e.g. returning files [below](#returning-complex-responses).

If you are customizing a multimodal chat interface, you should pass in an instance of `gr.MultimodalTextbox` to the `textbox` parameter. Here's an example that illustrates how to set up and customize and multimodal chat interface:
If you are customizing a multimodal chat interface, you should pass in an instance of `gr.MultimodalTextbox` to the `textbox` parameter. You can customize the `MultimodalTextbox` further by passing in the `sources` parameter, which is a list of sources to enable. Here's an example that illustrates how to set up and customize a multimodal chat interface:


```python
@@ -215,7 +215,7 @@ demo = gr.ChatInterface(
{"text": "No files", "files": []}
],
multimodal=True,
textbox=gr.MultimodalTextbox(file_count="multiple", file_types=["image"])
textbox=gr.MultimodalTextbox(file_count="multiple", file_types=["image"], sources=["upload", "microphone"])
)

demo.launch()
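As a complement to the guide snippet above, a hedged sketch of a complete `gr.ChatInterface` using the microphone-enabled textbox. It assumes a recording arrives in `message["files"]` just like an upload; the function name `count_inputs` is illustrative, not from the guide.

```python
import gradio as gr

def count_inputs(message, history):
    # With multimodal=True, `message` is a dict with "text" and "files" keys.
    return f"You wrote {message['text']!r} and sent {len(message['files'])} file(s)."

demo = gr.ChatInterface(
    fn=count_inputs,
    type="messages",
    multimodal=True,
    textbox=gr.MultimodalTextbox(sources=["upload", "microphone"]),
)

if __name__ == "__main__":
    demo.launch()
```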
@@ -70,15 +70,15 @@ def bot(history):
return history
```

In addition, it can handle media files, such as images, audio, and video. You can use the `MultimodalTextbox` component to easily upload all types of media files to your chatbot. To pass in a media file, we must pass in the file a dictionary with a `path` key pointing to a local file and an `alt_text` key. The `alt_text` is optional, so you can also just pass in a tuple with a single element `{"path": "filepath"}`, like this:
In addition, it can handle media files, such as images, audio, and video. You can use the `MultimodalTextbox` component to easily upload all types of media files to your chatbot. You can customize the `MultimodalTextbox` further by passing in the `sources` parameter, which is a list of sources to enable. To pass in a media file, we must pass in the file as a dictionary with a `path` key pointing to a local file and an `alt_text` key. The `alt_text` key is optional, so you can also just pass in a dictionary with only the `path` key, e.g. `{"path": "filepath"}`, like this:

```python
def add_message(history, message):
for x in message["files"]:
history.append({"role": "user", "content": {"path": x}})
if message["text"] is not None:
history.append({"role": "user", "content": message["text"]})
return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"])
return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"], sources=["upload", "microphone"])
```

Putting this together, we can create a _multimodal_ chatbot with a multimodal textbox for a user to submit text and media files. The rest of the code looks pretty much the same as before:
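The code collapsed here mirrors the `demo/chatbot_multimodal/run.py` demo updated earlier in this commit; a condensed sketch of that wiring (paraphrased from the demo, not the guide's exact text):

```python
import time
import gradio as gr

def add_message(history, message):
    for path in message["files"]:
        history.append({"role": "user", "content": {"path": path}})
    if message["text"] is not None:
        history.append({"role": "user", "content": message["text"]})
    return history, gr.MultimodalTextbox(value=None, interactive=False)

def bot(history):
    history.append({"role": "assistant", "content": ""})
    for character in "**That's cool!**":
        history[-1]["content"] += character
        time.sleep(0.05)
        yield history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_count="multiple",
        sources=["microphone", "upload"],
        placeholder="Enter message or upload file...",
        show_label=False,
    )
    chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
    bot_msg = chat_msg.then(bot, chatbot, chatbot)
    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])

demo.launch()
```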
29 changes: 28 additions & 1 deletion js/audio/interactive/InteractiveAudio.svelte
@@ -41,6 +41,7 @@
export let stream_every: number;
export let uploading = false;
export let recording = false;
export let class_name = "";
let time_limit: number | null = null;
let stream_state: "open" | "waiting" | "closed" = "closed";
@@ -246,7 +247,7 @@
float={active_source === "upload" && value === null}
label={label || i18n("audio.audio")}
/>
<div class="audio-container">
<div class="audio-container {class_name}">
<StreamingBar {time_limit} />
{#if value === null || streaming}
{#if active_source === "microphone"}
@@ -329,4 +330,30 @@
flex-direction: column;
justify-content: space-between;
}
.audio-container.compact-audio {
margin-top: calc(var(--size-8) * -1);
height: auto;
padding: 0px;
gap: var(--size-2);
min-height: var(--size-5);
}
.compact-audio :global(.audio-player) {
padding: 0px;
}
.compact-audio :global(.controls) {
gap: 0px;
padding: 0px;
}
.compact-audio :global(.waveform-container) {
height: var(--size-12) !important;
}
.compact-audio :global(.player-container) {
min-height: unset;
height: auto;
}
</style>
62 changes: 60 additions & 2 deletions js/multimodaltextbox/Index.svelte
@@ -12,6 +12,8 @@
import { StatusTracker } from "@gradio/statustracker";
import type { LoadingStatus } from "@gradio/statustracker";
import type { FileData } from "@gradio/client";
import { onMount } from "svelte";
import type { WaveformOptions } from "../audio/shared/types";
export let gradio: Gradio<{
change: typeof value;
@@ -23,6 +25,11 @@
focus: never;
error: string;
clear_status: LoadingStatus;
start_recording: never;
pause_recording: never;
stop_recording: never;
upload: FileData[] | FileData;
clear: undefined;
}>;
export let elem_id = "";
export let elem_classes: string[] = [];
@@ -38,7 +45,6 @@
export let info: string | undefined = undefined;
export let show_label: boolean;
export let max_lines: number;
export let container = true;
export let scale: number | null = null;
export let min_width: number | undefined = undefined;
export let submit_btn: string | boolean | null = null;
@@ -53,8 +59,52 @@
export let root: string;
export let file_count: "single" | "multiple" | "directory";
export let max_plain_text_length: number;
export let sources: ("microphone" | "upload")[] = ["upload"];
export let waveform_options: WaveformOptions = {};
let dragging: boolean;
let active_source: "microphone" | null = null;
let waveform_settings: Record<string, any>;
let color_accent = "darkorange";
onMount(() => {
color_accent = getComputedStyle(document?.documentElement).getPropertyValue(
"--color-accent"
);
set_trim_region_colour();
waveform_settings.waveColor = waveform_options.waveform_color || "#9ca3af";
waveform_settings.progressColor =
waveform_options.waveform_progress_color || color_accent;
waveform_settings.mediaControls = waveform_options.show_controls;
waveform_settings.sampleRate = waveform_options.sample_rate || 44100;
});
$: waveform_settings = {
height: 50,
barWidth: 2,
barGap: 3,
cursorWidth: 2,
cursorColor: "#ddd5e9",
autoplay: false,
barRadius: 10,
dragToSeek: true,
normalize: true,
minPxPerSec: 20
};
const trim_region_settings = {
color: waveform_options.trim_region_color,
drag: true,
resize: true
};
function set_trim_region_colour(): void {
document.documentElement.style.setProperty(
"--trim-region-color",
trim_region_settings.color || color_accent
);
}
</script>

<Block
@@ -80,6 +130,7 @@
bind:value
bind:value_is_output
bind:dragging
bind:active_source
{file_types}
{root}
{label}
@@ -88,14 +139,16 @@
{lines}
{rtl}
{text_align}
{waveform_settings}
i18n={gradio.i18n}
max_lines={!max_lines ? lines + 1 : max_lines}
{placeholder}
{submit_btn}
{stop_btn}
{autofocus}
{container}
{autoscroll}
{file_count}
{sources}
max_file_size={gradio.max_file_size}
on:change={() => gradio.dispatch("change", value)}
on:input={() => gradio.dispatch("input")}
@@ -107,6 +160,11 @@
on:error={({ detail }) => {
gradio.dispatch("error", detail);
}}
on:start_recording={() => gradio.dispatch("start_recording")}
on:pause_recording={() => gradio.dispatch("pause_recording")}
on:stop_recording={() => gradio.dispatch("stop_recording")}
on:upload={(e) => gradio.dispatch("upload", e.detail)}
on:clear={() => gradio.dispatch("clear")}
disabled={!interactive}
upload={(...args) => gradio.client.upload(...args)}
stream_handler={(...args) => gradio.client.stream(...args)}
10 changes: 10 additions & 0 deletions js/multimodaltextbox/MultimodalTextbox.stories.svelte
@@ -42,6 +42,12 @@
description: "Whether to render right-to-left",
control: { type: "boolean" },
defaultValue: false
},
sources: {
options: ["upload", "microphone"],
description: "The sources to enable",
control: { type: "select" },
defaultValue: ["upload", "microphone"]
}
}}
/>
@@ -87,3 +93,7 @@
}
}}
/>
<Story
name="MultimodalTextbox with microphone input"
args={{ sources: ["microphone"] }}
/>