diff --git a/vertexai/app/src/main/AndroidManifest.xml b/vertexai/app/src/main/AndroidManifest.xml
index 0d795b206..d6e90a837 100644
--- a/vertexai/app/src/main/AndroidManifest.xml
+++ b/vertexai/app/src/main/AndroidManifest.xml
@@ -16,6 +16,8 @@
+
+    <uses-permission android:name="android.permission.RECORD_AUDIO" />
diff --git a/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/GenerativeViewModelFactory.kt b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/GenerativeViewModelFactory.kt
@@ ... @@
+            isAssignableFrom(AudioViewModel::class.java) -> {
+                // Initialize a GenerativeModel with the `gemini-1.5-pro` AI model for audio reasoning
+                val generativeModel = Firebase.vertexAI.generativeModel(
+                    modelName = "gemini-1.5-pro-001",
+                    generationConfig = config
+                )
+                AudioViewModel(generativeModel)
+            }
+
             else -> throw IllegalArgumentException("Unknown ViewModel class: ${viewModelClass.name}")
         }
diff --git a/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MainActivity.kt b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MainActivity.kt
index e489d8ea4..a7cb6ceb8 100644
--- a/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MainActivity.kt
+++ b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MainActivity.kt
@@ -26,6 +26,7 @@ import androidx.compose.ui.Modifier
 import androidx.navigation.compose.NavHost
 import androidx.navigation.compose.composable
 import androidx.navigation.compose.rememberNavController
+import com.google.firebase.quickstart.vertexai.feature.audio.AudioRoute
 import com.google.firebase.quickstart.vertexai.feature.chat.ChatRoute
 import com.google.firebase.quickstart.vertexai.feature.functioncalling.FunctionsChatRoute
 import com.google.firebase.quickstart.vertexai.feature.multimodal.PhotoReasoningRoute
@@ -64,6 +65,9 @@ class MainActivity : ComponentActivity() {
                         composable("functions_chat") {
                             FunctionsChatRoute()
                         }
+                        composable("audio") {
+                            AudioRoute()
+                        }
                     }
                 }
             }
diff --git a/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MenuScreen.kt b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MenuScreen.kt
index 5c8f6b6d9..defdc6028 100644
--- a/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MenuScreen.kt
+++ b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MenuScreen.kt
@@ -46,7 +46,8 @@ fun MenuScreen(
         MenuItem("summarize", R.string.menu_summarize_title, R.string.menu_summarize_description),
         MenuItem("photo_reasoning", R.string.menu_reason_title, R.string.menu_reason_description),
         MenuItem("chat", R.string.menu_chat_title, R.string.menu_chat_description),
-        MenuItem("functions_chat", R.string.menu_functions_title, R.string.menu_functions_description)
+        MenuItem("functions_chat", R.string.menu_functions_title, R.string.menu_functions_description),
+        MenuItem("audio", R.string.menu_audio_title, R.string.menu_audio_description)
     )

     LazyColumn(
diff --git a/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioRecorder.kt b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioRecorder.kt
new file mode 100644
index 000000000..d26fae851
--- /dev/null
+++ b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioRecorder.kt
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.firebase.quickstart.vertexai.feature.audio
+
+import android.content.Context
+import android.media.MediaRecorder
+import android.os.Build
+import java.io.File
+
+class AudioRecorder {
+    private var recorder: MediaRecorder? = null
+    private var outputFilePath: String? = null
+
+    fun startRecording(context: Context) {
+        outputFilePath = File.createTempFile(
+            "recording_${System.currentTimeMillis()}", ".m4a", context.cacheDir
+        ).absolutePath
+
+        recorder = if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
+            MediaRecorder(context)
+        } else {
+            MediaRecorder()
+        }.apply {
+            setAudioSource(MediaRecorder.AudioSource.MIC)
+            setOutputFormat(MediaRecorder.OutputFormat.MPEG_4)
+            setAudioEncoder(MediaRecorder.AudioEncoder.AAC)
+            setOutputFile(outputFilePath)
+            prepare()
+            start()
+        }
+    }
+
+    fun stopRecording(): ByteArray {
+        recorder?.stop()
+        recorder?.release()
+        recorder = null
+
+        val audioFile = File(outputFilePath ?: throw IllegalStateException("Output file path not set"))
+        val audioBytes = audioFile.readBytes()
+        audioFile.delete()
+        return audioBytes
+    }
+}
diff --git a/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioScreen.kt b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioScreen.kt
new file mode 100644
index 000000000..01a6e2052
--- /dev/null
+++ b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioScreen.kt
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.firebase.quickstart.vertexai.feature.audio
+
+import android.Manifest
+import androidx.activity.compose.rememberLauncherForActivityResult
+import androidx.activity.result.contract.ActivityResultContracts
+import androidx.compose.foundation.layout.Box
+import androidx.compose.foundation.layout.Column
+import androidx.compose.foundation.layout.Row
+import androidx.compose.foundation.layout.fillMaxWidth
+import androidx.compose.foundation.layout.padding
+import androidx.compose.foundation.layout.requiredSize
+import androidx.compose.foundation.rememberScrollState
+import androidx.compose.foundation.verticalScroll
+import androidx.compose.material.icons.Icons
+import androidx.compose.material.icons.filled.Delete
+import androidx.compose.material.icons.outlined.Person
+import androidx.compose.material3.Card
+import androidx.compose.material3.CardDefaults
+import androidx.compose.material3.CircularProgressIndicator
+import androidx.compose.material3.Icon
+import androidx.compose.material3.IconButton
+import androidx.compose.material3.MaterialTheme
+import androidx.compose.material3.OutlinedTextField
+import androidx.compose.material3.Text
+import androidx.compose.material3.TextButton
+import androidx.compose.runtime.Composable
+import androidx.compose.runtime.collectAsState
+import androidx.compose.runtime.getValue
+import androidx.compose.runtime.mutableStateOf
+import androidx.compose.runtime.rememberCoroutineScope
+import androidx.compose.runtime.saveable.rememberSaveable
+import androidx.compose.runtime.setValue
+import androidx.compose.ui.Alignment
+import androidx.compose.ui.Modifier
+import androidx.compose.ui.draw.drawBehind
+import androidx.compose.ui.graphics.Color
+import androidx.compose.ui.graphics.vector.ImageVector
+import androidx.compose.ui.platform.LocalContext
+import androidx.compose.ui.res.stringResource
+import androidx.compose.ui.res.vectorResource
+import androidx.compose.ui.tooling.preview.Preview
+import androidx.compose.ui.unit.dp
+import androidx.core.content.ContextCompat
+import androidx.core.content.PermissionChecker.PERMISSION_GRANTED
+import androidx.lifecycle.viewmodel.compose.viewModel
+import com.google.firebase.quickstart.vertexai.GenerativeViewModelFactory
+import com.google.firebase.quickstart.vertexai.R
+import kotlinx.coroutines.launch
+
+@Composable
+internal fun AudioRoute(
+    viewModel: AudioViewModel = viewModel(factory = GenerativeViewModelFactory)
+) {
+    val audioUiState by viewModel.uiState.collectAsState()
+    val coroutineScope = rememberCoroutineScope()
+
+    AudioScreen(
+        viewModel.audioRecorder,
+        uiState = audioUiState,
+        onReasonClicked = { inputText, audioData ->
+            coroutineScope.launch { viewModel.reason(inputText, audioData) }
+        },
+    )
+}
+
+@Composable
+fun AudioScreen(
+    audioRecorder: AudioRecorder = AudioRecorder(),
+    uiState: AudioUiState = AudioUiState.Loading,
+    onReasonClicked: (String, ByteArray) -> Unit = { _, _ -> },
+) {
+    val context = LocalContext.current
+
+    var userQuestion by rememberSaveable { mutableStateOf("") }
+    var recordGranted by rememberSaveable {
+        mutableStateOf(
+            ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) == PERMISSION_GRANTED
+        )
+    }
+    var isRecording by rememberSaveable { mutableStateOf(false) }
+    var audioData by rememberSaveable { mutableStateOf<ByteArray?>(null) }
+
+    val launcher = rememberLauncherForActivityResult(ActivityResultContracts.RequestPermission()) { isGranted ->
+        recordGranted = isGranted
+    }
+
+    Column(
+        modifier = Modifier
+            .padding(all = 16.dp)
+            .verticalScroll(rememberScrollState())
+    ) {
+        Card(modifier = Modifier.fillMaxWidth()) {
+            Row(modifier = Modifier.padding(vertical = 16.dp)) {
+                if (!recordGranted) {
+                    Box(
+                        modifier = Modifier.fillMaxWidth(), contentAlignment = Alignment.Center
+                    ) {
+                        TextButton(onClick = { launcher.launch(Manifest.permission.RECORD_AUDIO) }) {
+                            Text(stringResource(R.string.grant_record))
+                        }
+                    }
+                } else {
+                    IconButton(
+                        onClick = {
+                            if (isRecording) {
+                                audioData = audioRecorder.stopRecording()
+                                isRecording = false
+                            } else if (audioData == null) {
+                                audioRecorder.startRecording(context)
+                                isRecording = true
+                            } else {
+                                audioData = null
+                            }
+                        },
+                        modifier = Modifier
+                            .padding(all = 4.dp)
+                            .align(Alignment.CenterVertically),
+                    ) {
+                        Icon(
+                            imageVector = if (isRecording) {
+                                ImageVector.vectorResource(R.drawable.stop)
+                            } else if (audioData == null) {
+                                ImageVector.vectorResource(R.drawable.mic)
+                            } else {
+                                Icons.Filled.Delete
+                            },
+                            contentDescription = stringResource(
+                                if (isRecording) {
+                                    R.string.stop_recording
+                                } else if (audioData == null) {
+                                    R.string.start_recording
+                                } else {
+                                    R.string.delete_clip
+                                }
+                            ),
+                        )
+                    }
+                    OutlinedTextField(
+                        value = userQuestion,
+                        label = { Text(stringResource(R.string.audio_label)) },
+                        placeholder = { Text(stringResource(R.string.audio_hint)) },
+                        onValueChange = { userQuestion = it },
+                        modifier = Modifier.fillMaxWidth(0.8f),
+                    )
+                    TextButton(
+                        onClick = {
+                            if (audioData != null) onReasonClicked(userQuestion, audioData!!)
+                        },
+                        modifier = Modifier
+                            .padding(all = 4.dp)
+                            .align(Alignment.CenterVertically),
+                    ) {
+                        Text(
+                            stringResource(R.string.action_go),
+                            color = if (audioData == null) {
+                                MaterialTheme.colorScheme.secondary
+                            } else {
+                                MaterialTheme.colorScheme.primary
+                            }
+                        )
+                    }
+                }
+            }
+        }
+        when (uiState) {
+            AudioUiState.Initial -> {
+                // Nothing is shown
+            }
+
+            AudioUiState.Loading -> {
+                Box(
+                    contentAlignment = Alignment.Center,
+                    modifier = Modifier
+                        .padding(all = 8.dp)
+                        .align(Alignment.CenterHorizontally),
+                ) {
+                    CircularProgressIndicator()
+                }
+            }
+
+            is AudioUiState.Success -> {
+                Card(
+                    modifier = Modifier
+                        .padding(vertical = 16.dp)
+                        .fillMaxWidth(),
+                    shape = MaterialTheme.shapes.large,
+                    colors = CardDefaults.cardColors(containerColor = MaterialTheme.colorScheme.onSecondaryContainer),
+                ) {
+                    Row(
+                        modifier = Modifier
+                            .padding(all = 16.dp)
+                            .fillMaxWidth()
+                    ) {
+                        Icon(
+                            Icons.Outlined.Person,
+                            contentDescription = "Person Icon",
+                            tint = MaterialTheme.colorScheme.onSecondary,
+                            modifier = Modifier
+                                .requiredSize(36.dp)
+                                .drawBehind { drawCircle(color = Color.White) },
+                        )
+                        Text(
+                            text = uiState.outputText,
+                            color = MaterialTheme.colorScheme.onSecondary,
+                            modifier = Modifier
+                                .padding(start = 16.dp)
+                                .fillMaxWidth(),
+                        )
+                    }
+                }
+            }
+
+            is AudioUiState.Error -> {
+                Card(
+                    modifier = Modifier
+                        .padding(vertical = 16.dp)
+                        .fillMaxWidth(),
+                    shape = MaterialTheme.shapes.large,
+                    colors = CardDefaults.cardColors(containerColor = MaterialTheme.colorScheme.errorContainer),
+                ) {
+                    Text(
+                        text = uiState.errorMessage,
+                        color = MaterialTheme.colorScheme.error,
+                        modifier = Modifier.padding(all = 16.dp),
+                    )
+                }
+            }
+        }
+    }
+}
+
+@Composable
+@Preview(showSystemUi = true)
+fun AudioScreenPreview() {
+    AudioScreen()
+}
diff --git a/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioUiState.kt b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioUiState.kt
new file mode 100644
index 000000000..d5e621e97
--- /dev/null
+++ b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioUiState.kt
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.firebase.quickstart.vertexai.feature.audio
+
+/**
+ * A sealed hierarchy describing the state of the text generation.
+ */
+sealed interface AudioUiState {
+
+    /**
+     * Empty state when the screen is first shown
+     */
+    data object Initial : AudioUiState
+
+    /**
+     * Still loading
+     */
+    data object Loading : AudioUiState
+
+    /**
+     * Text has been generated
+     */
+    data class Success(
+        val outputText: String
+    ) : AudioUiState
+
+    /**
+     * There was an error generating text
+     */
+    data class Error(
+        val errorMessage: String
+    ) : AudioUiState
+}
diff --git a/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioViewModel.kt b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioViewModel.kt
new file mode 100644
index 000000000..a17a0201c
--- /dev/null
+++ b/vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/feature/audio/AudioViewModel.kt
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.firebase.quickstart.vertexai.feature.audio
+
+import androidx.lifecycle.ViewModel
+import androidx.lifecycle.viewModelScope
+import com.google.firebase.vertexai.GenerativeModel
+import com.google.firebase.vertexai.type.content
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.flow.MutableStateFlow
+import kotlinx.coroutines.flow.StateFlow
+import kotlinx.coroutines.flow.asStateFlow
+import kotlinx.coroutines.launch
+
+class AudioViewModel(
+    private val generativeModel: GenerativeModel
+) : ViewModel() {
+
+    private val _uiState: MutableStateFlow<AudioUiState> = MutableStateFlow(AudioUiState.Initial)
+    val uiState: StateFlow<AudioUiState> = _uiState.asStateFlow()
+    val audioRecorder = AudioRecorder()
+
+    fun reason(
+        userInput: String,
+        audioBytes: ByteArray,
+    ) {
+        _uiState.value = AudioUiState.Loading
+        val prompt = if (userInput.isBlank()) {
+            "Answer the question in the audio."
+        } else {
+            "Listen to the audio, and then answer the following question: $userInput"
+        }
+        viewModelScope.launch(Dispatchers.IO) {
+            try {
+                val inputContent = content {
+                    blob("audio/aac", audioBytes)
+                    text(prompt)
+                }
+
+                var outputContent = ""
+
+                generativeModel.generateContentStream(inputContent).collect { response ->
+                    outputContent += response.text
+                    _uiState.value = AudioUiState.Success(outputContent)
+                }
+            } catch (e: Exception) {
+                _uiState.value = AudioUiState.Error(e.localizedMessage ?: "")
+            }
+        }
+    }
+}
diff --git a/vertexai/app/src/main/res/drawable/mic.xml b/vertexai/app/src/main/res/drawable/mic.xml
new file mode 100644
index 000000000..c6ba41647
--- /dev/null
+++ b/vertexai/app/src/main/res/drawable/mic.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/vertexai/app/src/main/res/drawable/stop.xml b/vertexai/app/src/main/res/drawable/stop.xml
new file mode 100644
index 000000000..817d57b76
--- /dev/null
+++ b/vertexai/app/src/main/res/drawable/stop.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/vertexai/app/src/main/res/values/strings.xml b/vertexai/app/src/main/res/values/strings.xml
index 59f7a4069..70a623bbd 100644
--- a/vertexai/app/src/main/res/values/strings.xml
+++ b/vertexai/app/src/main/res/values/strings.xml
@@ -24,6 +24,8 @@
     <string name="menu_reason_description">Sample app for uploading images and asking about them</string>
     <string name="menu_chat_title">Build multi-turn conversations (chat)</string>
     <string name="menu_chat_description">Sample app demonstrating a conversational UI</string>
+    <string name="menu_audio_title">Generate text from text-and-audio input</string>
+    <string name="menu_audio_description">Sample app for recording audio and generating text from it</string>
     Text
@@ -39,4 +41,12 @@
     Send
     <string name="menu_functions_description">Sample code demonstrating a conversational UI that supports function calling</string>
     <string name="menu_functions_title">Function calling enabled chat</string>
+
+
+    <string name="audio_label">Question</string>
+    <string name="audio_hint">Record a clip and then ask a question</string>
+    <string name="grant_record">Grant Permission to Record</string>
+    <string name="start_recording">Start Recording</string>
+    <string name="stop_recording">Stop Recording</string>
+    <string name="delete_clip">Delete Recorded Clip</string>
 </resources>