From 336a951b72365cfd1afd4f540978743f5575ce90 Mon Sep 17 00:00:00 2001 From: Charles Date: Sun, 2 Feb 2025 23:18:32 +0800 Subject: [PATCH] feat(share): add screen recording video sharing (#77) * chore(ci): add test cases * feat(share): add screen recording video sharing * fix(share): black screen on the first frame * fix(share): export as video bug * chore(version): 0.0.4-beta.1 --- package.json | 2 +- packages/action-parser/src/actionParser.ts | 2 +- .../action-parser/test/actionParser.test.ts | 156 ++++++++++++ .../{src => test}/index.bench.ts | 0 .../action-parser/{src => test}/index.test.ts | 25 +- src/main/main.ts | 31 ++- src/main/store/ScreenMarker.ts | 6 +- src/main/window/index.ts | 8 +- src/preload/index.ts | 2 + .../src/components/ChatInput/index.tsx | 226 ++++++++++++------ src/renderer/src/components/Header/index.tsx | 3 +- .../src/components/ScreenRecorder/index.tsx | 48 ++++ src/renderer/src/hooks/useScreenRecord.ts | 155 ++++++++++++ 13 files changed, 579 insertions(+), 85 deletions(-) create mode 100644 packages/action-parser/test/actionParser.test.ts rename packages/action-parser/{src => test}/index.bench.ts (100%) rename packages/action-parser/{src => test}/index.test.ts (90%) create mode 100644 src/renderer/src/components/ScreenRecorder/index.tsx create mode 100644 src/renderer/src/hooks/useScreenRecord.ts diff --git a/package.json b/package.json index d9a554e..784cee9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "ui-tars-desktop", - "version": "0.0.3", + "version": "0.0.4-beta.1", "private": true, "packageManager": "pnpm@9.10.0", "description": "A GUI Agent application based on UI-TARS(Vision-Lanuage Model) that allows you to control your computer using natural language.", diff --git a/packages/action-parser/src/actionParser.ts b/packages/action-parser/src/actionParser.ts index 568e10c..da80285 100644 --- a/packages/action-parser/src/actionParser.ts +++ b/packages/action-parser/src/actionParser.ts @@ -16,7 +16,7 @@ export function actionParser(params: { prediction: string; factor: number }): { }; } -function parseActionVlm( +export function parseActionVlm( text: string, factor = 1000, mode: 'bc' | 'o1' = 'bc', diff --git a/packages/action-parser/test/actionParser.test.ts b/packages/action-parser/test/actionParser.test.ts new file mode 100644 index 0000000..cbceaf6 --- /dev/null +++ b/packages/action-parser/test/actionParser.test.ts @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2025 Bytedance, Inc. and its affiliates. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +import { describe, it, expect } from 'vitest'; +import { parseActionVlm } from '../src/actionParser'; + +describe('parseActionVlm', () => { + // BC mode tests + describe('BC mode', () => { + it('should correctly parse input with Thought', () => { + const input = `Thought: I need to click this button +Action: click(start_box='(100,200)')`; + + const result = parseActionVlm(input); + + expect(result).toEqual([ + { + reflection: null, + thought: 'I need to click this button', + action_type: 'click', + action_inputs: { + start_box: '[0.1,0.2,0.1,0.2]', + }, + }, + ]); + }); + + it('should correctly parse input with Reflection and Action_Summary', () => { + const input = `Reflection: This is a reflection +Action_Summary: This is a summary +Action: type(text='Hello', start_box='(300,400)')`; + + const result = parseActionVlm(input); + + expect(result).toEqual([ + { + reflection: 'This is a reflection', + thought: 'This is a summary', + action_type: 'type', + action_inputs: { + text: 'Hello', + start_box: '[0.3,0.4,0.3,0.4]', + }, + }, + ]); + }); + + it('should handle multiple actions', () => { + const input = `Thought: Perform multiple actions +Action: click(start_box='(100,200)') + +type(text='Hello', start_box='(300,400)')`; + + const result = parseActionVlm(input); + + expect(result).toEqual([ + { + thought: 'Perform multiple actions', + reflection: null, + action_type: 'click', + action_inputs: { + start_box: '[0.1,0.2,0.1,0.2]', + }, + }, + { + thought: 'Perform multiple actions', + reflection: null, + action_type: 'type', + action_inputs: { + text: 'Hello', + start_box: '[0.3,0.4,0.3,0.4]', + }, + }, + ]); + }); + }); + + // O1 mode tests + describe('O1 mode', () => { + it('should correctly parse O1 format input', () => { + const input = `I need to perform this action +Action_Summary: Click and type text +Action: click(start_box='(100,200)') +`; + + const result = parseActionVlm(input, 1000, 'o1'); + + expect(result).toEqual([ + { + reflection: null, + thought: + 'I need to perform this action\n\nClick and type text', + action_type: 'click', + action_inputs: { + start_box: '[0.1,0.2,0.1,0.2]', + }, + }, + ]); + }); + + it('should handle complex O1 format input', () => { + const input = `Complex operation +Action_Summary: Multiple sequential actions +Action: click(start_box='(100,200)') +`; + + const result = parseActionVlm(input, 1000, 'o1'); + + expect(result).toEqual([ + { + reflection: null, + thought: + 'Complex operation\n\nMultiple sequential actions', + action_type: 'click', + action_inputs: { + start_box: '[0.1,0.2,0.1,0.2]', + }, + }, + ]); + }); + }); + + // Edge cases + describe('Edge cases', () => { + it('should handle input without Action keyword', () => { + const input = 'click(start_box="(100,200)")'; + const result = parseActionVlm(input); + + expect(result).toEqual([ + { + action_inputs: { + start_box: '[0.1]', + }, + action_type: 'click', + reflection: null, + thought: '', + }, + ]); + }); + + it('should handle empty action input', () => { + const input = 'Thought: Empty action\nAction:'; + const result = parseActionVlm(input); + + expect(result).toEqual([ + { + action_inputs: {}, + action_type: '', + reflection: null, + thought: 'Empty action', + }, + ]); + }); + }); +}); diff --git a/packages/action-parser/src/index.bench.ts b/packages/action-parser/test/index.bench.ts similarity index 100% rename from packages/action-parser/src/index.bench.ts rename to packages/action-parser/test/index.bench.ts diff --git 
a/packages/action-parser/src/index.test.ts b/packages/action-parser/test/index.test.ts similarity index 90% rename from packages/action-parser/src/index.test.ts rename to packages/action-parser/test/index.test.ts index e709acf..f5d0370 100644 --- a/packages/action-parser/src/index.test.ts +++ b/packages/action-parser/test/index.test.ts @@ -5,7 +5,7 @@ // @prettier import { describe, expect, it } from 'vitest'; -import { actionParser } from './index'; +import { actionParser } from '../'; describe('actionParser', () => { it('should return parsed action', () => { @@ -140,6 +140,29 @@ describe('actionParser', () => { }); }); + it('should return Reflection', () => { + const result = actionParser({ + prediction: + "Reflection: 在桌面上我看到了Chrome浏览器的图标,根据任务要求需要打开Chrome浏览器,应该双击该图标来启动浏览器。\nAction_Summary: 在桌面上找到Chrome浏览器图标的位置,通过双击操作来打开浏览器。\nAction: left_double(start_box='21, 246, 21, 246')", + factor: 1000, + }); + + expect(result).toEqual({ + parsed: [ + { + thought: + '在桌面上找到Chrome浏览器图标的位置,通过双击操作来打开浏览器。', + reflection: + '在桌面上我看到了Chrome浏览器的图标,根据任务要求需要打开Chrome浏览器,应该双击该图标来启动浏览器。', + action_type: 'left_double', + action_inputs: { + start_box: '[0.021,0.246,0.021,0.246]', + }, + }, + ], + }); + }); + it('should return parsed action with newline', () => { const result = actionParser({ // prettier-ignore diff --git a/src/main/main.ts b/src/main/main.ts index 8da5c50..890946d 100644 --- a/src/main/main.ts +++ b/src/main/main.ts @@ -3,7 +3,14 @@ * SPDX-License-Identifier: Apache-2.0 */ import { electronApp, optimizer } from '@electron-toolkit/utils'; -import { app, globalShortcut, ipcMain } from 'electron'; +import { + app, + desktopCapturer, + globalShortcut, + ipcMain, + screen, + session, +} from 'electron'; import squirrelStartup from 'electron-squirrel-startup'; import ElectronStore from 'electron-store'; import { updateElectronApp, UpdateSourceType } from 'update-electron-app'; @@ -123,6 +130,20 @@ const initializeApp = async () => { // eslint-disable-next-line new AppUpdater(); + session.defaultSession.setDisplayMediaRequestHandler( + (_request, callback) => { + desktopCapturer.getSources({ types: ['screen'] }).then((sources) => { + // Grant access to the first screen found. + callback({ video: sources[0], audio: 'loopback' }); + }); + // If true, use the system picker if available. + // Note: this is currently experimental. If the system picker + // is available, it will be used and the media request handler + // will not be invoked. 
+ }, + { useSystemPicker: true }, + ); + logger.info('mainZustandBridge'); const { unsubscribe } = mainZustandBridge( @@ -152,6 +173,14 @@ const registerIPCHandlers = () => { ipcMain.handle('utio:shareReport', async (_, params) => { await UTIOService.getInstance().shareReport(params); }); + + ipcMain.handle('get-screen-size', () => { + const primaryDisplay = screen.getPrimaryDisplay(); + return { + screenWidth: primaryDisplay.size.width, + screenHeight: primaryDisplay.size.height, + }; + }); }; /** diff --git a/src/main/store/ScreenMarker.ts b/src/main/store/ScreenMarker.ts index bf4be7d..19b3cf5 100644 --- a/src/main/store/ScreenMarker.ts +++ b/src/main/store/ScreenMarker.ts @@ -59,7 +59,7 @@ class ScreenMarker { }); this.screenWaterFlow.blur(); - this.screenWaterFlow.setContentProtection(false); // show for vlm model + this.screenWaterFlow.setContentProtection(false); this.screenWaterFlow.setIgnoreMouseEvents(true); this.screenWaterFlow.loadURL(`data:text/html;charset=UTF-8, @@ -159,6 +159,7 @@ class ScreenMarker { }, }); + this.pauseButton.blur(); this.pauseButton.setContentProtection(true); // not show for vlm model this.pauseButton.setPosition(Math.floor(screenWidth / 2 - 50), 0); @@ -292,7 +293,7 @@ class ScreenMarker { showTextWithMarker(text: string, x: number, y: number) { logger.info('[showTextWithMarker] text', text, 'x', x, 'y', y); - // 如果存在之前的窗口,先关闭它 + // close previous overlay if exists this.closeOverlay(); this.currentOverlay = new BrowserWindow({ @@ -317,6 +318,7 @@ class ScreenMarker { this.currentOverlay.setAlwaysOnTop(true, 'screen-saver'); } + this.currentOverlay.blur(); this.currentOverlay.setContentProtection(false); // show for vlm model this.currentOverlay.setIgnoreMouseEvents(true); diff --git a/src/main/window/index.ts b/src/main/window/index.ts index eb13372..2dde470 100644 --- a/src/main/window/index.ts +++ b/src/main/window/index.ts @@ -31,8 +31,8 @@ export function createMainWindow() { mainWindow = createWindow({ routerPath: '/', - width: 450, - height: 600, + width: 430, + height: 580, alwaysOnTop: false, }); @@ -145,8 +145,9 @@ export async function hideWindowBlock( let originalBounds: Electron.Rectangle | undefined; try { - mainWindow?.setContentProtection(true); + mainWindow?.setContentProtection(false); mainWindow?.setAlwaysOnTop(true); + mainWindow?.blur(); try { if (mainWindow) { originalBounds = mainWindow.getBounds(); @@ -156,7 +157,6 @@ export async function hideWindowBlock( } catch (e) { logger.error(e); } - mainWindow?.blur(); const result = await Promise.resolve(operation()); return result; diff --git a/src/preload/index.ts b/src/preload/index.ts index 112a305..a28e7bf 100644 --- a/src/preload/index.ts +++ b/src/preload/index.ts @@ -13,6 +13,8 @@ export type Channels = 'ipc-example'; const electronHandler = { ipcRenderer: { + invoke: (channel: string, ...args: unknown[]) => + ipcRenderer.invoke(channel, ...args), sendMessage(channel: Channels, ...args: unknown[]) { ipcRenderer.send(channel, ...args); }, diff --git a/src/renderer/src/components/ChatInput/index.tsx b/src/renderer/src/components/ChatInput/index.tsx index 3648655..8617359 100644 --- a/src/renderer/src/components/ChatInput/index.tsx +++ b/src/renderer/src/components/ChatInput/index.tsx @@ -2,11 +2,26 @@ * Copyright (c) 2025 Bytedance, Inc. and its affiliates. 
* SPDX-License-Identifier: Apache-2.0 */ -import { Box, Button, Flex, HStack, Spinner, VStack } from '@chakra-ui/react'; +import { + Box, + Button, + Flex, + HStack, + Menu, + MenuButton, + MenuItem, + MenuList, + Spinner, + useDisclosure, + VStack, +} from '@chakra-ui/react'; import { useToast } from '@chakra-ui/react'; +import { RiRecordCircleLine } from 'react-icons/ri'; +import { TbReport } from 'react-icons/tb'; import React, { forwardRef, useEffect, useMemo, useRef } from 'react'; import { FaPaperPlane, FaStop, FaTrash } from 'react-icons/fa'; -import { LuScreenShare } from 'react-icons/lu'; +import { HiChevronDown } from 'react-icons/hi'; +import { FaRegShareFromSquare } from 'react-icons/fa6'; import { IoPlay } from 'react-icons/io5'; import { useDispatch } from 'zutron'; @@ -20,6 +35,7 @@ import { uploadReport } from '@renderer/utils/share'; import reportHTMLUrl from '@resources/report.html?url'; import { isCallUserMessage } from '@renderer/utils/message'; +import { useScreenRecord } from '@renderer/hooks/useScreenRecord'; const ChatInput = forwardRef((_props, _ref) => { const { @@ -36,6 +52,18 @@ const ChatInput = forwardRef((_props, _ref) => { const toast = useToast(); const { run } = useRunAgent(); + const { + isOpen: isShareOpen, + onOpen: onShareOpen, + onClose: onShareClose, + } = useDisclosure(); + const { + canSaveRecording, + startRecording, + stopRecording, + saveRecording, + recordRefs, + } = useScreenRecord(); const textareaRef = useRef(null); const running = status === 'running'; @@ -44,6 +72,9 @@ const ChatInput = forwardRef((_props, _ref) => { const startRun = () => { run(localInstructions, () => { + startRecording().catch((e) => { + console.error('start recording failed:', e); + }); setLocalInstructions(''); }); }; @@ -74,6 +105,12 @@ const ChatInput = forwardRef((_props, _ref) => { } }, []); + useEffect(() => { + if (status === StatusEnum.INIT) { + return; + } + }, [status]); + const isCallUser = useMemo(() => isCallUserMessage(messages), [messages]); /** @@ -83,6 +120,14 @@ const ChatInput = forwardRef((_props, _ref) => { if (status === StatusEnum.END && isCallUser && savedInstructions) { setLocalInstructions(savedInstructions); } + // record screen when running + if (status !== StatusEnum.INIT) { + stopRecording(); + } + + return () => { + stopRecording(); + }; }, [isCallUser, status]); const lastHumanMessage = @@ -96,7 +141,7 @@ const ChatInput = forwardRef((_props, _ref) => { const shareTimeoutRef = React.useRef(); const SHARE_TIMEOUT = 100000; - const handleShare = async () => { + const handleShare = async (type: 'report' | 'video') => { if (isSharePending.current) { return; } @@ -118,73 +163,77 @@ const ChatInput = forwardRef((_props, _ref) => { }); }, SHARE_TIMEOUT); - const response = await fetch(reportHTMLUrl); - const html = await response.text(); - - const userData = { - ...restUserData, - status, - conversations: messages, - } as ComputerUseUserData; - - const htmlContent = reportHTMLContent(html, [userData]); - - let reportUrl: string | undefined; - - if (settings?.reportStorageBaseUrl) { - try { - const { url } = await uploadReport( - htmlContent, - settings.reportStorageBaseUrl, - ); - reportUrl = url; - await navigator.clipboard.writeText(url); - toast({ - title: 'Report link copied to clipboard!', - status: 'success', - position: 'top', - duration: 2000, - isClosable: true, - variant: 'ui-tars-success', - }); - } catch (error) { - console.error('Share failed:', error); - toast({ - title: 'Failed to upload report', - description: - error instanceof 
Error ? error.message : JSON.stringify(error), - status: 'error', - position: 'top', - duration: 3000, - isClosable: true, + if (type === 'video') { + saveRecording(); + } else if (type === 'report') { + const response = await fetch(reportHTMLUrl); + const html = await response.text(); + + const userData = { + ...restUserData, + status, + conversations: messages, + } as ComputerUseUserData; + + const htmlContent = reportHTMLContent(html, [userData]); + + let reportUrl: string | undefined; + + if (settings?.reportStorageBaseUrl) { + try { + const { url } = await uploadReport( + htmlContent, + settings.reportStorageBaseUrl, + ); + reportUrl = url; + await navigator.clipboard.writeText(url); + toast({ + title: 'Report link copied to clipboard!', + status: 'success', + position: 'top', + duration: 2000, + isClosable: true, + variant: 'ui-tars-success', + }); + } catch (error) { + console.error('Share failed:', error); + toast({ + title: 'Failed to upload report', + description: + error instanceof Error ? error.message : JSON.stringify(error), + status: 'error', + position: 'top', + duration: 3000, + isClosable: true, + }); + } + } + + // Send UTIO data through IPC + if (settings?.utioBaseUrl) { + const lastScreenshot = messages + .filter((m) => m.screenshotBase64) + .pop()?.screenshotBase64; + + await window.electron.utio.shareReport({ + type: 'shareReport', + instruction: lastHumanMessage, + lastScreenshot, + report: reportUrl, }); } - } - // Send UTIO data through IPC - if (settings?.utioBaseUrl) { - const lastScreenshot = messages - .filter((m) => m.screenshotBase64) - .pop()?.screenshotBase64; - - await window.electron.utio.shareReport({ - type: 'shareReport', - instruction: lastHumanMessage, - lastScreenshot, - report: reportUrl, - }); + // If shareEndpoint is not configured or the upload fails, fall back to downloading the file + const blob = new Blob([htmlContent], { type: 'text/html' }); + const url = window.URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = `report-${Date.now()}.html`; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + window.URL.revokeObjectURL(url); } - - // If shareEndpoint is not configured or the upload fails, fall back to downloading the file - const blob = new Blob([htmlContent], { type: 'text/html' }); - const url = window.URL.createObjectURL(blob); - const a = document.createElement('a'); - a.href = url; - a.download = `report-${Date.now()}.html`; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - window.URL.revokeObjectURL(url); } catch (error) { console.error('Share failed:', error); toast({ @@ -270,17 +319,46 @@ const ChatInput = forwardRef((_props, _ref) => { {status !== StatusEnum.RUNNING && messages?.length > 1 && ( - + + + } + > + {isSharing ? ( + + ) : ( + + )} + + + {canSaveRecording && ( + handleShare('video')}> + + + Export as Video + + + )} + handleShare('report')}> + + + Export as HTML + + + + + )}
+
+
{/* - + UI-TARS Logo = (props) => { + // eslint-disable-next-line react/prop-types + const { watermarkText = `© ${new Date().getFullYear()} UI-TARS Desktop` } = + props; + const { + isRecording, + canSaveRecording, + startRecording, + stopRecording, + saveRecording, + recordRefs, + } = useScreenRecord(watermarkText); + + return ( +
+    <div>
+      {/* offscreen video/canvas pair that useScreenRecord draws the captured screen into */}
+      <video ref={recordRefs.videoRef} style={{ display: 'none' }} muted />
+      <canvas ref={recordRefs.canvasRef} style={{ display: 'none' }} />
+      {canSaveRecording && (
+        <button onClick={saveRecording}>Save Recording</button>
+      )}
+    </div>
+ ); +}; diff --git a/src/renderer/src/hooks/useScreenRecord.ts b/src/renderer/src/hooks/useScreenRecord.ts new file mode 100644 index 0000000..24acbc8 --- /dev/null +++ b/src/renderer/src/hooks/useScreenRecord.ts @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2025 Bytedance, Inc. and its affiliates. + * SPDX-License-Identifier: Apache-2.0 + */ +import { useEffect, useRef, useState } from 'react'; + +export const useScreenRecord = ( + watermarkText = `© ${new Date().getFullYear()} UI-TARS Desktop`, +) => { + const [isRecording, setIsRecording] = useState(false); + const [recordedChunks, setRecordedChunks] = useState([]); + const mediaRecorderRef = useRef(null); + const streamRef = useRef(null); + const canvasRef = useRef(null); + const videoRef = useRef(null); + + const startRecording = async () => { + try { + setRecordedChunks([]); + + const { screenWidth, screenHeight } = + await window.electron.ipcRenderer.invoke('get-screen-size'); + + const stream = await navigator.mediaDevices.getDisplayMedia({ + video: { + width: screenWidth, + height: screenHeight, + displaySurface: 'monitor', + frameRate: 60, + }, + }); + + streamRef.current = stream; + + if (videoRef.current && canvasRef.current) { + const canvas = canvasRef.current; + const video = videoRef.current; + const ctx = canvas.getContext('2d', { + alpha: true, + willReadFrequently: true, + }); + + video.srcObject = stream; + + await new Promise((resolve) => { + video.onloadeddata = () => resolve(); + }); + await video.play(); + + // draw first frame + ctx?.drawImage(video, 0, 0, canvas.width, canvas.height); + + // set canvas size + canvas.width = screenWidth; + canvas.height = screenHeight; + + const canvasStream = canvas.captureStream(30); + + // create MediaRecorder + const recorder = new MediaRecorder(canvasStream, { + mimeType: 'video/mp4', + videoBitsPerSecond: 12000000, + }); + + const chunks: BlobPart[] = []; + recorder.start(); + recorder.ondataavailable = (e) => { + if (e.data && e.data.size > 0) { + chunks.push(e.data); + } + }; + + // draw watermark video frame + const drawInterval = setInterval(() => { + if (ctx && !video.paused && !video.ended) { + ctx.drawImage(video, 0, 0, canvas.width, canvas.height); + + // add watermark + ctx.font = '36px Arial'; + ctx.fillStyle = 'rgba(255, 255, 255, 0.8)'; + + const metrics = ctx.measureText(watermarkText); + const padding = 20; + + const x = canvas.width - metrics.width - padding; + const y = canvas.height - padding; + + ctx.fillText(watermarkText, x, y); + } + }, 1000 / 30); // 30fps + + recorder.onstop = () => { + clearInterval(drawInterval); + setRecordedChunks(chunks); + }; + + mediaRecorderRef.current = recorder; + setIsRecording(true); + } + } catch (error) { + console.error('record failed:', error); + throw error; + } + }; + + useEffect(() => { + return () => { + setRecordedChunks([]); + if (mediaRecorderRef.current) { + mediaRecorderRef.current.stop(); + } + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + } + }; + }, []); + + const stopRecording = () => { + if (mediaRecorderRef.current && streamRef.current) { + // `setTimeout` to preserve the last rendered screen content + setTimeout(() => { + mediaRecorderRef.current?.stop(); + streamRef.current?.getTracks().forEach((track) => track.stop()); + setIsRecording(false); + }, 500); // 500ms delay + } + }; + + const saveRecording = () => { + if (recordedChunks.length === 0) return; + + const blob = new Blob(recordedChunks, { type: 'video/mp4' }); + const url = URL.createObjectURL(blob); + 
const a = document.createElement('a'); + a.href = url; + a.download = `ui-tars-recording-${Date.now()}.mp4`; + a.click(); + + URL.revokeObjectURL(url); + }; + + const canSaveRecording = !isRecording && recordedChunks.length > 0; + + return { + isRecording, + startRecording, + stopRecording, + saveRecording, + canSaveRecording, + recordRefs: { + videoRef, + canvasRef, + }, + }; +};