feat(share): add screen recording video sharing (#77)

* chore(ci): add test cases * feat(share): add screen recording video sharing * fix(share): black screen on the first frame * fix(share): export as video bug * chore(version): 0.0.4-beta.1
bytedance · Feb 2, 2025 · 336a951 · 336a951
1 parent d0bd957
commit 336a951
Show file tree

Hide file tree

Showing 13 changed files with 579 additions and 85 deletions.
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "ui-tars-desktop",
-  "version": "0.0.3",
+  "version": "0.0.4-beta.1",
   "private": true,
   "packageManager": "[email protected]",
   "description": "A GUI Agent application based on UI-TARS(Vision-Language Model) that allows you to control your computer using natural language.",

diff --git a/packages/action-parser/src/actionParser.ts b/packages/action-parser/src/actionParser.ts
@@ -16,7 +16,7 @@ export function actionParser(params: { prediction: string; factor: number }): {
   };
 }
 
-function parseActionVlm(
+export function parseActionVlm(
   text: string,
   factor = 1000,
   mode: 'bc' | 'o1' = 'bc',

diff --git a/packages/action-parser/test/actionParser.test.ts b/packages/action-parser/test/actionParser.test.ts
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+import { describe, it, expect } from 'vitest';
+import { parseActionVlm } from '../src/actionParser';
+
+describe('parseActionVlm', () => {
+  // BC mode tests: parseActionVlm is called with its defaults (factor 1000,
+  // mode 'bc'), so only the prediction text varies between cases.
+  describe('BC mode', () => {
+    // A single `Thought:` line followed by one `Action:` line. The expectation
+    // pins that the point (100,200) is reported as the box string
+    // '[0.1,0.2,0.1,0.2]' and that reflection is null when absent.
+    it('should correctly parse input with Thought', () => {
+      const input = `Thought: I need to click this button
+Action: click(start_box='(100,200)')`;
+
+      const result = parseActionVlm(input);
+
+      expect(result).toEqual([
+        {
+          reflection: null,
+          thought: 'I need to click this button',
+          action_type: 'click',
+          action_inputs: {
+            start_box: '[0.1,0.2,0.1,0.2]',
+          },
+        },
+      ]);
+    });
+
+    // `Reflection:` populates `reflection`, while `Action_Summary:` is
+    // surfaced through the `thought` field.
+    it('should correctly parse input with Reflection and Action_Summary', () => {
+      const input = `Reflection: This is a reflection
+Action_Summary: This is a summary
+Action: type(text='Hello', start_box='(300,400)')`;
+
+      const result = parseActionVlm(input);
+
+      expect(result).toEqual([
+        {
+          reflection: 'This is a reflection',
+          thought: 'This is a summary',
+          action_type: 'type',
+          action_inputs: {
+            text: 'Hello',
+            start_box: '[0.3,0.4,0.3,0.4]',
+          },
+        },
+      ]);
+    });
+
+    // Two actions separated by a blank line yield two entries; both carry
+    // the same thought and a null reflection.
+    it('should handle multiple actions', () => {
+      const input = `Thought: Perform multiple actions
+Action: click(start_box='(100,200)')
+
+type(text='Hello', start_box='(300,400)')`;
+
+      const result = parseActionVlm(input);
+
+      expect(result).toEqual([
+        {
+          thought: 'Perform multiple actions',
+          reflection: null,
+          action_type: 'click',
+          action_inputs: {
+            start_box: '[0.1,0.2,0.1,0.2]',
+          },
+        },
+        {
+          thought: 'Perform multiple actions',
+          reflection: null,
+          action_type: 'type',
+          action_inputs: {
+            text: 'Hello',
+            start_box: '[0.3,0.4,0.3,0.4]',
+          },
+        },
+      ]);
+    });
+  });
+
+  // O1 mode tests: the mode argument is passed explicitly as 'o1' and the
+  // input wraps the thought in <Thought>...</Thought> and ends with </Output>.
+  describe('O1 mode', () => {
+    // The expected thought is the <Thought> content joined with the
+    // Action_Summary text by the literal '\n<Action_Summary>\n'.
+    it('should correctly parse O1 format input', () => {
+      const input = `<Thought>I need to perform this action</Thought>
+Action_Summary: Click and type text
+Action: click(start_box='(100,200)')
+</Output>`;
+
+      const result = parseActionVlm(input, 1000, 'o1');
+
+      expect(result).toEqual([
+        {
+          reflection: null,
+          thought:
+            'I need to perform this action\n<Action_Summary>\nClick and type text',
+          action_type: 'click',
+          action_inputs: {
+            start_box: '[0.1,0.2,0.1,0.2]',
+          },
+        },
+      ]);
+    });
+
+    // NOTE(review): this input has the same shape as the previous case (one
+    // thought, one summary, one click); only the wording differs.
+    it('should handle complex O1 format input', () => {
+      const input = `<Thought>Complex operation</Thought>
+Action_Summary: Multiple sequential actions
+Action: click(start_box='(100,200)')
+</Output>`;
+
+      const result = parseActionVlm(input, 1000, 'o1');
+
+      expect(result).toEqual([
+        {
+          reflection: null,
+          thought:
+            'Complex operation\n<Action_Summary>\nMultiple sequential actions',
+          action_type: 'click',
+          action_inputs: {
+            start_box: '[0.1,0.2,0.1,0.2]',
+          },
+        },
+      ]);
+    });
+  });
+
+  // Edge cases: inputs that lack the usual `Thought:` / `Action:` structure.
+  describe('Edge cases', () => {
+    // The input is a bare call with a double-quoted argument and no
+    // `Action:` prefix; the thought is expected to be an empty string.
+    // NOTE(review): the expected start_box here is '[0.1]', not the 4-value
+    // box seen in the other tests — this pins the parser's current output;
+    // confirm whether that result is actually intended.
+    it('should handle input without Action keyword', () => {
+      const input = 'click(start_box="(100,200)")';
+      const result = parseActionVlm(input);
+
+      expect(result).toEqual([
+        {
+          action_inputs: {
+            start_box: '[0.1]',
+          },
+          action_type: 'click',
+          reflection: null,
+          thought: '',
+        },
+      ]);
+    });
+
+    // An `Action:` label with nothing after it still yields one entry, with
+    // an empty action_type and no action_inputs.
+    it('should handle empty action input', () => {
+      const input = 'Thought: Empty action\nAction:';
+      const result = parseActionVlm(input);
+
+      expect(result).toEqual([
+        {
+          action_inputs: {},
+          action_type: '',
+          reflection: null,
+          thought: 'Empty action',
+        },
+      ]);
+    });
+  });
+});
diff --git a/packages/action-parser/src/index.bench.ts → packages/action-parser/test/index.bench.ts b/packages/action-parser/src/index.bench.ts → packages/action-parser/test/index.bench.ts
diff --git a/packages/action-parser/src/index.test.ts → packages/action-parser/test/index.test.ts b/packages/action-parser/src/index.test.ts → packages/action-parser/test/index.test.ts
@@ -5,7 +5,7 @@
 // @prettier
 import { describe, expect, it } from 'vitest';
 
-import { actionParser } from './index';
+import { actionParser } from '../';
 
 describe('actionParser', () => {
   it('should return parsed action', () => {
@@ -140,6 +140,29 @@ describe('actionParser', () => {
     });
   });
 
+  // Reflection / Action_Summary / Action prediction written in Chinese.
+  // Rough English gloss of the fixture: the reflection says the Chrome icon
+  // is visible on the desktop and should be double-clicked to launch the
+  // browser; the summary says to locate the icon and double-click it.
+  // start_box is given as four bare numbers ('21, 246, 21, 246') and is
+  // expected back as '[0.021,0.246,0.021,0.246]' with factor 1000.
+  it('should return Reflection', () => {
+    const result = actionParser({
+      prediction:
+        "Reflection: 在桌面上我看到了Chrome浏览器的图标，根据任务要求需要打开Chrome浏览器，应该双击该图标来启动浏览器。\nAction_Summary: 在桌面上找到Chrome浏览器图标的位置，通过双击操作来打开浏览器。\nAction: left_double(start_box='21, 246, 21, 246')",
+      factor: 1000,
+    });
+
+    expect(result).toEqual({
+      parsed: [
+        {
+          thought:
+            '在桌面上找到Chrome浏览器图标的位置，通过双击操作来打开浏览器。',
+          reflection:
+            '在桌面上我看到了Chrome浏览器的图标，根据任务要求需要打开Chrome浏览器，应该双击该图标来启动浏览器。',
+          action_type: 'left_double',
+          action_inputs: {
+            start_box: '[0.021,0.246,0.021,0.246]',
+          },
+        },
+      ],
+    });
+  });
+
   it('should return parsed action with newline', () => {
     const result = actionParser({
       // prettier-ignore

diff --git a/src/main/main.ts b/src/main/main.ts
@@ -3,7 +3,14 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 import { electronApp, optimizer } from '@electron-toolkit/utils';
-import { app, globalShortcut, ipcMain } from 'electron';
+import {
+  app,
+  desktopCapturer,
+  globalShortcut,
+  ipcMain,
+  screen,
+  session,
+} from 'electron';
 import squirrelStartup from 'electron-squirrel-startup';
 import ElectronStore from 'electron-store';
 import { updateElectronApp, UpdateSourceType } from 'update-electron-app';
@@ -123,6 +130,20 @@ const initializeApp = async () => {
   // eslint-disable-next-line
   new AppUpdater();
 
+  // Answer display-media requests on the default session with the first
+  // screen source reported by desktopCapturer, plus loopback audio.
+  session.defaultSession.setDisplayMediaRequestHandler(
+    (_request, callback) => {
+      desktopCapturer
+        .getSources({ types: ['screen'] })
+        .then((sources) => {
+          // Grant access to the first screen found.
+          callback({ video: sources[0], audio: 'loopback' });
+        })
+        .catch((error: unknown) => {
+          // Without this handler, a failed source lookup or an exception
+          // thrown inside the `.then` above surfaces as an unhandled promise
+          // rejection in the main process.
+          logger.error('[setDisplayMediaRequestHandler] failed', error);
+        });
+    },
+    // If true, use the system picker if available.
+    // Note: this is currently experimental. If the system picker
+    // is available, it will be used and the media request handler
+    // will not be invoked.
+    { useSystemPicker: true },
+  );
+
   logger.info('mainZustandBridge');
 
   const { unsubscribe } = mainZustandBridge(
@@ -152,6 +173,14 @@ const registerIPCHandlers = () => {
   ipcMain.handle('utio:shareReport', async (_, params) => {
     await UTIOService.getInstance().shareReport(params);
   });
+
+  // Reports the primary display's width and height to the caller.
+  ipcMain.handle('get-screen-size', () => {
+    const { width, height } = screen.getPrimaryDisplay().size;
+    return { screenWidth: width, screenHeight: height };
+  });
 };
 
 /**

diff --git a/src/main/store/ScreenMarker.ts b/src/main/store/ScreenMarker.ts
@@ -59,7 +59,7 @@ class ScreenMarker {
     });
 
     this.screenWaterFlow.blur();
-    this.screenWaterFlow.setContentProtection(false); // show for vlm model
+    this.screenWaterFlow.setContentProtection(false);
     this.screenWaterFlow.setIgnoreMouseEvents(true);
 
     this.screenWaterFlow.loadURL(`data:text/html;charset=UTF-8,
@@ -159,6 +159,7 @@ class ScreenMarker {
       },
     });
 
+    this.pauseButton.blur();
     this.pauseButton.setContentProtection(true); // not show for vlm model
     this.pauseButton.setPosition(Math.floor(screenWidth / 2 - 50), 0);
 
@@ -292,7 +293,7 @@ class ScreenMarker {
 
   showTextWithMarker(text: string, x: number, y: number) {
     logger.info('[showTextWithMarker] text', text, 'x', x, 'y', y);
-    // 如果存在之前的窗口，先关闭它
+    // close previous overlay if exists
     this.closeOverlay();
 
     this.currentOverlay = new BrowserWindow({
@@ -317,6 +318,7 @@ class ScreenMarker {
       this.currentOverlay.setAlwaysOnTop(true, 'screen-saver');
     }
 
+    this.currentOverlay.blur();
     this.currentOverlay.setContentProtection(false); // show for vlm model
     this.currentOverlay.setIgnoreMouseEvents(true);
 

diff --git a/src/main/window/index.ts b/src/main/window/index.ts
@@ -31,8 +31,8 @@ export function createMainWindow() {
 
   mainWindow = createWindow({
     routerPath: '/',
-    width: 450,
-    height: 600,
+    width: 430,
+    height: 580,
     alwaysOnTop: false,
   });
 
@@ -145,8 +145,9 @@ export async function hideWindowBlock<T>(
   let originalBounds: Electron.Rectangle | undefined;
 
   try {
-    mainWindow?.setContentProtection(true);
+    mainWindow?.setContentProtection(false);
     mainWindow?.setAlwaysOnTop(true);
+    mainWindow?.blur();
     try {
       if (mainWindow) {
         originalBounds = mainWindow.getBounds();
@@ -156,7 +157,6 @@ export async function hideWindowBlock<T>(
     } catch (e) {
       logger.error(e);
     }
-    mainWindow?.blur();
 
     const result = await Promise.resolve(operation());
     return result;

diff --git a/src/preload/index.ts b/src/preload/index.ts
@@ -13,6 +13,8 @@ export type Channels = 'ipc-example';
 
 const electronHandler = {
   ipcRenderer: {
+    // Forwards the channel name and arguments to ipcRenderer.invoke and
+    // returns whatever it returns.
+    // NOTE(review): unlike sendMessage below, the channel is typed as a plain
+    // string rather than `Channels`; if this handler is exposed to renderer
+    // code, confirm that allowing arbitrary channel names is intended.
+    invoke: (channel: string, ...args: unknown[]) =>
+      ipcRenderer.invoke(channel, ...args),
     sendMessage(channel: Channels, ...args: unknown[]) {
       ipcRenderer.send(channel, ...args);
     },