Skip to content

Commit

Permalink
feat(share): add screen recording video sharing (#77)
Browse files Browse the repository at this point in the history
* chore(ci): add test cases

* feat(share): add screen recording video sharing

* fix(share): black screen on the first frame

* fix(share): export as video bug

* chore(version): 0.0.4-beta.1
  • Loading branch information
ycjcl868 authored Feb 2, 2025
1 parent d0bd957 commit 336a951
Show file tree
Hide file tree
Showing 13 changed files with 579 additions and 85 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "ui-tars-desktop",
"version": "0.0.3",
"version": "0.0.4-beta.1",
"private": true,
"packageManager": "[email protected]",
"description": "A GUI Agent application based on UI-TARS(Vision-Language Model) that allows you to control your computer using natural language.",
Expand Down
2 changes: 1 addition & 1 deletion packages/action-parser/src/actionParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ export function actionParser(params: { prediction: string; factor: number }): {
};
}

function parseActionVlm(
export function parseActionVlm(
text: string,
factor = 1000,
mode: 'bc' | 'o1' = 'bc',
Expand Down
156 changes: 156 additions & 0 deletions packages/action-parser/test/actionParser.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
/*
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect } from 'vitest';
import { parseActionVlm } from '../src/actionParser';

describe('parseActionVlm', () => {
// BC mode tests: parseActionVlm is called with only the prediction text, so
// its default factor and mode apply. Inputs are plain-text predictions built
// from "Thought:" / "Reflection:" / "Action_Summary:" / "Action:" sections.
describe('BC mode', () => {
// A lone Thought section is expected as `thought`; with no Reflection
// section in the input, `reflection` is expected to be null.
it('should correctly parse input with Thought', () => {
const input = `Thought: I need to click this button
Action: click(start_box='(100,200)')`;

const result = parseActionVlm(input);

// The point (100,200) is expected back as a four-value box string with the
// x/y pair repeated; the values are consistent with scaling by 1000.
expect(result).toEqual([
{
reflection: null,
thought: 'I need to click this button',
action_type: 'click',
action_inputs: {
start_box: '[0.1,0.2,0.1,0.2]',
},
},
]);
});

// When Reflection and Action_Summary are both present, the Action_Summary
// text is expected in `thought` and the Reflection text in `reflection`.
it('should correctly parse input with Reflection and Action_Summary', () => {
const input = `Reflection: This is a reflection
Action_Summary: This is a summary
Action: type(text='Hello', start_box='(300,400)')`;

const result = parseActionVlm(input);

// Non-box arguments such as `text` are expected to pass through unchanged.
expect(result).toEqual([
{
reflection: 'This is a reflection',
thought: 'This is a summary',
action_type: 'type',
action_inputs: {
text: 'Hello',
start_box: '[0.3,0.4,0.3,0.4]',
},
},
]);
});

// Two action calls after a single "Action:" label (separated by a blank line
// in the input) are expected to yield two parsed entries, in input order.
it('should handle multiple actions', () => {
const input = `Thought: Perform multiple actions
Action: click(start_box='(100,200)')
type(text='Hello', start_box='(300,400)')`;

const result = parseActionVlm(input);

// Both expected entries carry the same thought and a null reflection.
expect(result).toEqual([
{
thought: 'Perform multiple actions',
reflection: null,
action_type: 'click',
action_inputs: {
start_box: '[0.1,0.2,0.1,0.2]',
},
},
{
thought: 'Perform multiple actions',
reflection: null,
action_type: 'type',
action_inputs: {
text: 'Hello',
start_box: '[0.3,0.4,0.3,0.4]',
},
},
]);
});
});

// O1 mode tests: parseActionVlm is called with an explicit factor of 1000 and
// mode 'o1'. Inputs wrap the thought in <Thought>...</Thought> tags and end
// with a closing </Output> tag.
describe('O1 mode', () => {
// The expected `thought` is the <Thought> content followed by the
// Action_Summary text, joined by the literal '\n<Action_Summary>\n'.
it('should correctly parse O1 format input', () => {
const input = `<Thought>I need to perform this action</Thought>
Action_Summary: Click and type text
Action: click(start_box='(100,200)')
</Output>`;

const result = parseActionVlm(input, 1000, 'o1');

expect(result).toEqual([
{
reflection: null,
thought:
'I need to perform this action\n<Action_Summary>\nClick and type text',
action_type: 'click',
action_inputs: {
start_box: '[0.1,0.2,0.1,0.2]',
},
},
]);
});

// NOTE(review): this case has the same structure as the previous test and
// differs only in the thought/summary text; despite its name and summary
// wording, the input contains a single click action. Confirm whether a
// genuinely multi-action O1 input was intended here.
it('should handle complex O1 format input', () => {
const input = `<Thought>Complex operation</Thought>
Action_Summary: Multiple sequential actions
Action: click(start_box='(100,200)')
</Output>`;

const result = parseActionVlm(input, 1000, 'o1');

expect(result).toEqual([
{
reflection: null,
thought:
'Complex operation\n<Action_Summary>\nMultiple sequential actions',
action_type: 'click',
action_inputs: {
start_box: '[0.1,0.2,0.1,0.2]',
},
},
]);
});
});

// Edge cases: inputs that omit the "Action:" label or leave the action empty.
// Both call parseActionVlm with its default factor and mode.
describe('Edge cases', () => {
// A bare action call with no Thought/Action labels is expected to parse with
// an empty-string thought and a null reflection.
it('should handle input without Action keyword', () => {
const input = 'click(start_box="(100,200)")';
const result = parseActionVlm(input);

// NOTE(review): the expected start_box here is the single-value '[0.1]',
// unlike the four-value boxes asserted elsewhere in this file. This input
// also uses double quotes around the box where the other tests use single
// quotes. Confirm this is the intended parser output rather than a pinned
// quirk.
expect(result).toEqual([
{
action_inputs: {
start_box: '[0.1]',
},
action_type: 'click',
reflection: null,
thought: '',
},
]);
});

// An "Action:" label with nothing after it is expected to still produce one
// entry, with an empty action_type and an empty action_inputs object.
it('should handle empty action input', () => {
const input = 'Thought: Empty action\nAction:';
const result = parseActionVlm(input);

expect(result).toEqual([
{
action_inputs: {},
action_type: '',
reflection: null,
thought: 'Empty action',
},
]);
});
});
});
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// @prettier
import { describe, expect, it } from 'vitest';

import { actionParser } from './index';
import { actionParser } from '../';

describe('actionParser', () => {
it('should return parsed action', () => {
Expand Down Expand Up @@ -140,6 +140,29 @@ describe('actionParser', () => {
});
});

// A prediction with Reflection, Action_Summary and Action sections: the
// Reflection text is expected in `reflection`, the Action_Summary text in
// `thought`, and the result is wrapped in a `parsed` array.
it('should return Reflection', () => {
const result = actionParser({
prediction:
"Reflection: 在桌面上我看到了Chrome浏览器的图标,根据任务要求需要打开Chrome浏览器,应该双击该图标来启动浏览器。\nAction_Summary: 在桌面上找到Chrome浏览器图标的位置,通过双击操作来打开浏览器。\nAction: left_double(start_box='21, 246, 21, 246')",
factor: 1000,
});

// The four box values (21, 246, 21, 246) are expected divided by the factor
// of 1000 and serialized without spaces.
expect(result).toEqual({
parsed: [
{
thought:
'在桌面上找到Chrome浏览器图标的位置,通过双击操作来打开浏览器。',
reflection:
'在桌面上我看到了Chrome浏览器的图标,根据任务要求需要打开Chrome浏览器,应该双击该图标来启动浏览器。',
action_type: 'left_double',
action_inputs: {
start_box: '[0.021,0.246,0.021,0.246]',
},
},
],
});
});

it('should return parsed action with newline', () => {
const result = actionParser({
// prettier-ignore
Expand Down
31 changes: 30 additions & 1 deletion src/main/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,14 @@
* SPDX-License-Identifier: Apache-2.0
*/
import { electronApp, optimizer } from '@electron-toolkit/utils';
import { app, globalShortcut, ipcMain } from 'electron';
import {
app,
desktopCapturer,
globalShortcut,
ipcMain,
screen,
session,
} from 'electron';
import squirrelStartup from 'electron-squirrel-startup';
import ElectronStore from 'electron-store';
import { updateElectronApp, UpdateSourceType } from 'update-electron-app';
Expand Down Expand Up @@ -123,6 +130,20 @@ const initializeApp = async () => {
// eslint-disable-next-line
new AppUpdater();

// Install a display-media request handler on the default session. The
// handler answers each request with the first screen source returned by
// desktopCapturer, plus 'loopback' audio.
session.defaultSession.setDisplayMediaRequestHandler(
(_request, callback) => {
// NOTE(review): this promise has no rejection handler, so if getSources()
// rejects, `callback` is never invoked. Confirm how the pending request
// should be resolved in that case.
desktopCapturer.getSources({ types: ['screen'] }).then((sources) => {
// Grant access to the first screen found.
// NOTE(review): sources[0] is undefined when no screen source is
// returned; confirm whether that case needs an explicit guard.
callback({ video: sources[0], audio: 'loopback' });
});
},
// If true, use the system picker if available.
// Note: this is currently experimental. If the system picker
// is available, it will be used and the media request handler
// will not be invoked.
{ useSystemPicker: true },
);

logger.info('mainZustandBridge');

const { unsubscribe } = mainZustandBridge(
Expand Down Expand Up @@ -152,6 +173,14 @@ const registerIPCHandlers = () => {
// IPC handler for 'utio:shareReport': forwards the caller's params to the
// UTIOService singleton's shareReport() and waits for it to finish. Nothing
// is returned to the caller.
ipcMain.handle('utio:shareReport', async (_, params) => {
await UTIOService.getInstance().shareReport(params);
});

// IPC handler for 'get-screen-size': replies with the primary display's
// `size` width and height as `screenWidth` / `screenHeight`.
ipcMain.handle('get-screen-size', () => {
const { width, height } = screen.getPrimaryDisplay().size;
return { screenWidth: width, screenHeight: height };
});
};

/**
Expand Down
6 changes: 4 additions & 2 deletions src/main/store/ScreenMarker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class ScreenMarker {
});

this.screenWaterFlow.blur();
this.screenWaterFlow.setContentProtection(false); // show for vlm model
this.screenWaterFlow.setContentProtection(false);
this.screenWaterFlow.setIgnoreMouseEvents(true);

this.screenWaterFlow.loadURL(`data:text/html;charset=UTF-8,
Expand Down Expand Up @@ -159,6 +159,7 @@ class ScreenMarker {
},
});

this.pauseButton.blur();
this.pauseButton.setContentProtection(true); // not show for vlm model
this.pauseButton.setPosition(Math.floor(screenWidth / 2 - 50), 0);

Expand Down Expand Up @@ -292,7 +293,7 @@ class ScreenMarker {

showTextWithMarker(text: string, x: number, y: number) {
logger.info('[showTextWithMarker] text', text, 'x', x, 'y', y);
// 如果存在之前的窗口,先关闭它
// close previous overlay if exists
this.closeOverlay();

this.currentOverlay = new BrowserWindow({
Expand All @@ -317,6 +318,7 @@ class ScreenMarker {
this.currentOverlay.setAlwaysOnTop(true, 'screen-saver');
}

this.currentOverlay.blur();
this.currentOverlay.setContentProtection(false); // show for vlm model
this.currentOverlay.setIgnoreMouseEvents(true);

Expand Down
8 changes: 4 additions & 4 deletions src/main/window/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ export function createMainWindow() {

mainWindow = createWindow({
routerPath: '/',
width: 450,
height: 600,
width: 430,
height: 580,
alwaysOnTop: false,
});

Expand Down Expand Up @@ -145,8 +145,9 @@ export async function hideWindowBlock<T>(
let originalBounds: Electron.Rectangle | undefined;

try {
mainWindow?.setContentProtection(true);
mainWindow?.setContentProtection(false);
mainWindow?.setAlwaysOnTop(true);
mainWindow?.blur();
try {
if (mainWindow) {
originalBounds = mainWindow.getBounds();
Expand All @@ -156,7 +157,6 @@ export async function hideWindowBlock<T>(
} catch (e) {
logger.error(e);
}
mainWindow?.blur();

const result = await Promise.resolve(operation());
return result;
Expand Down
2 changes: 2 additions & 0 deletions src/preload/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ export type Channels = 'ipc-example';

const electronHandler = {
ipcRenderer: {
invoke: (channel: string, ...args: unknown[]) =>
ipcRenderer.invoke(channel, ...args),
sendMessage(channel: Channels, ...args: unknown[]) {
ipcRenderer.send(channel, ...args);
},
Expand Down
Loading

0 comments on commit 336a951

Please sign in to comment.