Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(browser-event): support drag event #321

Merged
merged 7 commits into from
Jan 26, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion packages/midscene/src/ai-model/ui-tars-planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,14 @@ import {
} from './prompt/ui-tars-planning';
import { call } from './service-caller';

type ActionType = 'click' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait';
/**
 * Union of the action-name string literals used in this module.
 * NOTE(review): presumably the set of values allowed for `action_type` on the
 * action interfaces — confirm against `BaseAction`.
 */
type ActionType =
| 'click'
| 'drag'
| 'type'
| 'hotkey'
| 'finished'
| 'scroll'
| 'wait';

function capitalize(str: string) {
return str.charAt(0).toUpperCase() + str.slice(1);
Expand Down Expand Up @@ -60,6 +67,18 @@ export async function vlmPlanning(options: {
},
param: action.thought || '',
});
} else if (action.action_type === 'drag') {
const startPoint = getPoint(action.action_inputs.start_box, size);
const endPoint = getPoint(action.action_inputs.end_box, size);
transformActions.push({
type: 'Drag',
param: {
start_box: { x: startPoint[0], y: startPoint[1] },
end_box: { x: endPoint[0], y: endPoint[1] },
},
locate: null,
thought: action.thought || '',
});
} else if (action.action_type === 'type') {
transformActions.push({
type: 'Input',
Expand Down Expand Up @@ -140,6 +159,14 @@ interface ClickAction extends BaseAction {
};
}

/**
 * Action tagged with `action_type: 'drag'`, carrying the start and end
 * positions of the drag as strings.
 */
interface DragAction extends BaseAction {
// Literal tag identifying this action as a drag.
action_type: 'drag';
action_inputs: {
// Where the drag begins.
start_box: string; // JSON string of [x, y] coordinates
// Where the drag ends.
end_box: string; // JSON string of [x, y] coordinates
};
}

interface WaitAction extends BaseAction {
action_type: 'wait';
action_inputs: {
Expand Down Expand Up @@ -175,6 +202,7 @@ interface FinishedAction extends BaseAction {

export type Action =
| ClickAction
| DragAction
| TypeAction
| HotkeyAction
| ScrollAction
Expand Down
1 change: 1 addition & 0 deletions packages/midscene/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ export interface PlanningAction<ParamType = any> {
type:
| 'Locate'
| 'Tap'
| 'Drag'
| 'Hover'
| 'Input'
| 'KeyboardPress'
Expand Down
6 changes: 5 additions & 1 deletion packages/midscene/tests/ai/evaluate/plan/planning.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,13 @@ describe('automation - planning', () => {
it('throw error when instruction is not feasible', async () => {
const { context } = await getPageDataOfTestName('todo');
await expect(async () => {
await plan('close Cookie Prompt', {
const planFromAI = await plan('close Cookie Prompt', {
context,
});
console.log(
'throw error when instruction is not feasible res:',
planFromAI,
);
}).rejects.toThrow();
});

Expand Down
3 changes: 2 additions & 1 deletion packages/web-integration/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@
"test": "vitest --run",
"test:u": "vitest --run -u",
"test:ai": "AI_TEST_TYPE=web npm run test",
"test:ai:bridge": "BRIDGE_MODE=true npm run test --inspect packages/web-integration/tests/ai/bridge/agent.test.ts",
"test:ai:temp": "AI_TEST_TYPE=web vitest --run tests/ai/bridge/temp.test.ts",
"test:ai:bridge": "BRIDGE_MODE=true npm run test --inspect tests/ai/bridge/agent.test.ts",
"test:ai:cache": "MIDSCENE_CACHE=true AI_TEST_TYPE=web npm run test",
"test:ai:all": "npm run test:ai:web && npm run test:ai:native",
"test:ai:native": "MIDSCENE_CACHE=true AI_TEST_TYPE=native npm run test",
Expand Down
21 changes: 21 additions & 0 deletions packages/web-integration/src/appium/page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ export class Page implements AbstractPage {
wheel: (deltaX: number, deltaY: number) =>
this.mouseWheel(deltaX, deltaY),
move: (x: number, y: number) => this.mouseMove(x, y),
drag: (from: { x: number; y: number }, to: { x: number; y: number }) =>
this.mouseDrag(from, to),
};
}

Expand Down Expand Up @@ -249,6 +251,25 @@ export class Page implements AbstractPage {
]);
}

/**
 * Performs a drag with button 0 as one pointer action sequence: jump to
 * `from`, press, move to `to` over 500 ms, then release.
 *
 * @param from - Point where the button is pressed.
 * @param to - Point where the button is released.
 */
private async mouseDrag(
  from: { x: number; y: number },
  to: { x: number; y: number },
): Promise<void> {
  const { x: startX, y: startY } = from;
  const { x: endX, y: endY } = to;

  await this.browser.performActions([
    {
      type: 'pointer',
      id: 'mouse',
      parameters: { pointerType: 'mouse' },
      actions: [
        // Position the pointer instantly on the drag origin.
        { type: 'pointerMove', duration: 0, x: startX, y: startY },
        { type: 'pointerDown', button: 0 },
        // Travel to the destination with the button held.
        { type: 'pointerMove', duration: 500, x: endX, y: endY },
        { type: 'pointerUp', button: 0 },
      ],
    },
  ]);
}

private async mouseWheel(
deltaX: number,
deltaY: number,
Expand Down
1 change: 1 addition & 0 deletions packages/web-integration/src/bridge-mode/agent-cli-side.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ export const getBridgePageInCliSide = (): ChromeExtensionPageCliSide => {
click: bridgeCaller(MouseEvent.Click),
wheel: bridgeCaller(MouseEvent.Wheel),
move: bridgeCaller(MouseEvent.Move),
drag: bridgeCaller(MouseEvent.Drag),
};
return mouse;
}
Expand Down
1 change: 1 addition & 0 deletions packages/web-integration/src/bridge-mode/common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export enum MouseEvent {
Click = 'mouse.click',
Wheel = 'mouse.wheel',
Move = 'mouse.move',
Drag = 'mouse.drag',
}

export enum KeyboardEvent {
Expand Down
3 changes: 3 additions & 0 deletions packages/web-integration/src/bridge-mode/page-browser-side.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ export class ChromeExtensionPageBrowserSide extends ChromeExtensionProxyPage {

if (method.startsWith(MouseEvent.PREFIX)) {
const actionName = method.split('.')[1] as keyof MouseAction;
if (actionName === 'drag') {
zhoushaw marked this conversation as resolved.
Show resolved Hide resolved
return this.mouse[actionName].apply(this.mouse, args as any);
}
return this.mouse[actionName].apply(this.mouse, args as any);
}

Expand Down
21 changes: 21 additions & 0 deletions packages/web-integration/src/chrome-extension/page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,27 @@ export default class ChromeExtensionProxyPage implements AbstractPage {
y,
});
},
/**
 * Drags with the left button: move to `from`, dispatch `mousePressed`,
 * move to `to`, then dispatch `mouseReleased`.
 */
drag: async (
  from: { x: number; y: number },
  to: { x: number; y: number },
) => {
  // Press and release differ only in event type and position.
  const dispatchLeftButton = (
    eventType: 'mousePressed' | 'mouseReleased',
    point: { x: number; y: number },
  ) =>
    this.sendCommandToDebugger('Input.dispatchMouseEvent', {
      type: eventType,
      x: point.x,
      y: point.y,
      button: 'left',
      clickCount: 1,
    });

  await this.mouse.move(from.x, from.y);
  await dispatchLeftButton('mousePressed', from);
  await this.mouse.move(to.x, to.y);
  await dispatchLeftButton('mouseReleased', to);
},
};

keyboard = {
Expand Down
19 changes: 19 additions & 0 deletions packages/web-integration/src/common/tasks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,25 @@ export class PageTaskExecutor {
},
};
tasks.push(taskActionTap);
} else if (plan.type === 'Drag') {
const taskActionDrag: ExecutionTaskActionApply<{
start_box: { x: number; y: number };
end_box: { x: number; y: number };
}> = {
type: 'Action',
subType: 'Drag',
param: plan.param,
thought: plan.thought,
locate: plan.locate,
executor: async (taskParam) => {
assert(
taskParam?.start_box && taskParam?.end_box,
'No start_box or end_box to drag',
);
await this.page.mouse.drag(taskParam.start_box, taskParam.end_box);
},
};
tasks.push(taskActionDrag);
} else if (plan.type === 'Hover') {
const taskActionHover: ExecutionTaskActionApply<PlanningActionParamHover> =
{
Expand Down
8 changes: 8 additions & 0 deletions packages/web-integration/src/page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ export interface MouseAction {
) => Promise<void>;
wheel: (deltaX: number, deltaY: number) => Promise<void>;
move: (x: number, y: number) => Promise<void>;
drag: (
from: { x: number; y: number },
to: { x: number; y: number },
) => Promise<void>;
}

export interface KeyboardAction {
Expand All @@ -36,6 +40,10 @@ export abstract class AbstractPage {
) => {},
wheel: async (deltaX: number, deltaY: number) => {},
move: async (x: number, y: number) => {},
drag: async (
from: { x: number; y: number },
to: { x: number; y: number },
) => {},
};
}

Expand Down
1 change: 1 addition & 0 deletions packages/web-integration/src/playground/static-page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ export default class StaticPage implements AbstractPage {
click: ThrowNotImplemented.bind(null, 'mouse.click'),
wheel: ThrowNotImplemented.bind(null, 'mouse.wheel'),
move: ThrowNotImplemented.bind(null, 'mouse.move'),
drag: ThrowNotImplemented.bind(null, 'mouse.drag'),
};

keyboard = {
Expand Down
26 changes: 26 additions & 0 deletions packages/web-integration/src/puppeteer/base-page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,32 @@ export class Page<
},
move: async (x: number, y: number) =>
this.underlyingPage.mouse.move(x, y),
/**
 * Drags from `from` to `to` using whichever driver backs this page.
 * Puppeteer goes through `mouse.drag`; Playwright is driven with an explicit
 * move / down / move / up sequence. Any other `pageType` does nothing.
 */
drag: async (
  from: { x: number; y: number },
  to: { x: number; y: number },
) => {
  switch (this.pageType) {
    case 'puppeteer': {
      const { mouse } = this.underlyingPage as PuppeteerPage;
      await mouse.drag({ x: from.x, y: from.y }, { x: to.x, y: to.y });
      break;
    }
    case 'playwright': {
      // Playwright doesn't have a drag method, so we need to simulate it
      const { mouse } = this.underlyingPage as PlaywrightPage;
      await mouse.move(from.x, from.y);
      await mouse.down();
      await mouse.move(to.x, to.y);
      await mouse.up();
      break;
    }
  }
},
};
}

Expand Down
19 changes: 19 additions & 0 deletions packages/web-integration/tests/ai/bridge/temp.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import {
AgentOverChromeBridge,
getBridgePageInCliSide,
} from '@/bridge-mode/agent-cli-side';
import { describe, expect, it, vi } from 'vitest';

// Raise the per-test timeout to 260 seconds (value is in milliseconds).
vi.setConfig({
  testTimeout: 260 * 1000,
});

/**
 * Drag-event check over the Chrome bridge. The suite is skipped unless the
 * BRIDGE_MODE environment variable is set.
 */
describe.skipIf(!process.env.BRIDGE_MODE)('drag event', () => {
  it('agent in cli side, current tab', async () => {
    const agent = new AgentOverChromeBridge();
    try {
      await agent.connectCurrentTab();
      await agent.ai('Finish dragging the slider');
    } finally {
      // Tear the bridge down even when connecting or the AI step rejects,
      // so a failed run does not leave the agent alive.
      await agent.destroy();
    }
  });
});
Loading