Skip to content

Commit bcf7a1a

Browse files
committed
feat(mcp): add default action spaces for Android, iOS, and Web tools; enhance tool initialization handling
1 parent b843d59 commit bcf7a1a

File tree

6 files changed

+150
-15
lines changed

6 files changed

+150
-15
lines changed

.mcp.json

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
{
2+
"mcpServers": {
3+
"midscene-web": {
4+
"command": "node",
5+
"args": [
6+
"/Users/bytedance/personal/midscene_4/packages/web-mcp/dist/index.js"
7+
],
8+
"env": {
9+
"MIDSCENE_OPENAI_INIT_CONFIG_JSON": "{\"baseURL\":\"https://search.bytedance.net/gpt/openapi/online/multimodal/crawl?a=\",\"defaultQuery\":{\"ak\":\"Km8oOEtSb1h1sXpawCXnQAbmeEn7K0hE\"},\"defaultHeaders\":{\"api-key\":\"Km8oOEtSb1h1sXpawCXnQAbmeEn7K0hE\",\"x-tt-logid\":\"midscene-qwen-trail\"},\"REPORT_SERVER_URL\":\"https://cloudapi.bytedance.net/faas/services/tt9i74/invoke/midscene-log\"}",
10+
"MIDSCENE_MODEL_NAME": "openai_qwen3-vl-plus",
11+
"OPENAI_API_KEY": "Km8oOEtSb1h1sXpawCXnQAbmeEn7K0hE",
12+
"OPENAI_BASE_URL": "https://search.bytedance.net/gpt/openapi/online/multimodal/crawl?a=",
13+
"MIDSCENE_USE_QWEN3_VL": "1",
14+
"MCP_SERVER_REQUEST_TIMEOUT": "800000"
15+
}
16+
},
17+
"midscene-android": {
18+
"command": "node",
19+
"args": [
20+
"/Users/bytedance/personal/midscene_4/packages/android-mcp/dist/index.js"
21+
],
22+
"env": {
23+
"MIDSCENE_OPENAI_INIT_CONFIG_JSON": "{\"baseURL\":\"https://search.bytedance.net/gpt/openapi/online/multimodal/crawl?a=\",\"defaultQuery\":{\"ak\":\"Km8oOEtSb1h1sXpawCXnQAbmeEn7K0hE\"},\"defaultHeaders\":{\"api-key\":\"Km8oOEtSb1h1sXpawCXnQAbmeEn7K0hE\",\"x-tt-logid\":\"midscene-qwen-trail\"},\"REPORT_SERVER_URL\":\"https://cloudapi.bytedance.net/faas/services/tt9i74/invoke/midscene-log\"}",
24+
"MIDSCENE_MODEL_NAME": "openai_qwen3-vl-plus",
25+
"OPENAI_API_KEY": "Km8oOEtSb1h1sXpawCXnQAbmeEn7K0hE",
26+
"OPENAI_BASE_URL": "https://search.bytedance.net/gpt/openapi/online/multimodal/crawl?a=",
27+
"MIDSCENE_USE_QWEN3_VL": "1",
28+
"MCP_SERVER_REQUEST_TIMEOUT": "800000",
29+
"ANDROID_HOME": "/Users/bytedance/Library/Android/sdk"
30+
}
31+
},
32+
"midscene-ios": {
33+
"command": "node",
34+
"args": [
35+
"/Users/bytedance/personal/midscene_4/packages/ios-mcp/dist/index.js"
36+
],
37+
"env": {
38+
"MIDSCENE_OPENAI_INIT_CONFIG_JSON": "{\"baseURL\":\"https://search.bytedance.net/gpt/openapi/online/multimodal/crawl?a=\",\"defaultQuery\":{\"ak\":\"Km8oOEtSb1h1sXpawCXnQAbmeEn7K0hE\"},\"defaultHeaders\":{\"api-key\":\"Km8oOEtSb1h1sXpawCXnQAbmeEn7K0hE\",\"x-tt-logid\":\"midscene-qwen-trail\"},\"REPORT_SERVER_URL\":\"https://cloudapi.bytedance.net/faas/services/tt9i74/invoke/midscene-log\"}",
39+
"MIDSCENE_MODEL_NAME": "openai_qwen3-vl-plus",
40+
"OPENAI_API_KEY": "Km8oOEtSb1h1sXpawCXnQAbmeEn7K0hE",
41+
"OPENAI_BASE_URL": "https://search.bytedance.net/gpt/openapi/online/multimodal/crawl?a=",
42+
"MIDSCENE_USE_QWEN3_VL": "1",
43+
"MCP_SERVER_REQUEST_TIMEOUT": "800000"
44+
}
45+
}
46+
}
47+
}

packages/android-mcp/src/android-tools.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,26 @@ const debug = getDebug('mcp:android-tools');
1111
* Extends BaseMidsceneTools to provide Android ADB device connection tools
1212
*/
1313
export class AndroidMidsceneTools extends BaseMidsceneTools {
14+
/**
 * Fallback action space for Android.
 *
 * Returns a fixed list of `{ name, description }` descriptors that stands in
 * for the real action space when the device is not connected.
 */
protected getDefaultActionSpace() {
  // [name, description] pairs, listed in the order they are returned.
  const fallbackActions: Array<[string, string]> = [
    ['Tap', 'Tap the element'],
    ['DoubleClick', 'Double click the element'],
    ['Input', 'Input text into the input field'],
    ['Scroll', 'Scroll the page or an element'],
    ['DragAndDrop', 'Drag and drop the element'],
    ['KeyboardPress', 'Press a key or key combination'],
    ['AndroidLongPress', 'Trigger a long press on the screen at specified coordinates on Android devices'],
    ['AndroidPull', 'Trigger pull down to refresh or pull up actions'],
    ['ClearInput', 'Clear the input field'],
    ['RunAdbShell', 'Execute ADB shell command on Android device'],
    ['Launch', 'Launch an Android app or URL'],
    ['AndroidBackButton', 'Trigger the system "back" operation on Android devices'],
    ['AndroidHomeButton', 'Trigger the system "home" operation on Android devices'],
    ['AndroidRecentAppsButton', 'Trigger the system "recent apps" operation on Android devices'],
  ];

  return fallbackActions.map(([name, description]) => ({ name, description }));
}
33+
1434
protected async ensureAgent(deviceId?: string): Promise<AndroidAgent> {
1535
if (this.agent && deviceId) {
1636
// If a specific deviceId is requested and we have an agent,

packages/ios-mcp/src/ios-tools.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,24 @@ const debug = getDebug('mcp:ios-tools');
1111
* Extends BaseMidsceneTools to provide iOS WebDriverAgent connection tools
1212
*/
1313
export class IOSMidsceneTools extends BaseMidsceneTools {
14+
/**
 * Fallback action space for iOS.
 *
 * Returns a fixed list of `{ name, description }` descriptors that stands in
 * for the real action space when WebDriverAgent is not connected.
 */
protected getDefaultActionSpace() {
  // Small factory so each entry reads as "name, description".
  const action = (name: string, description: string) => ({ name, description });

  return [
    action('Tap', 'Tap the element'),
    action('DoubleClick', 'Double click the element'),
    action('Input', 'Input text into the input field'),
    action('Scroll', 'Scroll the page or an element'),
    action('DragAndDrop', 'Drag and drop the element'),
    action('KeyboardPress', 'Press a key or key combination'),
    action('IOSLongPress', 'Trigger a long press on iOS devices'),
    action('IOSPull', 'Trigger pull down to refresh or pull up actions'),
    action('ClearInput', 'Clear the input field'),
    action('Launch', 'Launch an iOS app or URL'),
    action('IOSBackButton', 'Trigger the system "back" operation on iOS devices'),
    action('IOSHomeButton', 'Trigger the system "home" operation on iOS devices'),
  ];
}
31+
1432
protected async ensureAgent(): Promise<IOSAgent> {
1533
if (this.agent) {
1634
return this.agent;

packages/mcp/src/web-tools.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,27 @@ export class WebMidsceneTools extends BaseMidsceneTools {
1212
MIDSCENE_MCP_USE_PUPPETEER_MODE,
1313
);
1414

15+
/**
 * Fallback action space for the Web platform.
 *
 * Returns a fixed list of `{ name, description }` descriptors that stands in
 * for the real action space when the browser is not connected, so that MCP
 * clients (e.g. Codex) can still see the available tools.
 */
protected getDefaultActionSpace() {
  return [
    { name: 'Tap', description: 'Tap the element' },
    { name: 'RightClick', description: 'Right click the element' },
    { name: 'DoubleClick', description: 'Double click the element' },
    { name: 'Hover', description: 'Move the mouse to the element' },
    { name: 'Input', description: 'Input the value into the element' },
    {
      name: 'KeyboardPress',
      description:
        'Press a key or key combination, like "Enter", "Tab", "Escape", or "Control+A", "Shift+Enter". Do not use this to type text.',
    },
    {
      name: 'Scroll',
      description:
        'Scroll the page or an element. The direction to scroll, the scroll type, and the distance to scroll. The distance is the number of pixels to scroll. If not specified, use `down` direction, `once` scroll type, and `null` distance.',
    },
    { name: 'DragAndDrop', description: 'Drag and drop the element' },
    { name: 'LongPress', description: 'Long press the element' },
    {
      name: 'Swipe',
      description:
        'Perform a swipe gesture. You must specify either "end" (target location) or "distance" + "direction" - they are mutually exclusive. Use "end" for precise location-based swipes, or "distance" + "direction" for relative movement.',
    },
    // The description must say what the action does; the previous text
    // described where to locate the input field rather than the action itself.
    { name: 'ClearInput', description: 'Clear the input field' },
    {
      name: 'Navigate',
      description:
        'Navigate the browser to a specified URL. Opens the URL in the current tab.',
    },
    { name: 'Reload', description: 'Reload the current page' },
    { name: 'GoBack', description: 'Navigate back in browser history' },
  ];
}
35+
1536
protected async ensureAgent(openNewTabWithUrl?: string): Promise<any> {
1637
// Re-init if URL provided
1738
if (this.agent && openNewTabWithUrl) {
@@ -26,7 +47,14 @@ export class WebMidsceneTools extends BaseMidsceneTools {
2647
if (this.agent) return this.agent;
2748

2849
// Choose bridge or puppeteer mode
50+
// In bridge mode, we need a URL to connect to
51+
// If no URL is provided, the bridge agent cannot be created here, so an error is thrown below
2952
if (!this.puppeteerMode) {
53+
if (!openNewTabWithUrl) {
54+
throw new Error(
55+
'Bridge mode requires a URL. Use web_connect tool to connect to a page first.',
56+
);
57+
}
3058
this.agent = await this.initAgentByBridgeMode(openNewTabWithUrl);
3159
} else {
3260
this.agent = await this.initPuppeteerAgent(openNewTabWithUrl);

packages/shared/src/mcp/base-server.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,16 @@ export abstract class BaseMCPServer {
4242
// Create platform-specific tools manager
4343
this.toolsManager = this.createToolsManager();
4444

45-
// Initialize tools (queries actionSpace)
46-
await this.toolsManager.initTools();
45+
// Try to initialize tools, but don't fail if device/agent is not available
46+
// Tools will be lazily initialized on first use
47+
try {
48+
await this.toolsManager.initTools();
49+
} catch (error: any) {
50+
console.error(`Failed to initialize tools: ${error.message}`);
51+
console.error('Tools will be initialized on first use');
52+
}
4753

48-
// Attach to MCP server
54+
// Attach to MCP server (even if initTools failed)
4955
this.toolsManager.attachToServer(this.mcpServer);
5056

5157
// Connect transport

packages/shared/src/mcp/base-tools.ts

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,30 +25,46 @@ export abstract class BaseMidsceneTools implements IMidsceneTools {
2525
return [];
2626
}
2727

28+
/**
 * Optional hook: the action space to use when the agent is not available.
 *
 * `initTools()` falls back to this list when obtaining the agent or querying
 * its action space throws, so tools can still be registered while the
 * device/browser is not connected. The base implementation supplies no
 * fallback actions.
 *
 * @returns an empty array
 */
protected getDefaultActionSpace(): any[] {
  const noFallbackActions: any[] = [];
  return noFallbackActions;
}
35+
2836
/**
2937
* Initialize all tools by querying actionSpace
3038
*/
3139
public async initTools(): Promise<void> {
3240
this.toolDefinitions = [];
3341

34-
// 1. Get agent and its action space
35-
const agent = await this.ensureAgent();
36-
const actionSpace = await agent.getActionSpace();
37-
38-
debug('Action space:', actionSpace.map((a: any) => a.name).join(', '));
42+
// 1. Add platform-specific tools first (device connection, etc.)
43+
// These don't require an agent and should always be available
44+
const platformTools = this.preparePlatformTools();
45+
this.toolDefinitions.push(...platformTools);
46+
47+
// 2. Try to get agent and its action space
48+
let actionSpace: any[];
49+
try {
50+
const agent = await this.ensureAgent();
51+
actionSpace = await agent.getActionSpace();
52+
debug('Action space:', actionSpace.map((a: any) => a.name).join(', '));
53+
} catch (error) {
54+
// If agent initialization fails, use default action space
55+
debug('Using default action space due to initialization failure');
56+
actionSpace = this.getDefaultActionSpace();
57+
}
3958

40-
// 2. Generate tools from action space (core innovation)
59+
// 3. Generate tools from action space (core innovation)
4160
const actionTools = generateToolsFromActionSpace(actionSpace, () =>
4261
this.ensureAgent(),
4362
);
4463

45-
// 3. Add common tools (screenshot, waitFor)
64+
// 4. Add common tools (screenshot, waitFor)
4665
const commonTools = generateCommonTools(() => this.ensureAgent());
4766

48-
// 4. Add platform-specific tools (device connection, etc.)
49-
const platformTools = this.preparePlatformTools();
50-
51-
this.toolDefinitions.push(...actionTools, ...commonTools, ...platformTools);
67+
this.toolDefinitions.push(...actionTools, ...commonTools);
5268

5369
debug('Total tools prepared:', this.toolDefinitions.length);
5470
}
@@ -60,7 +76,7 @@ export abstract class BaseMidsceneTools implements IMidsceneTools {
6076
this.mcpServer = server;
6177

6278
if (this.toolDefinitions.length === 0) {
63-
throw new Error('No tools. Call initTools() first.');
79+
debug('Warning: No tools to register. Tools may be initialized lazily.');
6480
}
6581

6682
for (const toolDef of this.toolDefinitions) {

0 commit comments

Comments
 (0)