Skip to content

Commit db60c93

Browse files
author
SentienceDEV
committed
Add FILL_FORM action, snapshot escalation early-exit, and URL-only verify optimization
- Add FILL_FORM planner action for multi-field forms (login, signup, checkout) that fills all fields deterministically from the snapshot in a single step instead of one TYPE action per field (eliminates per-field LLM calls)
- Add early-exit in snapshot escalation when the element count is unchanged across iterations, preventing unnecessary limit escalation
- Skip scroll-after-escalation when the page has fewer elements than limitBase (nothing below the fold)
- Optimize verifyStepOutcome to use getCurrentUrl() instead of a full snapshot when all predicates are URL-only (url_contains, url_equals, url_matches), dramatically reducing snapshot count on form pages
- Add 3 new tests for snapshot escalation fixes
- Update planner prompt with FILL_FORM action, examples, and rules
1 parent e2f4f96 commit db60c93

3 files changed

Lines changed: 355 additions & 15 deletions

File tree

src/agents/planner-executor/planner-executor-agent.ts

Lines changed: 159 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2215,6 +2215,95 @@ export class PlannerExecutorAgent {
22152215
};
22162216
}
22172217

2218+
// Handle FILL_FORM action — fill several form fields directly from the current
// snapshot (one runtime.type call per matched field, no per-field planner call),
// then optionally click the submit button named by `submitText`.
if (plannerAction.action === 'FILL_FORM') {
  const fields = plannerAction.fields || [];
  const submitText = plannerAction.submitText || '';
  const elements = ctx.snapshot?.elements || [];
  const actions: string[] = [];
  let filledCount = 0;

  // Role substrings that mark an element as a text-entry control.
  const inputRoles = ['textbox', 'searchbox', 'combobox', 'input', 'password'];
  // Elements already typed into during this step. Without this, a later field
  // such as "confirm password" would re-match (and overwrite) the element that
  // was just filled for "password".
  const usedElementIds = new Set<SnapshotElement['id']>();

  for (const field of fields) {
    const label = (field.label || '').toLowerCase().trim();
    const value = field.value || '';
    let matched: SnapshotElement | null = null;

    // An empty label is a substring of every string, so it must never be
    // allowed to select an element.
    if (label !== '') {
      for (const el of elements) {
        if (usedElementIds.has(el.id)) continue;

        const role = (el.role || '').toLowerCase();
        if (!inputRoles.some(r => role.includes(r))) continue;

        const elText = (el.text || '').toLowerCase().trim();
        const elName = (el.name || '').toLowerCase().trim();
        const elAria = (el.ariaLabel || '').toLowerCase();
        const elNearby = (el.nearbyText || '').toLowerCase();
        const allText = `${elText} ${elName} ${elAria} ${elNearby}`;

        // Forward match: the label appears somewhere in the element's text.
        // Reverse match: the element's text/name appears inside the label —
        // only meaningful when that text is non-empty, because
        // `label.includes('')` is always true and would make every field
        // match the first input on the page.
        const labelInElement = allText.includes(label);
        const textInLabel = elText !== '' && label.includes(elText);
        const nameInLabel = elName !== '' && label.includes(elName);

        if (labelInElement || textInLabel || nameInLabel) {
          matched = el;
          break;
        }
      }
    }

    if (matched) {
      await runtime.type(matched.id, value);
      usedElementIds.add(matched.id);
      actions.push(`TYPE(${matched.id}, "${value}")`);
      filledCount++;
      if (this.config.verbose) {
        console.log(
          `[FILL_FORM] Typed "${value}" into element ${matched.id} (${matched.text || matched.role})`
        );
      }
    } else if (this.config.verbose) {
      console.log(`[FILL_FORM] No match for field label "${label}"`);
    }
  }

  // Nothing was typed: fail the step instead of clicking submit on an empty form.
  if (filledCount === 0) {
    return {
      stepId: stepNum,
      goal: stepGoal,
      status: StepStatus.FAILED,
      actionTaken: 'FILL_FORM(0 fields matched)',
      verificationPassed: false,
      usedVision: false,
      durationMs: Date.now() - stepStart,
      urlBefore: currentUrl,
      urlAfter: await runtime.getCurrentUrl(),
      error: 'No form fields matched the provided labels',
    };
  }

  let clickedSubmit = false;
  if (submitText) {
    const submitEl = this.findSubmitButtonByText(elements, submitText);
    if (submitEl !== null) {
      await runtime.click(submitEl);
      actions.push(`CLICK(${submitEl})`);
      clickedSubmit = true;
      if (this.config.verbose) {
        console.log(`[FILL_FORM] Clicked submit button element ${submitEl}`);
      }
    }
  }

  // Pause before verifying: longer when a submit click was issued than when
  // only typing happened.
  await new Promise(r => setTimeout(r, clickedSubmit ? 1000 : 300));
  const verificationPassed = await this.verifyStepOutcome(runtime, plannerAction);
  const urlAfter = await runtime.getCurrentUrl();

  return {
    stepId: stepNum,
    goal: stepGoal,
    // The step counts as successful when verification passed OR the submit
    // button was clicked (parenthesised to make the precedence explicit).
    status: (verificationPassed || clickedSubmit) ? StepStatus.SUCCESS : StepStatus.FAILED,
    actionTaken: `FILL_FORM(${actions.join(' -> ')})`,
    verificationPassed,
    usedVision: false,
    durationMs: Date.now() - stepStart,
    urlBefore: currentUrl,
    urlAfter,
  };
}
2306+
22182307
// Handle EXTRACT action — read page content and extract data via LLM
22192308
if (plannerAction.action === 'EXTRACT') {
22202309
const extractQuery =
@@ -3063,6 +3152,7 @@ export class PlannerExecutorAgent {
30633152
let visionReason: string | null = null;
30643153
let pruningCategory: string | null = null;
30653154
let prunedNodeCount = 0;
3155+
let previousElementCount = -1;
30663156

30673157
// Phase 1: Limit escalation loop
30683158
while (currentLimit <= maxLimit) {
@@ -3087,10 +3177,23 @@ export class PlannerExecutorAgent {
30873177
currentLimit
30883178
).length;
30893179

3180+
const elementCount = snap.elements?.length || 0;
3181+
3182+
// Early exit: widening the limit didn't discover new elements
3183+
if (elementCount > 0 && elementCount === previousElementCount) {
3184+
if (this.config.verbose) {
3185+
console.log(
3186+
`[ESCALATION] Element count unchanged (${elementCount}), stopping escalation`
3187+
);
3188+
}
3189+
break;
3190+
}
3191+
previousElementCount = elementCount;
3192+
30903193
const visionResult = detectSnapshotFailure(snap);
30913194
if (this.config.verbose) {
30923195
console.log(
3093-
`[VISION CHECK] elements=${(snap.elements || []).length}, shouldUseVision=${visionResult.shouldUseVision}, reason=${visionResult.reason}, screenshot=${screenshotBase64 ? 'yes' : 'no'}, executor.supportsVision=${this.executor.supportsVision()}`
3196+
`[VISION CHECK] elements=${elementCount}, shouldUseVision=${visionResult.shouldUseVision}, reason=${visionResult.reason}, screenshot=${screenshotBase64 ? 'yes' : 'no'}, executor.supportsVision=${this.executor.supportsVision()}`
30943197
);
30953198
}
30963199
if (visionResult.shouldUseVision) {
@@ -3125,7 +3228,6 @@ export class PlannerExecutorAgent {
31253228
if (!cfg.enabled) break;
31263229

31273230
// Check element count - if sufficient, no need to escalate
3128-
const elementCount = snap.elements?.length || 0;
31293231
const hasEnoughContext = pruningCategory
31303232
? actionableContextCount >= cfg.pruningMinElements
31313233
: elementCount >= 10;
@@ -3153,13 +3255,16 @@ export class PlannerExecutorAgent {
31533255

31543256
// Phase 2: Scroll-after-escalation
31553257
// Only trigger for CLICK actions with specific intents
3258+
// Skip entirely if the page has fewer elements than limitBase (nothing below the fold)
3259+
const pageElementCount = lastSnapshot?.elements?.length ?? 0;
31563260
const shouldTryScroll =
31573261
cfg.scrollAfterEscalation &&
31583262
step !== undefined &&
31593263
lastSnapshot !== null &&
31603264
!requiresVision &&
31613265
step.action === 'CLICK' &&
3162-
step.intent;
3266+
step.intent &&
3267+
pageElementCount >= cfg.limitBase;
31633268

31643269
if (shouldTryScroll && lastSnapshot) {
31653270
// Check if we can find the target element using intent heuristics
@@ -3528,6 +3633,10 @@ export class PlannerExecutorAgent {
35283633
? (step.heuristicHints as Array<Record<string, unknown>>)
35293634
: [],
35303635
reasoning: typeof step.reasoning === 'string' ? step.reasoning : undefined,
3636+
fields: Array.isArray(step.fields)
3637+
? (step.fields as Array<{ label: string; value: string }>)
3638+
: undefined,
3639+
submitText: typeof step.submitText === 'string' ? step.submitText : undefined,
35313640
};
35323641
}
35333642

@@ -3657,6 +3766,10 @@ export class PlannerExecutorAgent {
36573766
}
36583767

36593768
private summarizePlannerActionTarget(plannerAction: StepwisePlannerResponse): string | null {
3769+
if (plannerAction.action === 'FILL_FORM') {
3770+
const fields = plannerAction.fields || [];
3771+
return fields.map(f => `${f.label}=${f.value}`).join(', ') || null;
3772+
}
36603773
if (plannerAction.action === 'TYPE' || plannerAction.action === 'TYPE_AND_SUBMIT') {
36613774
return plannerAction.input || plannerAction.intent || plannerAction.target || null;
36623775
}
@@ -3673,7 +3786,7 @@ export class PlannerExecutorAgent {
36733786
}
36743787

36753788
const action = plannerAction.action;
3676-
if (!['TYPE', 'TYPE_AND_SUBMIT', 'CLICK'].includes(action)) {
3789+
if (!['TYPE', 'TYPE_AND_SUBMIT', 'CLICK', 'FILL_FORM'].includes(action)) {
36773790
return false;
36783791
}
36793792

@@ -3987,15 +4100,30 @@ export class PlannerExecutorAgent {
39874100
const pollMs = Math.max(1, this.config.retry.verifyPollMs);
39884101
const start = Date.now();
39894102

4103+
const allUrlPredicates = plannerAction.verify.every(
4104+
p =>
4105+
p.predicate === 'url_contains' ||
4106+
p.predicate === 'url_equals' ||
4107+
p.predicate === 'url_matches'
4108+
);
4109+
39904110
while (Date.now() - start <= timeoutMs) {
39914111
try {
3992-
const snap = await runtime.snapshot({
3993-
limit: this.config.snapshot.limitBase,
3994-
screenshot: false,
3995-
goal: plannerAction.intent || plannerAction.action,
3996-
});
3997-
if (snap && evaluatePredicates(plannerAction.verify, snap)) {
3998-
return true;
4112+
if (allUrlPredicates) {
4113+
const url = await runtime.getCurrentUrl();
4114+
const pseudoSnap: Snapshot = { url, title: '', elements: [] };
4115+
if (evaluatePredicates(plannerAction.verify, pseudoSnap)) {
4116+
return true;
4117+
}
4118+
} else {
4119+
const snap = await runtime.snapshot({
4120+
limit: this.config.snapshot.limitBase,
4121+
screenshot: false,
4122+
goal: plannerAction.intent || plannerAction.action,
4123+
});
4124+
if (snap && evaluatePredicates(plannerAction.verify, snap)) {
4125+
return true;
4126+
}
39994127
}
40004128
} catch {
40014129
// Keep polling until timeout.
@@ -4613,6 +4741,26 @@ export class PlannerExecutorAgent {
46134741
return candidates[0].id;
46144742
}
46154743

4744+
/**
 * Find the first clickable button/link whose visible text or aria-label
 * matches `targetText` (case-insensitive, whitespace-trimmed).
 *
 * A candidate matches when its text contains the target, the target contains
 * its (non-empty) text, or its aria-label contains the target.
 *
 * @param elements - Snapshot elements to search, in snapshot order.
 * @param targetText - Submit button text requested by the planner.
 * @returns The id of the first matching element, or `null` when nothing
 *   matches or `targetText` is blank.
 */
private findSubmitButtonByText(elements: SnapshotElement[], targetText: string): number | null {
  const lower = targetText.toLowerCase().trim();
  // A blank target is a substring of every string and would match the first
  // button/link on the page.
  if (lower === '') return null;

  for (const el of elements) {
    const role = (el.role || '').toLowerCase();
    if (!['button', 'link'].includes(role)) continue;
    if (el.clickable === false) continue;

    const text = (el.text || '').toLowerCase().trim();
    const ariaLabel = (el.ariaLabel || '').toLowerCase().trim();

    // Text comparison only applies when the element has text at all:
    // `lower.includes('')` is always true, which previously made any
    // text-less button or link (e.g. an icon button) count as the submit button.
    const textMatches = text !== '' && (text.includes(lower) || lower.includes(text));
    const ariaMatches = ariaLabel.includes(lower);

    if (textMatches || ariaMatches) {
      return el.id;
    }
  }
  return null;
}
4763+
46164764
private inferSameUrlWizardProgressAfterNext(
46174765
plannerAction: StepwisePlannerResponse,
46184766
beforeSnapshot: Snapshot | null,

src/agents/planner-executor/prompts.ts

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ export function buildStepwisePlannerPrompt(
5959
Actions:
6060
- NAVIGATE: Go directly to a URL when the next destination is known. Set "target" to the URL.
6161
- CLICK: Click an element. Set "intent" to describe the SPECIFIC element (include label, placeholder, or nearby text, e.g. "email textbox", "display name field", "Next button", NOT just "textbox" or "button"). Set "input" to EXACT text from elements list.
62-
- TYPE: Type text into a form field (not a search box). Set "input" to the VALUE from the goal. Set "intent" to describe the field (e.g., "email field", "name field").
62+
- FILL_FORM: Fill ALL visible form fields and submit. Use for login, signup, checkout, or any multi-field form. Set "fields" to an array of {label, value} pairs. Set "submitText" to the submit button text. Set "verify" to check navigation after submit.
63+
- TYPE: Type text into a SINGLE form field. Prefer FILL_FORM for forms with multiple fields.
6364
- TYPE_AND_SUBMIT: Type text into a search box and submit. Set "input" to the SEARCH QUERY from the goal (NOT the element label).
6465
- SCROLL: Scroll page. Set "direction" to "up" or "down".
6566
- WAIT: Wait for content to appear when a follow-up verification is needed.
@@ -71,8 +72,16 @@ WHEN TO USE DONE:
7172
- "Add to Cart" task: DONE only AFTER clicking the Add to Cart button
7273
- "Search and click product" task: DONE only AFTER clicking a product link
7374
- "Search only" task: DONE after search results appear
75+
- "Log in" task: DONE only AFTER the page navigates away from /login
7476
- If goal has multiple steps, complete ALL steps before returning DONE
7577
78+
CRITICAL RULE FOR FILL_FORM (PREFERRED for login/signup/checkout):
79+
- Use FILL_FORM when the goal provides values for 2+ form fields (e.g. "username: X, password: Y")
80+
- "fields" is an array of {label, value} where label matches the field's visible text/placeholder
81+
- "submitText" is the text on the submit button (e.g. "Sign in", "Log in", "Submit", "Next")
82+
- The system will find and fill each field by matching label to element text/role
83+
- This is MUCH faster than TYPE one field at a time
84+
7685
CRITICAL RULE FOR TYPE_AND_SUBMIT:
7786
- "input" must be the SEARCH QUERY you want to type (e.g., "wireless headphones")
7887
- "input" is NOT the element label (e.g., NOT "Search Amazon")
@@ -89,6 +98,8 @@ CRITICAL RULE FOR ADD TO CART:
8998
9099
Output ONLY valid JSON (no markdown, no \`\`\`):
91100
{"action":"NAVIGATE","target":"https://shop.test/search","verify":[{"predicate":"url_contains","args":["search"]}]}
101+
{"action":"FILL_FORM","fields":[{"label":"username","value":"john@example.com"},{"label":"password","value":"secret123"}],"submitText":"Sign in","verify":[{"predicate":"url_contains","args":["dashboard"]}]}
102+
{"action":"FILL_FORM","fields":[{"label":"email","value":"test@test.com"},{"label":"password","value":"pass123"},{"label":"confirm password","value":"pass123"}],"submitText":"Create Account","verify":[]}
92103
{"action":"TYPE_AND_SUBMIT","intent":"searchbox","input":"wireless headphones","verify":[{"predicate":"url_contains","args":["search"]}]}
93104
{"action":"CLICK","intent":"product link","input":"Sony WH-1000XM4 Wireless...","verify":[]}
94105
{"action":"CLICK","intent":"add to cart button","input":"Add to Cart","verify":[]}
@@ -102,11 +113,11 @@ RULES:
102113
5. Include "verify" when you know a simple URL or element predicate that proves success; otherwise use []
103114
6. Include planner metadata when useful: "target", "required", "stop_if_true", "optional_substeps", "heuristic_hints"
104115
7. "heuristic_hints" entries may use snake_case fields: "intent_pattern", "text_patterns", "role_filter", "attribute_patterns", "priority"
105-
8. Output ONLY JSON - no <think> tags, no markdown, no prose
106-
9. Do NOT output <think> or any reasoning
116+
8. Output ONLY JSON - no <think> tags, no markdown, no prose
117+
9. Do NOT output <think> or any reasoning
107118
10. Do NOT return DONE until ALL parts of the goal are complete
108119
11. Never copy example URLs from these instructions. Only NAVIGATE to a URL from the user's task, the current page, or a visible element.
109-
12. For multi-step forms: TYPE into each field (action: TYPE) BEFORE clicking Next. Never click Next without filling required fields first.
120+
12. PREFER FILL_FORM for login/signup/checkout forms with 2+ fields. Do NOT use multiple TYPE actions when FILL_FORM can do it in one step.
110121
13. "intent" must be SPECIFIC: describe the element with its label or context (e.g., "email field", "plan dropdown", "Next button on step 2")
111122
14. Treat history results "success", "skipped", and "vision_fallback" as already satisfied. Do not repeat those steps; choose the next incomplete part of the goal.`;
112123

@@ -329,6 +340,7 @@ export interface StepwisePlannerResponse {
329340
| 'CLICK'
330341
| 'TYPE'
331342
| 'TYPE_AND_SUBMIT'
343+
| 'FILL_FORM'
332344
| 'SCROLL'
333345
| 'PRESS'
334346
| 'WAIT'
@@ -339,6 +351,8 @@ export interface StepwisePlannerResponse {
339351
intent?: string;
340352
input?: string;
341353
direction?: 'up' | 'down';
354+
fields?: Array<{ label: string; value: string }>;
355+
submitText?: string;
342356
verify?: Array<{ predicate: string; args: unknown[] }>;
343357
required?: boolean;
344358
stopIfTrue?: boolean;

0 commit comments

Comments
 (0)