From 534cd473bc087192393bdbb15ca14f9651c94e2e Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Sun, 10 May 2026 21:01:13 -0700 Subject: [PATCH 1/2] Expand extraction detection and add SCROLL_AND_COUNT action for counting tasks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Broaden extraction keyword detection: add 'note', 'record', 'summarize', 'outline', 'compare', 'calculate' to ambiguous verbs; add 'what does', 'note down', 'write down' to extraction phrases; add 17 new content nouns (benefits, services, courses, games, pricing, etc.) for ~97% recall on 219 webbench READ tasks (up from ~83%). - Add isCountingTask() to detect counting/totaling tasks. - Add SCROLL_AND_COUNT action: scrolls page to bottom to load all content, then uses readMarkdown to get full page text and asks LLM to count. - Auto-complete extraction and counting tasks after successful EXTRACT or SCROLL_AND_COUNT. - Add action aliases COUNT and SCROLL_COUNT → SCROLL_AND_COUNT. --- .../planner-executor/extraction-keywords.ts | 76 +++++- src/agents/planner-executor/plan-models.ts | 3 +- src/agents/planner-executor/plan-utils.ts | 2 + .../planner-executor-agent.ts | 236 +++++++++++++++--- src/agents/planner-executor/prompts.ts | 4 + 5 files changed, 282 insertions(+), 39 deletions(-) diff --git a/src/agents/planner-executor/extraction-keywords.ts b/src/agents/planner-executor/extraction-keywords.ts index 98e8ded..31b2d86 100644 --- a/src/agents/planner-executor/extraction-keywords.ts +++ b/src/agents/planner-executor/extraction-keywords.ts @@ -49,6 +49,12 @@ const AMBIGUOUS_VERBS: readonly string[] = [ 'gather', 'return', 'output', + 'note', + 'record', + 'summarize', + 'outline', + 'compare', + 'calculate', ]; /** @@ -59,12 +65,15 @@ const EXTRACTION_PHRASES: readonly string[] = [ 'what is', 'what are', "what's", + 'what does', 'show me', 'tell me', 'find the', 'get the', 'read the', 'list the', + 'note down', + 'write down', 'title of', 'price of', 'name of', @@ -104,6 +113,7 @@ const CONTENT_NOUNS: readonly string[] = [ 'summary', 'excerpt', 'price', + 'pricing', 'cost', 'amount', 'name', @@ -140,6 +150,22 @@ const CONTENT_NOUNS: readonly string[] = [ 'product', 'results', 'listings', + 'benefits', + 'services', + 'courses', + 'games', + 'guidelines', + 'steps', + 'tips', + 'events', + 'options', + 'perks', + 'faqs', + 'specifications', + 'features', + 'formats', + 'cities', + 'countries', ]; /** @@ -342,7 +368,55 @@ Example - product price on listing page: Goal: "find the price of the first laptop" Current URL: store.com/laptops (correct page, prices visible) {"action":"EXTRACT","target":"price of first laptop","goal":"Extract the price of the first laptop listing","verify":[],"reasoning":"prices are visible in listing elements"} -`; + +STEP 3 - COUNTING ACROSS FULL PAGE: +If the task asks to COUNT items ("how many", "number of", "count", "total"): +- Use SCROLL_AND_COUNT instead of EXTRACT +- Set "countTarget" to describe what to count (e.g., "listings", "products", "articles") +- The system will scroll through the entire page and sum up counts +- Do NOT use EXTRACT for counting tasks — EXTRACT only sees the current viewport + +Example - count all listings: +Goal: "note how many listings are available" +Current URL: alibaba.com/search?SearchText=smartphones (correct page) +{"action":"SCROLL_AND_COUNT","countTarget":"product listings","goal":"Count total product listings","verify":[]} + `; +} + +// --------------------------------------------------------------------------- +// Counting Task Detection +// --------------------------------------------------------------------------- + +const COUNTING_PHRASES: readonly string[] = [ + 'how many', + 'how much', + 'number of', + 'count the', + 'count of', + 'total number', + 'total count', + 'how numerous', +]; + +const COUNTING_VERBS: readonly string[] = ['count', 'tally', 'enumerate']; + +export function isCountingTask(task: string): boolean { + if (!task) return false; + const taskLower = task.toLowerCase(); + + if (COUNTING_PHRASES.some(phrase => taskLower.includes(phrase))) { + return true; + } + + if ( + COUNTING_VERBS.some(verb => + new RegExp(`\\b${escapeRegExp(verb)}(s|ed|ing)?\\b`).test(taskLower) + ) + ) { + return true; + } + + return false; } // --------------------------------------------------------------------------- diff --git a/src/agents/planner-executor/plan-models.ts b/src/agents/planner-executor/plan-models.ts index 7fbb9ed..b9209fd 100644 --- a/src/agents/planner-executor/plan-models.ts +++ b/src/agents/planner-executor/plan-models.ts @@ -45,6 +45,7 @@ export const ActionType = z.enum([ 'TYPE', 'TYPE_AND_SUBMIT', 'SCROLL', + 'SCROLL_AND_COUNT', 'PRESS', 'WAIT', 'EXTRACT', @@ -90,7 +91,7 @@ export const PlanStepSchema = z.lazy(() => id: z.number().optional().describe('Step ID (1-indexed, contiguous)'), goal: z.string().optional().describe('Human-readable goal for this step'), action: ActionType.describe( - 'Action type: NAVIGATE, CLICK, TYPE, TYPE_AND_SUBMIT, SCROLL, PRESS, WAIT, EXTRACT, STUCK, DONE' + 'Action type: NAVIGATE, CLICK, TYPE, TYPE_AND_SUBMIT, SCROLL, SCROLL_AND_COUNT, PRESS, WAIT, EXTRACT, STUCK, DONE' ), target: z .union([z.string(), z.record(z.string(), z.unknown())]) diff --git a/src/agents/planner-executor/plan-utils.ts b/src/agents/planner-executor/plan-utils.ts index 1cbe390..da2c296 100644 --- a/src/agents/planner-executor/plan-utils.ts +++ b/src/agents/planner-executor/plan-utils.ts @@ -362,6 +362,8 @@ const ACTION_ALIASES: Record = { SCROLL_UP: 'SCROLL', SCROLL_TO: 'SCROLL', SCROLL_INTO_VIEW: 'SCROLL', + COUNT: 'SCROLL_AND_COUNT', + SCROLL_COUNT: 'SCROLL_AND_COUNT', }; /** diff --git a/src/agents/planner-executor/planner-executor-agent.ts b/src/agents/planner-executor/planner-executor-agent.ts index 349f504..25d1015 100644 --- a/src/agents/planner-executor/planner-executor-agent.ts +++ b/src/agents/planner-executor/planner-executor-agent.ts @@ -81,6 +81,7 @@ import { isTextExtractionTask, isExtractionTask, buildExtractionPrompt, + isCountingTask, } from './extraction-keywords'; // --------------------------------------------------------------------------- @@ -1260,10 +1261,12 @@ export class PlannerExecutorAgent { // Record action history after any auth-boundary or optional-substep recovery. if (!actionHistoryRecorded) { - // For EXTRACT actions, include the extracted data so the planner + // For EXTRACT/SCROLL_AND_COUNT actions, include the extracted data so the planner // knows what was already extracted and can avoid repeating + const isExtractOrCount = + plannerAction.action === 'EXTRACT' || plannerAction.action === 'SCROLL_AND_COUNT'; const extractedText = - plannerAction.action === 'EXTRACT' && finalOutcome.extractedData + isExtractOrCount && finalOutcome.extractedData ? typeof finalOutcome.extractedData === 'object' && finalOutcome.extractedData !== null && 'text' in (finalOutcome.extractedData as Record) @@ -1287,44 +1290,26 @@ export class PlannerExecutorAgent { finalOutcome.status === StepStatus.SKIPPED || finalOutcome.status === StepStatus.VISION_FALLBACK ) { + const isExtractAction = + plannerAction.action === 'EXTRACT' || plannerAction.action === 'SCROLL_AND_COUNT'; + const taskHasInteractionLocal = + /\b(click|add|remove|delete|create|update|fill|submit|log\s*in|sign\s*in)\b/i.test( + task + ); + const hasNonExtractActionLocal = this.actionHistory.some( + rec => rec.action !== 'EXTRACT' && rec.action !== 'SCROLL_AND_COUNT' + ); if ( !success && - finalOutcome.status === StepStatus.SUCCESS && - (await this.isCartAdditionTerminal(runtime, task, plannerAction)) - ) { - success = true; - } - - if ( - !success && - finalOutcome.status === StepStatus.SUCCESS && - this.isFinalFormSubmissionAction(task, plannerAction) - ) { - success = true; - } - - // Auto-complete extraction tasks: if the action was a successful EXTRACT - // with actual data, and the overall task is an extraction task, mark as done. - // This prevents infinite EXTRACT loops on extraction-focused tasks. - // Guard: auto-complete if either (a) the task is purely extraction with no - // navigation/search keywords, or (b) at least one non-EXTRACT action has - // been performed already. This avoids premature completion on hybrid tasks - // like "Search for X and extract Y" while still auto-completing "Extract the - // title of the first post" when already on the right page. - const taskHasInteraction = - /\b(search|navigate|go to|click|add to|fill|submit|login|sign)\b/i.test(task); - const hasNonExtractAction = this.actionHistory.some(rec => rec.action !== 'EXTRACT'); - if ( - !success && - plannerAction.action === 'EXTRACT' && + isExtractAction && finalOutcome.status === StepStatus.SUCCESS && finalOutcome.extractedData && - isTextExtractionTask(task) && - (!taskHasInteraction || hasNonExtractAction) + (isTextExtractionTask(task) || isCountingTask(task)) && + (!taskHasInteractionLocal || hasNonExtractActionLocal) ) { if (this.config.verbose) { console.log( - `[EXTRACT] Extraction task completed successfully, transitioning to DONE` + `[EXTRACT] Extraction/counting task completed successfully, transitioning to DONE` ); } success = true; @@ -1791,8 +1776,10 @@ export class PlannerExecutorAgent { } if (!actionHistoryRecorded) { + const isExtractOrCount = + plannerAction.action === 'EXTRACT' || plannerAction.action === 'SCROLL_AND_COUNT'; const extractedText = - plannerAction.action === 'EXTRACT' && finalOutcome.extractedData + isExtractOrCount && finalOutcome.extractedData ? typeof finalOutcome.extractedData === 'object' && finalOutcome.extractedData !== null && 'text' in (finalOutcome.extractedData as Record) @@ -1834,13 +1821,17 @@ export class PlannerExecutorAgent { const taskHasInteraction = /\b(search|navigate|go to|click|add to|fill|submit|login|sign)\b/i.test(task); - const hasNonExtractAction = this.actionHistory.some(rec => rec.action !== 'EXTRACT'); + const hasNonExtractAction = this.actionHistory.some( + rec => rec.action !== 'EXTRACT' && rec.action !== 'SCROLL_AND_COUNT' + ); + const isRetryExtractOrCount = + plannerAction.action === 'EXTRACT' || plannerAction.action === 'SCROLL_AND_COUNT'; if ( !success && - plannerAction.action === 'EXTRACT' && + isRetryExtractOrCount && finalOutcome.status === StepStatus.SUCCESS && finalOutcome.extractedData && - isTextExtractionTask(task) && + (isTextExtractionTask(task) || isCountingTask(task)) && (!taskHasInteraction || hasNonExtractAction) ) { success = true; @@ -2455,6 +2446,176 @@ export class PlannerExecutorAgent { } } + // Handle SCROLL_AND_COUNT action — scroll entire page and count items + if (plannerAction.action === 'SCROLL_AND_COUNT') { + const countTarget = + plannerAction.countTarget || + plannerAction.goal || + plannerAction.intent || + plannerAction.target || + 'items'; + + if (this.config.verbose) { + console.log(`[ACTION] SCROLL_AND_COUNT - target: "${countTarget}"`); + } + + try { + await runtime.scroll('up'); + await new Promise(r => setTimeout(r, 500)); + + const viewportHeight = await runtime.getViewportHeight(); + const scrollDelta = Math.floor(viewportHeight * 0.85); + let scrollsRemaining = 50; + let consecutiveNoChange = 0; + + while (scrollsRemaining-- > 0) { + const beforeUrl = await runtime.getCurrentUrl(); + void beforeUrl; + const scrolled = await runtime.scrollBy(scrollDelta); + if (!scrolled) { + if (this.config.verbose) { + console.log(` [SCROLL_AND_COUNT] scroll returned false, stopping`); + } + break; + } + await new Promise(r => setTimeout(r, 800)); + + const snapshot = await runtime.snapshot({ screenshot: false, goal: countTarget }); + const elementCount = snapshot?.elements?.length ?? 0; + if (elementCount === 0) { + consecutiveNoChange++; + if (consecutiveNoChange >= 3) { + if (this.config.verbose) { + console.log(` [SCROLL_AND_COUNT] 3 consecutive empty viewports, stopping`); + } + break; + } + } else { + consecutiveNoChange = 0; + } + } + + if (this.config.verbose) { + console.log(` [SCROLL_AND_COUNT] finished scrolling, reading page content`); + } + + const stripThinkTags = (text: string): string => + text + .replace(//gi, '') + .replace(/= 0) { + extractedCount = String(parsed); + } + } + } + + if (extractedCount === null) { + const snapshot = await runtime.snapshot({ screenshot: false, goal: countTarget }); + const elements = snapshot?.elements || []; + const elementDescriptions = elements + .slice(0, 80) + .map(e => { + const parts: string[] = []; + if (e.role) parts.push(e.role); + if (e.text) parts.push(e.text.slice(0, 120)); + return parts.join(': '); + }) + .join('\n'); + + const countPrompt = `/no_think +You are a counting assistant. Given the page elements, count how many match the target. + +TARGET: ${countTarget} + +ELEMENTS: +${elementDescriptions} + +Return ONLY a single integer. Do not output anything else. + +COUNT:`; + + const countResp = await this.executor.generate( + 'Return only a single integer. No thinking, no explanation.', + countPrompt, + { temperature: 0.0, max_tokens: 16 } + ); + this.recordTokenUsage('extract', countResp); + + const countText = stripThinkTags((countResp.content || '').trim()); + const parsed = parseInt(countText, 10); + if (!isNaN(parsed) && parsed >= 0) { + extractedCount = String(parsed); + } + } + + if (this.config.verbose) { + console.log(` [SCROLL_AND_COUNT] final count: ${extractedCount}`); + } + + return { + stepId: stepNum, + goal: countTarget, + status: StepStatus.SUCCESS, + actionTaken: `SCROLL_AND_COUNT(${countTarget})`, + verificationPassed: true, + usedVision: false, + durationMs: Date.now() - stepStart, + urlBefore: currentUrl, + urlAfter: await runtime.getCurrentUrl(), + extractedData: { + text: extractedCount || '0', + query: `Count ${countTarget}`, + }, + }; + } catch (e) { + return { + stepId: stepNum, + goal: countTarget, + status: StepStatus.FAILED, + actionTaken: 'SCROLL_AND_COUNT', + verificationPassed: false, + usedVision: false, + durationMs: Date.now() - stepStart, + error: e instanceof Error ? e.message : String(e), + }; + } + } + // For CLICK and TYPE_AND_SUBMIT, we need to find the element const isTypeAction = plannerAction.action === 'TYPE_AND_SUBMIT'; @@ -3637,6 +3798,7 @@ export class PlannerExecutorAgent { ? (step.fields as Array<{ label: string; value: string }>) : undefined, submitText: typeof step.submitText === 'string' ? step.submitText : undefined, + countTarget: typeof step.countTarget === 'string' ? step.countTarget : undefined, }; } diff --git a/src/agents/planner-executor/prompts.ts b/src/agents/planner-executor/prompts.ts index d6f97e3..b8f4b72 100644 --- a/src/agents/planner-executor/prompts.ts +++ b/src/agents/planner-executor/prompts.ts @@ -63,6 +63,7 @@ Actions: - TYPE: Type text into a SINGLE form field. Prefer FILL_FORM for forms with multiple fields. - TYPE_AND_SUBMIT: Type text into a search box and submit. Set "input" to the SEARCH QUERY from the goal (NOT the element label). - SCROLL: Scroll page. Set "direction" to "up" or "down". +- SCROLL_AND_COUNT: Scroll through the ENTIRE page and count items. Use when the task asks "how many", "number of", "count", or "total". Set "countTarget" to describe what to count (e.g., "listings", "products", "articles"). The system scrolls viewport-by-viewport, counts matching items at each position, and sums the total. - WAIT: Wait for content to appear when a follow-up verification is needed. - EXTRACT: Extract the requested information from the current page when the task is data collection. - STUCK: Use only when the page state is blocked and you cannot make safe forward progress. @@ -103,6 +104,7 @@ Output ONLY valid JSON (no markdown, no \`\`\`): {"action":"TYPE_AND_SUBMIT","intent":"searchbox","input":"wireless headphones","verify":[{"predicate":"url_contains","args":["search"]}]} {"action":"CLICK","intent":"product link","input":"Sony WH-1000XM4 Wireless...","verify":[]} {"action":"CLICK","intent":"add to cart button","input":"Add to Cart","verify":[]} +{"action":"SCROLL_AND_COUNT","countTarget":"product listings","goal":"Count total product listings on the page"} {"action":"DONE","intent":"completed"} RULES: @@ -342,6 +344,7 @@ export interface StepwisePlannerResponse { | 'TYPE_AND_SUBMIT' | 'FILL_FORM' | 'SCROLL' + | 'SCROLL_AND_COUNT' | 'PRESS' | 'WAIT' | 'EXTRACT' @@ -353,6 +356,7 @@ export interface StepwisePlannerResponse { direction?: 'up' | 'down'; fields?: Array<{ label: string; value: string }>; submitText?: string; + countTarget?: string; verify?: Array<{ predicate: string; args: unknown[] }>; required?: boolean; stopIfTrue?: boolean; From 3c7418dfe28aee868676f4732d8e1a4687f7c5aa Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Sun, 10 May 2026 21:18:51 -0700 Subject: [PATCH 2/2] Fix missing auto-complete checks in main execution loop The isCartAdditionTerminal and isFinalFormSubmissionAction checks were missing from the main execution loop (only present in the retry loop), causing 2 test failures. Restored both checks to the main loop. --- .../planner-executor/planner-executor-agent.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/agents/planner-executor/planner-executor-agent.ts b/src/agents/planner-executor/planner-executor-agent.ts index 25d1015..f4b2736 100644 --- a/src/agents/planner-executor/planner-executor-agent.ts +++ b/src/agents/planner-executor/planner-executor-agent.ts @@ -1290,6 +1290,22 @@ export class PlannerExecutorAgent { finalOutcome.status === StepStatus.SKIPPED || finalOutcome.status === StepStatus.VISION_FALLBACK ) { + if ( + !success && + finalOutcome.status === StepStatus.SUCCESS && + (await this.isCartAdditionTerminal(runtime, task, plannerAction)) + ) { + success = true; + } + + if ( + !success && + finalOutcome.status === StepStatus.SUCCESS && + this.isFinalFormSubmissionAction(task, plannerAction) + ) { + success = true; + } + const isExtractAction = plannerAction.action === 'EXTRACT' || plannerAction.action === 'SCROLL_AND_COUNT'; const taskHasInteractionLocal =