Skip to content

Commit 6703dff

Browse files
authored
Merge pull request #210 from PredicateSystems/extraction
extraction support ported
2 parents f0f22f5 + 8365f46 commit 6703dff

11 files changed

Lines changed: 690 additions & 10 deletions

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/agents/planner-executor/category-pruner.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import {
77
} from './pruning-types';
88
import { TaskCategory } from './task-category';
99
import { pruneWithPolicy } from './data-driven-pruner';
10+
import { isTextExtractionTask } from './extraction-keywords';
1011

1112
function textOf(element: SnapshotElement): string {
1213
return String(element.text || element.name || '').toLowerCase();
@@ -141,6 +142,39 @@ function allowSearchRelaxed(element: SnapshotElement): boolean {
141142
return ['button', 'tab', 'menuitem'].includes(roleOf(element));
142143
}
143144

145+
/**
 * Allow-list predicate for extraction tasks.
 *
 * An element is kept when any of the following holds:
 *  - its role is `link` and it has a truthy `href`;
 *  - its role is a text-entry control (`searchbox`, `textbox`, `combobox`);
 *  - its role is `button`;
 *  - its role is a content-bearing one (`cell`, `row`, `listitem`, `heading`);
 *  - its `importance` is truthy and at least 200;
 *  - it is flagged `inDominantGroup`.
 *
 * @param element - Snapshot element under consideration.
 * @returns `true` if the element should survive pruning, otherwise `false`.
 */
function allowExtraction(element: SnapshotElement): boolean {
  const kind = roleOf(element);

  switch (kind) {
    case 'link':
      // A link only qualifies by role when it actually points somewhere;
      // otherwise it may still qualify via the importance/group checks below.
      if (element.href) {
        return true;
      }
      break;

    // Text-entry controls.
    case 'searchbox':
    case 'textbox':
    case 'combobox':
    // Buttons.
    case 'button':
    // Content-bearing roles.
    case 'cell':
    case 'row':
    case 'listitem':
    case 'heading':
      return true;

    default:
      break;
  }

  // Role alone did not qualify the element: fall back to its importance score.
  if (element.importance && element.importance >= 200) {
    return true;
  }

  // Last resort: membership in the dominant group.
  return element.inDominantGroup ? true : false;
}
173+
174+
/**
 * Relaxed variant of the extraction allow-list: accepts everything
 * `allowExtraction` accepts, plus any element `isInteractive` accepts.
 *
 * @param element - Snapshot element under consideration.
 * @returns `true` if either predicate accepts the element.
 */
function allowExtractionRelaxed(element: SnapshotElement): boolean {
  // Strict rule first; the interactive check is only consulted when it fails.
  if (allowExtraction(element)) {
    return true;
  }
  return isInteractive(element);
}
177+
144178
function allowGeneric(element: SnapshotElement): boolean {
145179
return ['button', 'link', 'textbox', 'searchbox', 'combobox', 'checkbox', 'radio'].includes(
146180
roleOf(element)
@@ -209,6 +243,14 @@ function getPolicy(
209243
return { maxNodes: 60, allow: allowShoppingLoose, block: () => false };
210244
}
211245

246+
if (category === PruningTaskCategory.EXTRACTION) {
247+
// Extraction tasks need: nav links for navigation, content elements for data,
248+
// search inputs, and any interactive elements for reaching the data.
249+
return relaxationLevel === 0
250+
? { maxNodes: 35, allow: allowExtraction, block: blockCommon }
251+
: { maxNodes: 50, allow: allowExtractionRelaxed, block: () => false };
252+
}
253+
212254
if (category === PruningTaskCategory.FORM_FILLING) {
213255
return relaxationLevel === 0
214256
? { maxNodes: 20, allow: allowFormFilling, block: blockCommon }
@@ -232,12 +274,23 @@ export function detectPruningCategory(
232274
): PruningTaskCategory | null {
233275
const normalizedGoal = goal.toLowerCase();
234276

277+
if (taskCategory === TaskCategory.EXTRACTION) {
278+
return PruningTaskCategory.EXTRACTION;
279+
}
235280
if (taskCategory === TaskCategory.SEARCH) {
236281
return PruningTaskCategory.SEARCH;
237282
}
238283
if (taskCategory === TaskCategory.FORM_FILL) {
239284
return PruningTaskCategory.FORM_FILLING;
240285
}
286+
287+
// Extraction keyword detection takes priority over TRANSACTION/SHOPPING
288+
// because "find the title of X" or "extract Y" on an e-commerce site is
289+
// still an extraction task, not a shopping task.
290+
if (isTextExtractionTask(normalizedGoal)) {
291+
return PruningTaskCategory.EXTRACTION;
292+
}
293+
241294
if (taskCategory === TaskCategory.TRANSACTION) {
242295
if (normalizedGoal.includes('checkout')) {
243296
return PruningTaskCategory.CHECKOUT;

0 commit comments

Comments
 (0)