77} from './pruning-types' ;
88import { TaskCategory } from './task-category' ;
99import { pruneWithPolicy } from './data-driven-pruner' ;
10+ import { isTextExtractionTask } from './extraction-keywords' ;
1011
1112function textOf ( element : SnapshotElement ) : string {
1213 return String ( element . text || element . name || '' ) . toLowerCase ( ) ;
@@ -141,6 +142,39 @@ function allowSearchRelaxed(element: SnapshotElement): boolean {
141142 return [ 'button' , 'tab' , 'menuitem' ] . includes ( roleOf ( element ) ) ;
142143}
143144
/**
 * Allow-predicate for extraction tasks: decides whether a snapshot element
 * should be kept when pruning.
 *
 * An element is kept when any of the following holds:
 *  - its role is a search/text input, a button, or a content-bearing role
 *    (table cell, row, list item, heading);
 *  - it is a link that carries an `href` (nav links are how the data page is
 *    reached, e.g. the "Show" link on HN);
 *  - its `importance` is at least 200;
 *  - it is flagged as belonging to the dominant group.
 *
 * @param element - Snapshot element under consideration.
 * @returns `true` if the element should be kept, `false` otherwise.
 */
function allowExtraction(element: SnapshotElement): boolean {
  const kind = roleOf(element);

  // Roles that are accepted on role alone: inputs used to search for data,
  // buttons used for navigation/actions, and content-bearing containers.
  const acceptedRoles = [
    'searchbox',
    'textbox',
    'combobox',
    'button',
    'cell',
    'row',
    'listitem',
    'heading',
  ];
  if (acceptedRoles.includes(kind)) {
    return true;
  }

  // Links only count when they actually point somewhere.
  if (kind === 'link' && element.href) {
    return true;
  }

  // A missing or zero importance never passes; otherwise compare to the threshold.
  const score = element.importance;
  const isHighImportance = score ? score >= 200 : false;
  if (isHighImportance) {
    return true;
  }

  // Last chance: membership in the dominant group (main content area).
  return element.inDominantGroup ? true : false;
}
173+
/**
 * Relaxed allow-predicate for extraction tasks.
 *
 * Keeps everything `allowExtraction` keeps and, in addition, any element
 * that `isInteractive` reports as interactive.
 *
 * @param element - Snapshot element under consideration.
 * @returns `true` if the element passes either check.
 */
function allowExtractionRelaxed(element: SnapshotElement): boolean {
  // Strict rule first; the interactive check is only consulted on a miss.
  if (allowExtraction(element)) {
    return true;
  }
  return isInteractive(element);
}
177+
144178function allowGeneric ( element : SnapshotElement ) : boolean {
145179 return [ 'button' , 'link' , 'textbox' , 'searchbox' , 'combobox' , 'checkbox' , 'radio' ] . includes (
146180 roleOf ( element )
@@ -209,6 +243,14 @@ function getPolicy(
209243 return { maxNodes : 60 , allow : allowShoppingLoose , block : ( ) => false } ;
210244 }
211245
246+ if ( category === PruningTaskCategory . EXTRACTION ) {
247+ // Extraction tasks need: nav links for navigation, content elements for data,
248+ // search inputs, and any interactive elements for reaching the data.
249+ return relaxationLevel === 0
250+ ? { maxNodes : 35 , allow : allowExtraction , block : blockCommon }
251+ : { maxNodes : 50 , allow : allowExtractionRelaxed , block : ( ) => false } ;
252+ }
253+
212254 if ( category === PruningTaskCategory . FORM_FILLING ) {
213255 return relaxationLevel === 0
214256 ? { maxNodes : 20 , allow : allowFormFilling , block : blockCommon }
@@ -232,12 +274,23 @@ export function detectPruningCategory(
232274) : PruningTaskCategory | null {
233275 const normalizedGoal = goal . toLowerCase ( ) ;
234276
277+ if ( taskCategory === TaskCategory . EXTRACTION ) {
278+ return PruningTaskCategory . EXTRACTION ;
279+ }
235280 if ( taskCategory === TaskCategory . SEARCH ) {
236281 return PruningTaskCategory . SEARCH ;
237282 }
238283 if ( taskCategory === TaskCategory . FORM_FILL ) {
239284 return PruningTaskCategory . FORM_FILLING ;
240285 }
286+
287+ // Extraction keyword detection takes priority over TRANSACTION/SHOPPING
288+ // because "find the title of X" or "extract Y" on an e-commerce site is
289+ // still an extraction task, not a shopping task.
290+ if ( isTextExtractionTask ( normalizedGoal ) ) {
291+ return PruningTaskCategory . EXTRACTION ;
292+ }
293+
241294 if ( taskCategory === TaskCategory . TRANSACTION ) {
242295 if ( normalizedGoal . includes ( 'checkout' ) ) {
243296 return PruningTaskCategory . CHECKOUT ;
0 commit comments