-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathtestBrowserExtraction.ts
More file actions
117 lines (95 loc) · 3.47 KB
/
testBrowserExtraction.ts
File metadata and controls
117 lines (95 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
import { extract, ContentFormat, Browser } from "../index";
import { z } from "zod";
import * as path from "path";
import { config } from "dotenv";
// Populate process.env from the .env file located in the current working
// directory (resolved to an absolute path before being handed to dotenv).
const envFilePath = path.resolve(process.cwd(), ".env");
config({ path: envFilePath });
// Shape of a single product entry the LLM is asked to extract.
// name, price and productUrl are required; every other field is optional.
const productSchema = z.object({
  name: z.string().describe("Product name or title"),
  brand: z.string().optional().describe("Brand name"),
  price: z.number().describe("Current price"),
  originalPrice: z.number().optional().describe("Original price if on sale"),
  rating: z.number().optional().describe("Product rating out of 5"),
  reviewCount: z.number().optional().describe("Number of reviews"),
  productUrl: z.string().url().describe("Link to product detail page"),
  imageUrl: z.string().url().optional().describe("Product image URL"),
});

// Top-level extraction schema: an object holding the list of products.
const productCatalogSchema = z.object({
  products: z.array(productSchema).describe("List of bread and bakery products"),
});
/**
 * Loads a product catalog page in a local, non-headless browser, then passes
 * the captured HTML to `extract` with `productCatalogSchema` and prints the
 * extracted data and token usage.
 *
 * The browser is closed before the LLM extraction starts, and it is closed
 * even when creating the page, navigating or reading the content fails.
 *
 * Any error raised along the way is logged with `console.error` and not
 * rethrown, so the returned promise resolves either way.
 *
 * @returns {Promise<void>}
 */
async function testProductCatalogExtraction() {
  console.log("🍞 Testing Product Catalog Extraction...\n");
  const testUrl =
    "https://www.walmart.ca/en/browse/grocery/bread-bakery/10019_6000194327359";

  try {
    console.log(`📡 Loading product catalog page: ${testUrl}`);
    console.log("🤖 Using Browser class to load the page...\n");

    // Create browser instance
    const browser = new Browser({
      type: "local",
      headless: false,
    });
    await browser.start();
    console.log("✅ Browser started successfully");

    let html;
    try {
      // Create page and load content using direct Playwright API
      const page = await browser.newPage();
      await page.goto(testUrl);

      // Waiting for network idle is best-effort: if the wait fails (for
      // example because it exceeds the 10s timeout) we carry on with
      // whatever the page has rendered so far.
      try {
        await page.waitForLoadState("networkidle", { timeout: 10000 });
      } catch {
        console.log("Network idle timeout, continuing...");
      }

      html = await page.content();
      console.log(`📄 Loaded ${html.length} characters of HTML`);
    } finally {
      // Always release the browser once it has started, including when
      // page creation, navigation or content retrieval throws.
      await browser.close();
      console.log("✅ Browser closed");
    }

    // Now extract product data from the loaded HTML
    console.log("\n🧠 Extracting product data using LLM...");
    const result = await extract({
      llm: new ChatGoogleGenerativeAI({
        apiKey: process.env.GOOGLE_API_KEY,
        model: "gemini-2.5-flash",
        temperature: 0,
      }),
      content: html,
      format: ContentFormat.HTML,
      sourceUrl: testUrl,
      schema: productCatalogSchema,
      htmlExtractionOptions: {
        extractMainHtml: true,
        includeImages: true,
        cleanUrls: true,
      },
    });

    console.log("✅ Extraction successful!");
    console.log("🍞 EXTRACTED PRODUCT CATALOG DATA:");
    console.log("=".repeat(80));
    console.log(JSON.stringify(result.data, null, 2));
    console.log("=".repeat(80));
    console.log("\n💰 Token Usage:");
    console.log(`Input tokens: ${result.usage.inputTokens}`);
    console.log(`Output tokens: ${result.usage.outputTokens}`);
  } catch (error) {
    console.error("❌ Error during product catalog extraction:", error);
  }
}
/**
 * Script entry point.
 *
 * Verifies that the GOOGLE_API_KEY environment variable is set (exiting the
 * process with status 1 when it is not), then runs the product catalog
 * extraction and logs start/finish banners around it.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { GOOGLE_API_KEY: apiKey } = process.env;
  if (!apiKey) {
    console.error("❌ Please set GOOGLE_API_KEY environment variable");
    process.exit(1);
  }

  console.log("🚀 Starting product catalog extraction\n");
  await testProductCatalogExtraction();
  console.log("\n🎉 Extraction completed!");
}
// Run main() only when this file is the entry module (require.main === module),
// not when it is loaded by another module. Rejections from main() are reported
// via console.error.
const isEntryPoint = require.main === module;
if (isEntryPoint) {
  main().catch((err) => console.error(err));
}
export { testProductCatalogExtraction };