diff --git a/README.md b/README.md index 255a186..698caad 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,23 @@ Lightfeed Extractor is a Typescript library built for robust web data extraction ## Installation ```bash -npm install @lightfeed/extractor +npm install @lightfeed/extractor @langchain/openai +``` + +Install the LangChain integration package for your chosen provider. `@langchain/core` is a peer dependency, so the extractor and your provider package share a single copy (npm 7+ installs it automatically): + +```bash +# OpenAI +npm install @lightfeed/extractor @langchain/openai + +# Google Gemini +npm install @lightfeed/extractor @langchain/google-genai + +# Anthropic +npm install @lightfeed/extractor @langchain/anthropic + +# Ollama (local models) +npm install @lightfeed/extractor @langchain/ollama ``` ## Usage @@ -61,7 +77,8 @@ npm install @lightfeed/extractor This example demonstrates extracting structured product data from a real e-commerce website using a local headed Playwright browser. For production environments, you can use a Playwright browser in [serverless](#serverless-browser) or [remote](#remote-browser) mode. 
```typescript -import { extract, ContentFormat, LLMProvider, Browser } from "@lightfeed/extractor"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat, Browser } from "@lightfeed/extractor"; import { z } from "zod"; // Define schema for product catalog extraction @@ -114,12 +131,15 @@ try { // Extract structured product data console.log("Extracting product data using LLM..."); const result = await extract({ + llm: new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }), content: html, format: ContentFormat.HTML, sourceUrl: pageUrl, schema: productCatalogSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, // Use environment variable htmlExtractionOptions: { extractMainHtml: true, includeImages: true, @@ -164,15 +184,21 @@ try { ### Extracting from Markdown or Plain Text -You can also extract structured data directly from HTML, Markdown or text string: +You can also extract structured data directly from HTML, Markdown or text string. Pass any [LangChain chat model](https://js.langchain.com/docs/integrations/chat/): ```typescript +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat } from "@lightfeed/extractor"; + const result = await extract({ + llm: new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }), content: markdownContent, - // Specify that content is Markdown. 
In addition to HTML and Markdown, you can also extract plain text by ContentFormat.TXT format: ContentFormat.MARKDOWN, schema: mySchema, - googleApiKey: "your-google-gemini-api-key", }); ``` @@ -182,13 +208,12 @@ You can provide a custom prompt to guide the extraction process: ```typescript const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: mySchema, sourceUrl: "https://example.com/products", - // In custom prompt, defined what data should be retrieved prompt: "Extract ONLY products that are on sale or have special discounts. Include their original prices, discounted prices, and product URL.", - googleApiKey: "your-google-gemini-api-key", }); ``` @@ -222,12 +247,12 @@ const schema = z.object({ }); const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: schema, sourceUrl: "https://acme.com/products/smart-security-camera", extractionContext: extractionContext, - googleApiKey: "your-google-gemini-api-key", }); // The LLM will use the context to extract store name (acme) and consider the location @@ -241,27 +266,44 @@ console.log(result.data); // } ``` -### Customizing LLM Provider and Managing Token Limits +### Using Any LangChain Model + +Pass **any LangChain chat model** via the `llm` option. 
Use OpenAI, Google Gemini, Anthropic, Mistral, Ollama, Azure OpenAI, AWS Bedrock, or any [LangChain-supported provider](https://js.langchain.com/docs/integrations/chat/): + +```typescript +// OpenAI +import { ChatOpenAI } from "@langchain/openai"; +const llm = new ChatOpenAI({ modelName: "gpt-4o-mini", apiKey: process.env.OPENAI_API_KEY }); + +// Google Gemini +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +const llm = new ChatGoogleGenerativeAI({ model: "gemini-2.5-flash", apiKey: process.env.GOOGLE_API_KEY }); + +// Anthropic +import { ChatAnthropic } from "@langchain/anthropic"; +const llm = new ChatAnthropic({ model: "claude-sonnet-4-20250514", apiKey: process.env.ANTHROPIC_API_KEY }); + +// Ollama (local) +import { ChatOllama } from "@langchain/ollama"; +const llm = new ChatOllama({ model: "llama3" }); +``` + +### Managing Token Limits -You can customize LLM and manage token limits to control costs and ensure your content fits within the model's maximum context window: +Use `maxInputTokens` to truncate content when it exceeds the model's context window: ```typescript -// Extract from Markdown with token limit const result = await extract({ + llm: new ChatOpenAI({ modelName: "gpt-4o-mini", apiKey: "..." }), content: markdownContent, format: ContentFormat.MARKDOWN, schema, - // Provide model provider and model name - provider: LLMProvider.OPENAI, - modelName: "gpt-4o-mini", - openaiApiKey: "your-openai-api-key", - // Limit to roughly 128K tokens (max input for gpt-4o-mini) - maxInputTokens: 128000, + maxInputTokens: 128000, // Roughly 128K tokens (4 chars/token) }); ``` > [!WARNING] -> For OpenAI models, optional schema is not supported. You need to change `.optional()` to `.nullable()`. +> For OpenAI models, optional schema is not supported. Use `.nullable()` instead of `.optional()`. 
### Extracting from Main HTML @@ -269,6 +311,7 @@ For blog posts or articles with lots of navigation elements, headers, and footer ```typescript const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: mySchema, @@ -304,6 +347,7 @@ const productListSchema = z.object({ }); const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: mySchema, @@ -320,6 +364,7 @@ The library can clean URLs to remove tracking parameters and unnecessary compone ```typescript const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: mySchema, @@ -337,17 +382,9 @@ const result = await extract({ ## LLM Extraction Function -### LLM API Keys - -The library currently supports Google Gemini and OpenAI ChatGPT models. It will check for LLM API keys in the following order: - -1. Directly provided API key parameter (`googleApiKey` or `openaiApiKey`) -2. Environment variables (`GOOGLE_API_KEY` or `OPENAI_API_KEY`) +### LLM Configuration -While the library can use environment variables, it's recommended to explicitly provide API keys in production code for better control and transparency. - -> [!NOTE] -> Want support for additional LLM providers? Please [create an issue](https://github.com/lightfeed/extractor/issues/new/choose) and let us know which providers you'd like to see supported. +Pass a [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance via the `llm` option. Install the LangChain integration for your provider (e.g. `@langchain/openai`, `@langchain/google-genai`, `@langchain/anthropic`) and configure API keys on the model instance. ### `extract<T>(options: ExtractorOptions<T>): Promise<ExtractorResult<T>>` @@ -357,15 +394,11 @@ Main function to extract structured data from content. 
| Option | Type | Description | Default | |--------|------|-------------|---------| +| `llm` | `BaseChatModel` | A [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance (ChatOpenAI, ChatGoogleGenerativeAI, ChatAnthropic, etc.) | Required | | `content` | `string` | HTML, markdown, or plain text content to extract from | Required | | `format` | `ContentFormat` | Content format (HTML, MARKDOWN, or TXT) | Required | | `schema` | `z.ZodTypeAny` | Zod schema defining the structure to extract | Required | | `prompt` | `string` | Custom prompt to guide the extraction process | Internal default prompt | -| `provider` | `LLMProvider` | LLM provider (GOOGLE_GEMINI or OPENAI) | `LLMProvider.GOOGLE_GEMINI` | -| `modelName` | `string` | Model name to use | Provider-specific default, Google Gemini 2.5 flash or OpenAI GPT-4o mini | -| `googleApiKey` | `string` | Google Gemini API key (if using Google Gemini provider) | From env `GOOGLE_API_KEY` | -| `openaiApiKey` | `string` | OpenAI API key (if using OpenAI provider) | From env `OPENAI_API_KEY` | -| `temperature` | `number` | Temperature for the LLM (0-1) | `0` | | `htmlExtractionOptions` | `HTMLExtractionOptions` | HTML-specific options for content extraction [see below](#htmlextractionoptions) | `{}` | | `sourceUrl` | `string` | URL of the HTML content, required when format is HTML to properly handle relative URLs | Required for HTML format | | `maxInputTokens` | `number` | Maximum number of input tokens to send to the LLM. Uses a rough conversion of 4 characters per token. When specified, content will be truncated if the total prompt size exceeds this limit. 
| `undefined` | @@ -653,6 +686,7 @@ const schema = z.object({ }); const result = await extract({ + llm: myLLM, content: markdownContent, format: ContentFormat.MARKDOWN, schema, diff --git a/package.json b/package.json index 3d9fa01..774356e 100644 --- a/package.json +++ b/package.json @@ -42,8 +42,11 @@ "html", "markdown", "structured-data", + "langchain", "openai", - "gemini" + "gemini", + "anthropic", + "ollama" ], "author": "Lightfeed", "license": "Apache-2.0", @@ -51,13 +54,12 @@ "url": "https://github.com/lightfeed/extractor/issues" }, "homepage": "https://github.com/lightfeed/extractor#readme", + "peerDependencies": { + "@langchain/core": ">=1.1.31" + }, "dependencies": { - "@langchain/core": "^1.1.31", - "@langchain/google-genai": "^2.1.24", - "@langchain/openai": "^1.2.12", "cheerio": "^1.0.0", "jsonrepair": "^3.12.0", - "langchain": "^1.2.30", "playwright": "npm:rebrowser-playwright-core@1.49.1", "turndown": "^7.2.0", "xmldom": "^0.6.0", @@ -65,6 +67,9 @@ "zod": "^3.24.3" }, "devDependencies": { + "@langchain/core": "^1.1.31", + "@langchain/google-genai": "^2.1.24", + "@langchain/openai": "^1.2.12", "@types/jest": "^29.5.12", "@types/node": "^22.15.3", "@types/turndown": "^5.0.5", diff --git a/src/dev/runLocalTest.ts b/src/dev/runLocalTest.ts index e6b373a..1f51da9 100644 --- a/src/dev/runLocalTest.ts +++ b/src/dev/runLocalTest.ts @@ -2,11 +2,30 @@ import * as fs from "fs"; import * as path from "path"; import { config } from "dotenv"; import { z } from "zod"; -import { extract, ContentFormat, LLMProvider } from "../index"; +import { ChatOpenAI } from "@langchain/openai"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat } from "../index"; // Load environment variables from .env file config({ path: path.resolve(process.cwd(), ".env") }); +type Provider = "gemini" | "openai"; + +function createLLM(provider: Provider) { + if (provider === "gemini") { + return new ChatGoogleGenerativeAI({ + apiKey: 
process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }); + } + return new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName: "gpt-4o-mini", + temperature: 0, + }); +} + // Helper to load HTML test fixtures function loadFixture(filename: string): string { return fs.readFileSync( @@ -67,34 +86,25 @@ const productSchemaOpenAI = z.object({ }); // Test functions -async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) { +async function testBlogExtraction(provider: Provider = "gemini") { console.log(`Testing blog post extraction with ${provider}...`); try { const html = loadFixture("blog-post.html"); - // Check for required API key - if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) { + if (provider === "gemini" && !process.env.GOOGLE_API_KEY) { console.error("Error: GOOGLE_API_KEY environment variable is required"); process.exit(1); - } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) { + } else if (provider === "openai" && !process.env.OPENAI_API_KEY) { console.error("Error: OPENAI_API_KEY environment variable is required"); process.exit(1); } - const apiKey = - provider === LLMProvider.GOOGLE_GEMINI - ? process.env.GOOGLE_API_KEY - : process.env.OPENAI_API_KEY; - const result = await extract({ + llm: createLLM(provider), content: html, format: ContentFormat.HTML, - schema: - provider === LLMProvider.GOOGLE_GEMINI ? blogSchema : blogSchemaOpenAI, - provider, - googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined, - openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined, + schema: provider === "gemini" ? 
blogSchema : blogSchemaOpenAI, htmlExtractionOptions: { extractMainHtml: false, }, @@ -113,36 +123,25 @@ async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) { } } -async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) { +async function testProductExtraction(provider: Provider = "gemini") { console.log(`Testing product listing extraction with ${provider}...`); try { const html = loadFixture("product-list.html"); - // Check for required API key - if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) { + if (provider === "gemini" && !process.env.GOOGLE_API_KEY) { console.error("Error: GOOGLE_API_KEY environment variable is required"); process.exit(1); - } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) { + } else if (provider === "openai" && !process.env.OPENAI_API_KEY) { console.error("Error: OPENAI_API_KEY environment variable is required"); process.exit(1); } - const apiKey = - provider === LLMProvider.GOOGLE_GEMINI - ? process.env.GOOGLE_API_KEY - : process.env.OPENAI_API_KEY; - const result = await extract({ + llm: createLLM(provider), content: html, format: ContentFormat.HTML, - schema: - provider === LLMProvider.GOOGLE_GEMINI - ? productSchema - : productSchemaOpenAI, - provider, - googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined, - openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined, + schema: provider === "gemini" ? productSchema : productSchemaOpenAI, htmlExtractionOptions: { extractMainHtml: true, }, @@ -163,38 +162,32 @@ async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) { // Run tests based on command line arguments async function main() { - // Parse arguments: content type and provider const args = process.argv.slice(2); - const contentType = args[0] || "all"; // 'blog', 'product', or 'all' - const provider = - args[1]?.toUpperCase() === "OPENAI" - ? 
LLMProvider.OPENAI - : args[1]?.toUpperCase() === "GEMINI" - ? LLMProvider.GOOGLE_GEMINI - : "all"; // 'OPENAI', 'GEMINI', or 'all' + const contentType = args[0] || "all"; + const providerArg = args[1]?.toUpperCase(); + const provider: Provider | "all" = + providerArg === "OPENAI" ? "openai" : providerArg === "GEMINI" ? "gemini" : "all"; console.log("API Keys available:"); console.log(`- GOOGLE_API_KEY: ${process.env.GOOGLE_API_KEY ? "Yes" : "No"}`); console.log(`- OPENAI_API_KEY: ${process.env.OPENAI_API_KEY ? "Yes" : "No"}`); console.log(""); - // Run blog tests if (contentType === "blog" || contentType === "all") { - if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") { - await testBlogExtraction(LLMProvider.GOOGLE_GEMINI); + if (provider === "gemini" || provider === "all") { + await testBlogExtraction("gemini"); } - if (provider === LLMProvider.OPENAI || provider === "all") { - await testBlogExtraction(LLMProvider.OPENAI); + if (provider === "openai" || provider === "all") { + await testBlogExtraction("openai"); } } - // Run product tests if (contentType === "product" || contentType === "all") { - if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") { - await testProductExtraction(LLMProvider.GOOGLE_GEMINI); + if (provider === "gemini" || provider === "all") { + await testProductExtraction("gemini"); } - if (provider === LLMProvider.OPENAI || provider === "all") { - await testProductExtraction(LLMProvider.OPENAI); + if (provider === "openai" || provider === "all") { + await testProductExtraction("openai"); } } } diff --git a/src/dev/testBrowserExtraction.ts b/src/dev/testBrowserExtraction.ts index 672816a..bcc2aca 100644 --- a/src/dev/testBrowserExtraction.ts +++ b/src/dev/testBrowserExtraction.ts @@ -1,4 +1,5 @@ -import { extract, ContentFormat, LLMProvider, Browser } from "../index"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat, Browser } from "../index"; import { z } from 
"zod"; import * as path from "path"; import { config } from "dotenv"; @@ -65,12 +66,15 @@ async function testProductCatalogExtraction() { console.log("\n🧠 Extracting product data using LLM..."); const result = await extract({ + llm: new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }), content: html, format: ContentFormat.HTML, sourceUrl: testUrl, schema: productCatalogSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, htmlExtractionOptions: { extractMainHtml: true, includeImages: true, diff --git a/src/dev/testUsage.ts b/src/dev/testUsage.ts index d5c15ff..86887c6 100644 --- a/src/dev/testUsage.ts +++ b/src/dev/testUsage.ts @@ -1,7 +1,8 @@ import { config } from "dotenv"; import * as path from "path"; import { z } from "zod"; -import { extract, ContentFormat, LLMProvider } from "../index"; +import { ChatOpenAI } from "@langchain/openai"; +import { extract, ContentFormat } from "../index"; // Load environment variables from .env file config({ path: path.resolve(process.cwd(), ".env") }); @@ -32,11 +33,14 @@ This is a test of the usage tracking system. 
try { // Run extraction const result = await extract({ + llm: new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName: "gpt-4o-mini", + temperature: 0, + }), content: markdown, format: ContentFormat.MARKDOWN, schema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Log the results diff --git a/src/example.ts b/src/example.ts index 470c715..2e694e2 100644 --- a/src/example.ts +++ b/src/example.ts @@ -1,4 +1,5 @@ -import { extract, ContentFormat, LLMProvider } from "./index"; +import { ChatOpenAI } from "@langchain/openai"; +import { extract, ContentFormat } from "./index"; import { z } from "zod"; import { config } from "dotenv"; import * as path from "path"; @@ -11,8 +12,8 @@ config({ path: path.resolve(process.cwd(), ".env") }); async function example() { try { // Check if API key is available - if (!process.env.GOOGLE_API_KEY) { - console.error("Error: GOOGLE_API_KEY environment variable is required"); + if (!process.env.OPENAI_API_KEY) { + console.error("Error: OPENAI_API_KEY environment variable is required"); return; } @@ -46,12 +47,14 @@ async function example() { // Extract data from HTML const result = await extract({ + llm: new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName: "gpt-4o-mini", + temperature: 0, + }), content: htmlContent, format: ContentFormat.HTML, schema, - // Using Google Gemini by default - openaiApiKey: process.env.OPENAI_API_KEY, - provider: LLMProvider.OPENAI, sourceUrl, }); diff --git a/src/extractors.ts b/src/extractors.ts index 901e188..24bb4c5 100644 --- a/src/extractors.ts +++ b/src/extractors.ts @@ -1,7 +1,6 @@ -import { ChatOpenAI } from "@langchain/openai"; -import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import type { BaseChatModel } from "@langchain/core/language_models/chat_models"; import { z } from "zod"; -import { LLMProvider, Usage, ContentFormat } from "./types"; +import { Usage, ContentFormat } from "./types"; import { AIMessage } from 
"@langchain/core/messages"; import { safeSanitizedParser, @@ -39,35 +38,6 @@ export function getUsage(output: LLMResult): Usage { return usage; } -/** - * Create LLM instance based on provider and configuration - */ -export function createLLM( - provider: LLMProvider, - modelName: string, - apiKey: string, - temperature: number = 0, -) { - switch (provider) { - case LLMProvider.OPENAI: - return new ChatOpenAI({ - apiKey, - modelName, - temperature, - }); - - case LLMProvider.GOOGLE_GEMINI: - return new ChatGoogleGenerativeAI({ - apiKey, - model: modelName, - temperature, - }); - - default: - throw new Error(`Unsupported LLM provider: ${provider}`); - } -} - interface ExtractionPromptOptions { format: string; content: string; @@ -181,16 +151,12 @@ export function truncateContent({ export async function extractWithLLM( content: string, schema: T, - provider: LLMProvider, - modelName: string, - apiKey: string, - temperature: number = 0, + llm: BaseChatModel, customPrompt?: string, format: string = ContentFormat.MARKDOWN, maxInputTokens?: number, extractionContext?: Record, ): Promise<{ data: z.infer; usage: Usage }> { - const llm = createLLM(provider, modelName, apiKey, temperature); let usage: Usage = {}; // Truncate content if maxInputTokens is specified diff --git a/src/index.ts b/src/index.ts index 08cf600..8865e00 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,30 +3,19 @@ import { htmlToMarkdown } from "./converters"; import { extractWithLLM } from "./extractors"; import { ContentFormat, - LLMProvider, ExtractorOptions, ExtractorResult, HTMLExtractionOptions, } from "./types"; -// Default model names -const DEFAULT_MODELS = { - [LLMProvider.GOOGLE_GEMINI]: "gemini-2.5-flash", - [LLMProvider.OPENAI]: "gpt-4o-mini", -}; - /** * Extract structured data from HTML, markdown, or plain text content using an LLM * * @param options Configuration options for extraction + * @param options.llm A LangChain chat model instance (ChatOpenAI, ChatAnthropic, etc.) 
* @param options.content HTML, markdown, or plain text content to extract from * @param options.format Content format (HTML, MARKDOWN, or TXT) * @param options.schema Zod schema defining the structure to extract - * @param options.provider LLM provider (GOOGLE_GEMINI or OPENAI) - * @param options.modelName Model name to use (provider-specific) - * @param options.googleApiKey Google API key (if using Google Gemini provider) - * @param options.openaiApiKey OpenAI API key (if using OpenAI provider) - * @param options.temperature Temperature for the LLM (0-1) * @param options.prompt Custom prompt to guide the extraction process * @param options.sourceUrl URL of the HTML content (required for HTML format) * @param options.htmlExtractionOptions HTML-specific options for content extraction @@ -37,27 +26,6 @@ const DEFAULT_MODELS = { export async function extract( options: ExtractorOptions ): Promise>> { - // Validate required parameters - const provider = options.provider ?? LLMProvider.GOOGLE_GEMINI; - let apiKey: string; - - if (provider === LLMProvider.GOOGLE_GEMINI) { - apiKey = options.googleApiKey ?? process.env.GOOGLE_API_KEY ?? ""; - if (!apiKey) { - throw new Error( - "Google API key is required. Provide googleApiKey option or set GOOGLE_API_KEY environment variable." - ); - } - } else if (provider === LLMProvider.OPENAI) { - apiKey = options.openaiApiKey ?? process.env.OPENAI_API_KEY ?? ""; - if (!apiKey) { - throw new Error( - "OpenAI API key is required. Provide openaiApiKey option or set OPENAI_API_KEY environment variable." - ); - } - } else { - throw new Error(`Unsupported LLM provider: ${provider}`); - } // Validate sourceUrl for HTML format if (options.format === ContentFormat.HTML && !options.sourceUrl) { @@ -66,9 +34,6 @@ export async function extract( ); } - // Get model name (use defaults if not provided) - const modelName = options.modelName ?? 
DEFAULT_MODELS[provider]; - // Convert HTML to markdown if needed let content = options.content; let formatToUse = options.format; @@ -79,7 +44,6 @@ export async function extract( options.htmlExtractionOptions, options.sourceUrl ); - // For the LLM, the content is now markdown formatToUse = ContentFormat.MARKDOWN; } @@ -87,17 +51,13 @@ export async function extract( const { data, usage } = await extractWithLLM( content, options.schema, - provider, - modelName, - apiKey, - options.temperature ?? 0, + options.llm, options.prompt, - formatToUse.toString(), // Pass the correct format based on actual content + formatToUse.toString(), options.maxInputTokens, options.extractionContext ); - // Return the full result return { data, processedContent: content, diff --git a/src/types.ts b/src/types.ts index 9d828ba..3642a0b 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,4 +1,5 @@ import { z } from "zod"; +import type { BaseChatModel } from "@langchain/core/language_models/chat_models"; import type { Browser, LaunchOptions, ConnectOverCDPOptions } from "playwright"; /** @@ -10,14 +11,6 @@ export enum ContentFormat { TXT = "txt", } -/** - * Supported LLM providers - */ -export enum LLMProvider { - OPENAI = "openai", - GOOGLE_GEMINI = "google_gemini", -} - /** * Proxy configuration for network requests */ @@ -126,20 +119,18 @@ export interface ExtractorOptions { /** Schema for structured extraction */ schema: T; - /** LLM Provider (OpenAI or Google Gemini) */ - provider?: LLMProvider; - - /** Model name to use */ - modelName?: string; - - /** OpenAI API key */ - openaiApiKey?: string; - - /** Google API key */ - googleApiKey?: string; - - /** Temperature for the LLM (0-1), defaults to 0 */ - temperature?: number; + /** + * A LangChain chat model instance to use for extraction. + * Accepts any LangChain chat model (ChatOpenAI, ChatAnthropic, ChatGoogleGenerativeAI, etc.). 
+ * + * @example + * ```typescript + * import { ChatOpenAI } from "@langchain/openai"; + * const llm = new ChatOpenAI({ model: "gpt-4o-mini" }); + * const result = await extract({ llm, content, format, schema }); + * ``` + */ + llm: BaseChatModel; /** HTML-specific extraction options (only applies when format is HTML) */ htmlExtractionOptions?: HTMLExtractionOptions; diff --git a/tests/integration/browser-extraction.test.ts b/tests/integration/browser-extraction.test.ts index 6230796..f62125f 100644 --- a/tests/integration/browser-extraction.test.ts +++ b/tests/integration/browser-extraction.test.ts @@ -1,6 +1,15 @@ -import { extract, ContentFormat, LLMProvider, Browser } from "../../src/index"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat, Browser } from "../../src/index"; import { z } from "zod"; +function createGeminiLLM() { + return new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }); +} + const testSchema = z.object({ title: z.string(), description: z.string().optional(), @@ -33,12 +42,11 @@ describe("Browser + Extraction Integration Tests", () => { // Extract data from the loaded HTML const result = await extract({ + llm: createGeminiLLM(), content: html, format: ContentFormat.HTML, sourceUrl: testUrl, schema: testSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, }); expect(result.data).toBeDefined(); diff --git a/tests/integration/extract.test.ts b/tests/integration/extract.test.ts index f944306..26d3054 100644 --- a/tests/integration/extract.test.ts +++ b/tests/integration/extract.test.ts @@ -1,14 +1,31 @@ import * as fs from "fs"; import * as path from "path"; import { z } from "zod"; +import { ChatOpenAI } from "@langchain/openai"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; import { extract, ContentFormat, - LLMProvider, ExtractorResult, } from "../../src"; import { 
htmlToMarkdown } from "../../src/converters"; +function createGeminiLLM() { + return new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }); +} + +function createOpenAILLM(modelName = "gpt-4o-mini") { + return new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName, + temperature: 0, + }); +} + // Read the sample HTML files const blogPostHtml = fs.readFileSync( path.resolve(__dirname, "../fixtures/blog-post.html"), @@ -77,11 +94,10 @@ describe("Extract Integration Tests", () => { describe("Blog Post Extraction", () => { test("should extract blog post data using Google Gemini default model", async () => { const result = await extract({ + llm: createGeminiLLM(), content: blogPostHtml, format: ContentFormat.HTML, schema: blogSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, sourceUrl: "https://example.com/blog/async-await", }); @@ -90,11 +106,10 @@ describe("Extract Integration Tests", () => { test("should extract blog post data using OpenAI default model", async () => { const result = await extract({ + llm: createOpenAILLM(), content: blogPostHtml, format: ContentFormat.HTML, schema: blogSchemaOpenAI, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, sourceUrl: "https://example.com/blog/async-await", }); @@ -223,11 +238,10 @@ describe("Extract Integration Tests", () => { describe("Product List Extraction", () => { test("should extract product list data using Google Gemini", async () => { const result = await extract({ + llm: createGeminiLLM(), content: productListHtml, format: ContentFormat.HTML, schema: productSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, sourceUrl: "https://example.com/products", htmlExtractionOptions: { extractMainHtml: true, @@ -239,11 +253,10 @@ describe("Extract Integration Tests", () => { test("should extract product list data using OpenAI", async () => { 
const result = await extract({ + llm: createOpenAILLM(), content: productListHtml, format: ContentFormat.HTML, schema: productSchemaOpenAI, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, sourceUrl: "https://example.com/products", htmlExtractionOptions: { extractMainHtml: true, @@ -268,9 +281,7 @@ describe("Extract Integration Tests", () => { // a value that is not expected by the schema. price: z.number().describe("Use 'N/A' if not available").nullable(), }), - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, - modelName: "gpt-3.5-turbo", + llm: createOpenAILLM("gpt-3.5-turbo"), }); expect(result.data).toEqual( expect.objectContaining({ @@ -299,8 +310,7 @@ describe("Extract Integration Tests", () => { // to fail in some cases to return the structured output. content: z.string().optional(), }), - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, + llm: createGeminiLLM(), sourceUrl: "https://example.com/blog/async-await", }); expect(result.data).toBeDefined(); @@ -319,11 +329,10 @@ describe("Extract Integration Tests", () => { }); const result = await extract({ + llm: createOpenAILLM(), content: markdownContent, format: ContentFormat.MARKDOWN, schema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Verify the extracted data @@ -347,11 +356,10 @@ describe("Extract Integration Tests", () => { }); const result = await extract({ + llm: createOpenAILLM(), content: markdownContent, format: ContentFormat.MARKDOWN, schema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Verify the extracted data @@ -378,11 +386,10 @@ describe("Extract Integration Tests", () => { }; const result = await extract({ + llm: createGeminiLLM(), content: blogPostHtml, format: ContentFormat.HTML, schema: blogSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, sourceUrl: "https://example.com/blog/async-await", 
extractionContext: partialData, }); @@ -400,11 +407,10 @@ describe("Extract Integration Tests", () => { }; const result = await extract({ + llm: createOpenAILLM(), content: blogPostHtml, format: ContentFormat.HTML, schema: blogSchemaOpenAI, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, sourceUrl: "https://example.com/blog/async-await", extractionContext: partialData, }); @@ -436,11 +442,10 @@ describe("Extract Integration Tests", () => { }; const result = await extract({ + llm: createGeminiLLM(), content: productListHtml, format: ContentFormat.HTML, schema: productSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, sourceUrl: "https://example.com/products", extractionContext: partialData, prompt: @@ -596,11 +601,10 @@ describe("Image Extraction Integration Tests", () => { // Test with OpenAI test("should extract images using OpenAI when includeImages is true", async () => { const result = await extract({ + llm: createOpenAILLM(), content: articleWithImages, format: ContentFormat.HTML, schema: articleSchemaOpenAI, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, htmlExtractionOptions: { includeImages: true, }, @@ -613,11 +617,10 @@ describe("Image Extraction Integration Tests", () => { // Test with Google Gemini test("should extract images using Google Gemini when includeImages is true", async () => { const result = await extract({ + llm: createGeminiLLM(), content: articleWithImages, format: ContentFormat.HTML, schema: articleSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, htmlExtractionOptions: { includeImages: true, }, diff --git a/tests/integration/processedContent.test.ts b/tests/integration/processedContent.test.ts index 3cc1392..663db63 100644 --- a/tests/integration/processedContent.test.ts +++ b/tests/integration/processedContent.test.ts @@ -1,5 +1,14 @@ import { z } from "zod"; -import { extract, ContentFormat, 
LLMProvider } from "../../src"; +import { ChatOpenAI } from "@langchain/openai"; +import { extract, ContentFormat } from "../../src"; + +function createOpenAILLM() { + return new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName: "gpt-4o-mini", + temperature: 0, + }); +} describe("ProcessedContent Integration Tests", () => { const simpleSchema = z.object({ @@ -25,11 +34,10 @@ describe("ProcessedContent Integration Tests", () => { "Title: Simple Test\n\nThis is a test of plain text extraction."; const result = await extract({ + llm: createOpenAILLM(), content: plainTextContent, format: ContentFormat.TXT, schema: simpleSchema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Verify the processedContent is the same as the original content @@ -46,11 +54,10 @@ describe("ProcessedContent Integration Tests", () => { "# Simple Test\n\nThis is a test of markdown extraction."; const result = await extract({ + llm: createOpenAILLM(), content: markdownContent, format: ContentFormat.MARKDOWN, schema: simpleSchema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Verify the processedContent is the same as the original content @@ -67,11 +74,10 @@ describe("ProcessedContent Integration Tests", () => { "

Simple Test

This is a test of HTML extraction.

"; const result = await extract({ + llm: createOpenAILLM(), content: htmlContent, format: ContentFormat.HTML, schema: simpleSchema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, sourceUrl: "https://example.com", }); diff --git a/tests/unit/extractors.test.ts b/tests/unit/extractors.test.ts index 7678087..f5c9272 100644 --- a/tests/unit/extractors.test.ts +++ b/tests/unit/extractors.test.ts @@ -1,47 +1,26 @@ import { getUsage, - createLLM, extractWithLLM, truncateContent, generateExtractionPrompt, } from "../../src/extractors"; -import { LLMProvider, ContentFormat } from "../../src/types"; +import { ContentFormat } from "../../src/types"; import { z } from "zod"; -// Mock the LLM providers -jest.mock("@langchain/openai", () => ({ - ChatOpenAI: jest.fn().mockImplementation(() => ({ - constructor: { name: "ChatOpenAI" }, +function createMockLLM() { + return { withStructuredOutput: jest.fn().mockImplementation(() => ({ invoke: jest.fn().mockResolvedValue({ parsed: { title: "Test Title", content: "Test Content" }, raw: { tool_calls: [ - { - args: { title: "Test Title", content: "Test Content" }, - }, + { args: { title: "Test Title", content: "Test Content" } }, ], }, }), })), - })), -})); - -jest.mock("@langchain/google-genai", () => ({ - ChatGoogleGenerativeAI: jest.fn().mockImplementation(() => ({ - constructor: { name: "ChatGoogleGenerativeAI" }, - withStructuredOutput: jest.fn().mockImplementation(() => ({ - invoke: jest.fn().mockResolvedValue({ - parsed: { title: "Test Title", content: "Test Content" }, - raw: { - lc_kwargs: { - content: '{"title":"Test Title","content":"Test Content"}', - }, - }, - }), - })), - })), -})); + }; +} describe("extractors", () => { const mockSchema = z.object({ @@ -50,7 +29,6 @@ describe("extractors", () => { }); const mockContent = "Test content"; - const mockApiKey = "test-api-key"; beforeEach(() => { jest.clearAllMocks(); @@ -95,63 +73,10 @@ describe("extractors", () => { }); }); - describe("createLLM", 
() => { - it("should create ChatOpenAI instance for OPENAI provider", () => { - const llm = createLLM( - LLMProvider.OPENAI, - "gpt-4o-mini", - "fake-api-key", - 0 - ); - - expect(llm).toBeDefined(); - expect(llm.constructor.name).toBe("ChatOpenAI"); - }); - - it("should create ChatGoogleGenerativeAI instance for GOOGLE_GEMINI provider", () => { - const llm = createLLM( - LLMProvider.GOOGLE_GEMINI, - "gemini-2.5-flash", - "fake-api-key", - 0 - ); - - expect(llm).toBeDefined(); - expect(llm.constructor.name).toBe("ChatGoogleGenerativeAI"); - }); - - it("should throw error for unsupported provider", () => { - expect(() => { - // @ts-ignore - Testing invalid provider - createLLM("unsupported-provider", "model", "api-key", 0); - }).toThrow("Unsupported LLM provider"); - }); - }); - describe("extractWithLLM", () => { - it("should extract data using OpenAI", async () => { - const result = await extractWithLLM( - mockContent, - mockSchema, - LLMProvider.OPENAI, - "gpt-4o-mini", - mockApiKey - ); - - expect(result.data).toEqual({ - title: "Test Title", - content: "Test Content", - }); - }); - - it("should extract data using Google Gemini", async () => { - const result = await extractWithLLM( - mockContent, - mockSchema, - LLMProvider.GOOGLE_GEMINI, - "gemini-2.5-flash", - mockApiKey - ); + it("should extract data using provided LLM", async () => { + const llm = createMockLLM() as any; + const result = await extractWithLLM(mockContent, mockSchema, llm); expect(result.data).toEqual({ title: "Test Title", @@ -160,14 +85,12 @@ describe("extractors", () => { }); it("should handle custom prompts", async () => { + const llm = createMockLLM() as any; const customPrompt = "Extract the main topic and summary"; const result = await extractWithLLM( mockContent, mockSchema, - LLMProvider.OPENAI, - "gpt-4o-mini", - mockApiKey, - 0, + llm, customPrompt ); @@ -178,13 +101,11 @@ describe("extractors", () => { }); it("should handle different content formats", async () => { + const llm = 
createMockLLM() as any; const result = await extractWithLLM( mockContent, mockSchema, - LLMProvider.OPENAI, - "gpt-4o-mini", - mockApiKey, - 0, + llm, undefined, ContentFormat.TXT ); @@ -196,18 +117,16 @@ describe("extractors", () => { }); it("should handle extraction context", async () => { + const llm = createMockLLM() as any; const extractionContext = { title: "Existing Title", - content: "", // Empty field that should be filled + content: "", }; const result = await extractWithLLM( mockContent, mockSchema, - LLMProvider.OPENAI, - "gpt-4o-mini", - mockApiKey, - 0, + llm, undefined, ContentFormat.TXT, undefined,