From e9bdebe134d8c68d1d458b238c6d697ae9aa19b4 Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 10 Mar 2026 23:12:58 -0700 Subject: [PATCH 1/3] Enhance extraction functionality to support any LangChain chat model. Introduced `llm` option in `ExtractorOptions` to allow users to pass custom models, overriding provider-specific settings. Updated related functions and README for clarity on usage. --- README.md | 75 +++++++++++++++++++++++++++-------- package.json | 5 ++- src/extractors.ts | 7 +--- src/index.ts | 73 +++++++++++++++++++--------------- src/types.ts | 25 +++++++++--- tests/unit/extractors.test.ts | 37 ++++++----------- 6 files changed, 139 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index 255a186..ba5563c 100644 --- a/README.md +++ b/README.md @@ -241,9 +241,55 @@ console.log(result.data); // } ``` -### Customizing LLM Provider and Managing Token Limits +### Bring Your Own LLM (Any LangChain Model) -You can customize LLM and manage token limits to control costs and ensure your content fits within the model's maximum context window: +You can pass **any LangChain chat model** directly via the `llm` option. This lets you use any provider supported by LangChain — Anthropic, Mistral, Cohere, Ollama, Azure OpenAI, AWS Bedrock, and more — without being limited to the built-in OpenAI and Google Gemini providers. + +```typescript +import { extract, ContentFormat } from "@lightfeed/extractor"; +import { ChatAnthropic } from "@langchain/anthropic"; + +const llm = new ChatAnthropic({ + model: "claude-sonnet-4-20250514", + apiKey: process.env.ANTHROPIC_API_KEY, +}); + +const result = await extract({ + llm, + content: markdownContent, + format: ContentFormat.MARKDOWN, + schema: mySchema, +}); +``` + +This works with any LangChain-compatible chat model: + +```typescript +// Ollama (local models) +import { ChatOllama } from "@langchain/ollama"; +const llm = new ChatOllama({ model: "llama3" }); + +// Mistral +import { ChatMistralAI } from "@langchain/mistralai"; +const llm = new ChatMistralAI({ model: "mistral-large-latest" }); + +// Azure OpenAI +import { AzureChatOpenAI } from "@langchain/openai"; +const llm = new AzureChatOpenAI({ + azureOpenAIApiDeploymentName: "my-deployment", +}); + +// AWS Bedrock +import { ChatBedrockConverse } from "@langchain/aws"; +const llm = new ChatBedrockConverse({ model: "anthropic.claude-3-sonnet-20240229-v1:0" }); +``` + +> [!NOTE] +> When using the `llm` option, the `provider`, `modelName`, `temperature`, and API key options (`googleApiKey`, `openaiApiKey`) are ignored — configure those directly on the LangChain model instance. You only need to install the LangChain integration package for the provider you want to use (e.g., `@langchain/anthropic`, `@langchain/ollama`). + +### Customizing Built-in LLM Provider and Managing Token Limits + +If you prefer not to manage LangChain instances directly, you can use the built-in provider shortcuts for OpenAI and Google Gemini: ```typescript // Extract from Markdown with token limit @@ -337,17 +383,13 @@ const result = await extract({ ## LLM Extraction Function -### LLM API Keys +### LLM Configuration -The library currently supports Google Gemini and OpenAI ChatGPT models. It will check for LLM API keys in the following order: +The library supports three ways to configure the LLM, in order of priority: -1. Directly provided API key parameter (`googleApiKey` or `openaiApiKey`) -2. Environment variables (`GOOGLE_API_KEY` or `OPENAI_API_KEY`) - -While the library can use environment variables, it's recommended to explicitly provide API keys in production code for better control and transparency. - -> [!NOTE] -> Want support for additional LLM providers? Please [create an issue](https://github.com/lightfeed/extractor/issues/new/choose) and let us know which providers you'd like to see supported. +1. **Custom LLM instance** (`llm`) — Pass any [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) directly. This gives you full control over the model, parameters, and provider. +2. **Built-in provider shortcuts** (`provider` + API key) — Use the built-in OpenAI or Google Gemini providers with just an API key and optional model name. +3. **Environment variables** (`GOOGLE_API_KEY` or `OPENAI_API_KEY`) — Falls back to env vars when using built-in providers without explicit keys. ### `extract(options: ExtractorOptions): Promise>` @@ -360,12 +402,13 @@ Main function to extract structured data from content. | `content` | `string` | HTML, markdown, or plain text content to extract from | Required | | `format` | `ContentFormat` | Content format (HTML, MARKDOWN, or TXT) | Required | | `schema` | `z.ZodTypeAny` | Zod schema defining the structure to extract | Required | +| `llm` | `BaseChatModel` | A [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance. When provided, `provider`, `modelName`, `temperature`, and API key options are ignored. | `undefined` | | `prompt` | `string` | Custom prompt to guide the extraction process | Internal default prompt | -| `provider` | `LLMProvider` | LLM provider (GOOGLE_GEMINI or OPENAI) | `LLMProvider.GOOGLE_GEMINI` | -| `modelName` | `string` | Model name to use | Provider-specific default, Google Gemini 2.5 flash or OpenAI GPT-4o mini | -| `googleApiKey` | `string` | Google Gemini API key (if using Google Gemini provider) | From env `GOOGLE_API_KEY` | -| `openaiApiKey` | `string` | OpenAI API key (if using OpenAI provider) | From env `OPENAI_API_KEY` | -| `temperature` | `number` | Temperature for the LLM (0-1) | `0` | +| `provider` | `LLMProvider` | LLM provider (GOOGLE_GEMINI or OPENAI). Ignored when `llm` is provided. | `LLMProvider.GOOGLE_GEMINI` | +| `modelName` | `string` | Model name to use. Ignored when `llm` is provided. | Provider-specific default, Google Gemini 2.5 flash or OpenAI GPT-4o mini | +| `googleApiKey` | `string` | Google Gemini API key (if using Google Gemini provider). Ignored when `llm` is provided. | From env `GOOGLE_API_KEY` | +| `openaiApiKey` | `string` | OpenAI API key (if using OpenAI provider). Ignored when `llm` is provided. | From env `OPENAI_API_KEY` | +| `temperature` | `number` | Temperature for the LLM (0-1). Ignored when `llm` is provided. | `0` | | `htmlExtractionOptions` | `HTMLExtractionOptions` | HTML-specific options for content extraction [see below](#htmlextractionoptions) | `{}` | | `sourceUrl` | `string` | URL of the HTML content, required when format is HTML to properly handle relative URLs | Required for HTML format | | `maxInputTokens` | `number` | Maximum number of input tokens to send to the LLM. Uses a rough conversion of 4 characters per token. When specified, content will be truncated if the total prompt size exceeds this limit. | `undefined` | diff --git a/package.json b/package.json index 3d9fa01..ccca8e0 100644 --- a/package.json +++ b/package.json @@ -42,8 +42,11 @@ "html", "markdown", "structured-data", + "langchain", "openai", - "gemini" + "gemini", + "anthropic", + "ollama" ], "author": "Lightfeed", "license": "Apache-2.0", diff --git a/src/extractors.ts b/src/extractors.ts index 901e188..683518a 100644 --- a/src/extractors.ts +++ b/src/extractors.ts @@ -1,5 +1,6 @@ import { ChatOpenAI } from "@langchain/openai"; import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import type { BaseChatModel } from "@langchain/core/language_models/chat_models"; import { z } from "zod"; import { LLMProvider, Usage, ContentFormat } from "./types"; import { AIMessage } from "@langchain/core/messages"; @@ -181,16 +182,12 @@ export function truncateContent({ export async function extractWithLLM( content: string, schema: T, - provider: LLMProvider, - modelName: string, - apiKey: string, - temperature: number = 0, + llm: BaseChatModel, customPrompt?: string, format: string = ContentFormat.MARKDOWN, maxInputTokens?: number, extractionContext?: Record, ): Promise<{ data: z.infer; usage: Usage }> { - const llm = createLLM(provider, modelName, apiKey, temperature); let usage: Usage = {}; // Truncate content if maxInputTokens is specified diff --git a/src/index.ts b/src/index.ts index 08cf600..0136b49 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,6 +1,7 @@ import { z } from "zod"; +import type { BaseChatModel } from "@langchain/core/language_models/chat_models"; import { htmlToMarkdown } from "./converters"; -import { extractWithLLM } from "./extractors"; +import { createLLM, extractWithLLM } from "./extractors"; import { ContentFormat, LLMProvider, @@ -16,28 +17,15 @@ const DEFAULT_MODELS = { }; /** - * Extract structured data from HTML, markdown, or plain text content using an LLM - * - * @param options Configuration options for extraction - * @param options.content HTML, markdown, or plain text content to extract from - * @param options.format Content format (HTML, MARKDOWN, or TXT) - * @param options.schema Zod schema defining the structure to extract - * @param options.provider LLM provider (GOOGLE_GEMINI or OPENAI) - * @param options.modelName Model name to use (provider-specific) - * @param options.googleApiKey Google API key (if using Google Gemini provider) - * @param options.openaiApiKey OpenAI API key (if using OpenAI provider) - * @param options.temperature Temperature for the LLM (0-1) - * @param options.prompt Custom prompt to guide the extraction process - * @param options.sourceUrl URL of the HTML content (required for HTML format) - * @param options.htmlExtractionOptions HTML-specific options for content extraction - * @param options.maxInputTokens Maximum number of input tokens to send to the LLM - * @param options.extractionContext Extraction context that provides additional information for the extraction process (partial data, metadata, etc.) - * @returns The extracted data, original content, and usage statistics + * Resolve the LLM to use: either a user-provided instance or one created from provider config. */ -export async function extract( - options: ExtractorOptions -): Promise>> { - // Validate required parameters +function resolveLLM( + options: ExtractorOptions, +): BaseChatModel { + if (options.llm) { + return options.llm; + } + const provider = options.provider ?? LLMProvider.GOOGLE_GEMINI; let apiKey: string; @@ -59,6 +47,35 @@ export async function extract( throw new Error(`Unsupported LLM provider: ${provider}`); } + const modelName = options.modelName ?? DEFAULT_MODELS[provider]; + return createLLM(provider, modelName, apiKey, options.temperature ?? 0); +} + +/** + * Extract structured data from HTML, markdown, or plain text content using an LLM + * + * @param options Configuration options for extraction + * @param options.content HTML, markdown, or plain text content to extract from + * @param options.format Content format (HTML, MARKDOWN, or TXT) + * @param options.schema Zod schema defining the structure to extract + * @param options.llm A LangChain chat model instance. When provided, provider/modelName/apiKey options are ignored. + * @param options.provider LLM provider (GOOGLE_GEMINI or OPENAI). Ignored when llm is provided. + * @param options.modelName Model name to use (provider-specific). Ignored when llm is provided. + * @param options.googleApiKey Google API key (if using Google Gemini provider). Ignored when llm is provided. + * @param options.openaiApiKey OpenAI API key (if using OpenAI provider). Ignored when llm is provided. + * @param options.temperature Temperature for the LLM (0-1). Ignored when llm is provided. + * @param options.prompt Custom prompt to guide the extraction process + * @param options.sourceUrl URL of the HTML content (required for HTML format) + * @param options.htmlExtractionOptions HTML-specific options for content extraction + * @param options.maxInputTokens Maximum number of input tokens to send to the LLM + * @param options.extractionContext Extraction context that provides additional information for the extraction process (partial data, metadata, etc.) + * @returns The extracted data, original content, and usage statistics + */ +export async function extract( + options: ExtractorOptions +): Promise>> { + const llm = resolveLLM(options); + // Validate sourceUrl for HTML format if (options.format === ContentFormat.HTML && !options.sourceUrl) { throw new Error( @@ -66,9 +83,6 @@ export async function extract( ); } - // Get model name (use defaults if not provided) - const modelName = options.modelName ?? DEFAULT_MODELS[provider]; - // Convert HTML to markdown if needed let content = options.content; let formatToUse = options.format; @@ -79,7 +93,6 @@ export async function extract( options.htmlExtractionOptions, options.sourceUrl ); - // For the LLM, the content is now markdown formatToUse = ContentFormat.MARKDOWN; } @@ -87,17 +100,13 @@ export async function extract( const { data, usage } = await extractWithLLM( content, options.schema, - provider, - modelName, - apiKey, - options.temperature ?? 0, + llm, options.prompt, - formatToUse.toString(), // Pass the correct format based on actual content + formatToUse.toString(), options.maxInputTokens, options.extractionContext ); - // Return the full result return { data, processedContent: content, diff --git a/src/types.ts b/src/types.ts index 9d828ba..e0ec0fd 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,4 +1,5 @@ import { z } from "zod"; +import type { BaseChatModel } from "@langchain/core/language_models/chat_models"; import type { Browser, LaunchOptions, ConnectOverCDPOptions } from "playwright"; /** @@ -126,19 +127,33 @@ export interface ExtractorOptions { /** Schema for structured extraction */ schema: T; - /** LLM Provider (OpenAI or Google Gemini) */ + /** + * A LangChain chat model instance to use for extraction. + * When provided, `provider`, `modelName`, and API key options are ignored. + * Accepts any LangChain chat model (ChatOpenAI, ChatAnthropic, ChatGoogleGenerativeAI, etc.). + * + * @example + * ```typescript + * import { ChatAnthropic } from "@langchain/anthropic"; + * const llm = new ChatAnthropic({ model: "claude-sonnet-4-20250514" }); + * const result = await extract({ llm, content, format, schema }); + * ``` + */ + llm?: BaseChatModel; + + /** LLM Provider (OpenAI or Google Gemini). Ignored when `llm` is provided. */ provider?: LLMProvider; - /** Model name to use */ + /** Model name to use. Ignored when `llm` is provided. */ modelName?: string; - /** OpenAI API key */ + /** OpenAI API key. Ignored when `llm` is provided. */ openaiApiKey?: string; - /** Google API key */ + /** Google API key. Ignored when `llm` is provided. */ googleApiKey?: string; - /** Temperature for the LLM (0-1), defaults to 0 */ + /** Temperature for the LLM (0-1), defaults to 0. Ignored when `llm` is provided. */ temperature?: number; /** HTML-specific extraction options (only applies when format is HTML) */ diff --git a/tests/unit/extractors.test.ts b/tests/unit/extractors.test.ts index 7678087..b249b26 100644 --- a/tests/unit/extractors.test.ts +++ b/tests/unit/extractors.test.ts @@ -130,13 +130,8 @@ describe("extractors", () => { describe("extractWithLLM", () => { it("should extract data using OpenAI", async () => { - const result = await extractWithLLM( - mockContent, - mockSchema, - LLMProvider.OPENAI, - "gpt-4o-mini", - mockApiKey - ); + const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0); + const result = await extractWithLLM(mockContent, mockSchema, llm); expect(result.data).toEqual({ title: "Test Title", @@ -145,13 +140,13 @@ describe("extractors", () => { }); it("should extract data using Google Gemini", async () => { - const result = await extractWithLLM( - mockContent, - mockSchema, + const llm = createLLM( LLMProvider.GOOGLE_GEMINI, "gemini-2.5-flash", - mockApiKey + mockApiKey, + 0 ); + const result = await extractWithLLM(mockContent, mockSchema, llm); expect(result.data).toEqual({ title: "Test Title", @@ -160,14 +155,12 @@ describe("extractors", () => { }); it("should handle custom prompts", async () => { + const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0); const customPrompt = "Extract the main topic and summary"; const result = await extractWithLLM( mockContent, mockSchema, - LLMProvider.OPENAI, - "gpt-4o-mini", - mockApiKey, - 0, + llm, customPrompt ); @@ -178,13 +171,11 @@ describe("extractors", () => { }); it("should handle different content formats", async () => { + const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0); const result = await extractWithLLM( mockContent, mockSchema, - LLMProvider.OPENAI, - "gpt-4o-mini", - mockApiKey, - 0, + llm, undefined, ContentFormat.TXT ); @@ -196,18 +187,16 @@ describe("extractors", () => { }); it("should handle extraction context", async () => { + const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0); const extractionContext = { title: "Existing Title", - content: "", // Empty field that should be filled + content: "", }; const result = await extractWithLLM( mockContent, mockSchema, - LLMProvider.OPENAI, - "gpt-4o-mini", - mockApiKey, - 0, + llm, undefined, ContentFormat.TXT, undefined, From 7db859502b0ba9bb38a0578dbe5013121450f6d7 Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 10 Mar 2026 23:23:41 -0700 Subject: [PATCH 2/3] Refactor LLM provider handling by removing `LLMProvider` enum and related logic. Updated `ExtractorOptions` to require a `llm` instance directly, simplifying the API. Adjusted tests and documentation to reflect these changes. --- README.md | 122 +++++++++---------- package.json | 4 +- src/dev/runLocalTest.ts | 91 +++++++------- src/dev/testBrowserExtraction.ts | 10 +- src/dev/testUsage.ts | 10 +- src/example.ts | 15 ++- src/extractors.ts | 33 +---- src/index.ts | 55 +-------- src/types.ts | 30 +---- tests/integration/browser-extraction.test.ts | 14 ++- tests/integration/extract.test.ts | 59 ++++----- tests/integration/processedContent.test.ts | 20 +-- tests/unit/extractors.test.ts | 92 ++------------ 13 files changed, 195 insertions(+), 360 deletions(-) diff --git a/README.md b/README.md index ba5563c..148e726 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,20 @@ Lightfeed Extractor is a Typescript library built for robust web data extraction ## Installation ```bash -npm install @lightfeed/extractor +npm install @lightfeed/extractor @langchain/core +``` + +Install the LangChain integration for your chosen LLM provider: + +```bash +# For OpenAI +npm install @langchain/openai + +# For Google Gemini +npm install @langchain/google-genai + +# For Anthropic, Ollama, etc. +npm install @langchain/anthropic # or @langchain/ollama, @langchain/mistralai, etc. ``` ## Usage @@ -61,7 +74,8 @@ npm install @lightfeed/extractor This example demonstrates extracting structured product data from a real e-commerce website using a local headed Playwright browser. For production environments, you can use a Playwright browser in [serverless](#serverless-browser) or [remote](#remote-browser) mode. ```typescript -import { extract, ContentFormat, LLMProvider, Browser } from "@lightfeed/extractor"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat, Browser } from "@lightfeed/extractor"; import { z } from "zod"; // Define schema for product catalog extraction @@ -114,12 +128,15 @@ try { // Extract structured product data console.log("Extracting product data using LLM..."); const result = await extract({ + llm: new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }), content: html, format: ContentFormat.HTML, sourceUrl: pageUrl, schema: productCatalogSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, // Use environment variable htmlExtractionOptions: { extractMainHtml: true, includeImages: true, @@ -164,15 +181,21 @@ try { ### Extracting from Markdown or Plain Text -You can also extract structured data directly from HTML, Markdown or text string: +You can also extract structured data directly from HTML, Markdown or text string. Pass any [LangChain chat model](https://js.langchain.com/docs/integrations/chat/): ```typescript +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat } from "@lightfeed/extractor"; + const result = await extract({ + llm: new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }), content: markdownContent, - // Specify that content is Markdown. In addition to HTML and Markdown, you can also extract plain text by ContentFormat.TXT format: ContentFormat.MARKDOWN, schema: mySchema, - googleApiKey: "your-google-gemini-api-key", }); ``` @@ -182,13 +205,12 @@ You can provide a custom prompt to guide the extraction process: ```typescript const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: mySchema, sourceUrl: "https://example.com/products", - // In custom prompt, defined what data should be retrieved prompt: "Extract ONLY products that are on sale or have special discounts. Include their original prices, discounted prices, and product URL.", - googleApiKey: "your-google-gemini-api-key", }); ``` @@ -222,12 +244,12 @@ const schema = z.object({ }); const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: schema, sourceUrl: "https://acme.com/products/smart-security-camera", extractionContext: extractionContext, - googleApiKey: "your-google-gemini-api-key", }); // The LLM will use the context to extract store name (acme) and consider the location @@ -241,73 +263,44 @@ console.log(result.data); // } ``` -### Bring Your Own LLM (Any LangChain Model) +### Using Any LangChain Model -You can pass **any LangChain chat model** directly via the `llm` option. This lets you use any provider supported by LangChain — Anthropic, Mistral, Cohere, Ollama, Azure OpenAI, AWS Bedrock, and more — without being limited to the built-in OpenAI and Google Gemini providers. +Pass **any LangChain chat model** via the `llm` option. Use OpenAI, Google Gemini, Anthropic, Mistral, Ollama, Azure OpenAI, AWS Bedrock, or any [LangChain-supported provider](https://js.langchain.com/docs/integrations/chat/): ```typescript -import { extract, ContentFormat } from "@lightfeed/extractor"; -import { ChatAnthropic } from "@langchain/anthropic"; +// OpenAI +import { ChatOpenAI } from "@langchain/openai"; +const llm = new ChatOpenAI({ modelName: "gpt-4o-mini", apiKey: process.env.OPENAI_API_KEY }); -const llm = new ChatAnthropic({ - model: "claude-sonnet-4-20250514", - apiKey: process.env.ANTHROPIC_API_KEY, -}); +// Google Gemini +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +const llm = new ChatGoogleGenerativeAI({ model: "gemini-2.5-flash", apiKey: process.env.GOOGLE_API_KEY }); -const result = await extract({ - llm, - content: markdownContent, - format: ContentFormat.MARKDOWN, - schema: mySchema, -}); -``` - -This works with any LangChain-compatible chat model: +// Anthropic +import { ChatAnthropic } from "@langchain/anthropic"; +const llm = new ChatAnthropic({ model: "claude-sonnet-4-20250514", apiKey: process.env.ANTHROPIC_API_KEY }); -```typescript -// Ollama (local models) +// Ollama (local) import { ChatOllama } from "@langchain/ollama"; const llm = new ChatOllama({ model: "llama3" }); - -// Mistral -import { ChatMistralAI } from "@langchain/mistralai"; -const llm = new ChatMistralAI({ model: "mistral-large-latest" }); - -// Azure OpenAI -import { AzureChatOpenAI } from "@langchain/openai"; -const llm = new AzureChatOpenAI({ - azureOpenAIApiDeploymentName: "my-deployment", -}); - -// AWS Bedrock -import { ChatBedrockConverse } from "@langchain/aws"; -const llm = new ChatBedrockConverse({ model: "anthropic.claude-3-sonnet-20240229-v1:0" }); ``` -> [!NOTE] -> When using the `llm` option, the `provider`, `modelName`, `temperature`, and API key options (`googleApiKey`, `openaiApiKey`) are ignored — configure those directly on the LangChain model instance. You only need to install the LangChain integration package for the provider you want to use (e.g., `@langchain/anthropic`, `@langchain/ollama`). - -### Customizing Built-in LLM Provider and Managing Token Limits +### Managing Token Limits -If you prefer not to manage LangChain instances directly, you can use the built-in provider shortcuts for OpenAI and Google Gemini: +Use `maxInputTokens` to truncate content when it exceeds the model's context window: ```typescript -// Extract from Markdown with token limit const result = await extract({ + llm: new ChatOpenAI({ modelName: "gpt-4o-mini", apiKey: "..." }), content: markdownContent, format: ContentFormat.MARKDOWN, schema, - // Provide model provider and model name - provider: LLMProvider.OPENAI, - modelName: "gpt-4o-mini", - openaiApiKey: "your-openai-api-key", - // Limit to roughly 128K tokens (max input for gpt-4o-mini) - maxInputTokens: 128000, + maxInputTokens: 128000, // Roughly 128K tokens (4 chars/token) }); ``` > [!WARNING] -> For OpenAI models, optional schema is not supported. You need to change `.optional()` to `.nullable()`. +> For OpenAI models, optional schema is not supported. Use `.nullable()` instead of `.optional()`. ### Extracting from Main HTML @@ -315,6 +308,7 @@ For blog posts or articles with lots of navigation elements, headers, and footer ```typescript const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: mySchema, @@ -350,6 +344,7 @@ const productListSchema = z.object({ }); const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: mySchema, @@ -366,6 +361,7 @@ The library can clean URLs to remove tracking parameters and unnecessary compone ```typescript const result = await extract({ + llm: myLLM, content: htmlContent, format: ContentFormat.HTML, schema: mySchema, @@ -385,11 +381,7 @@ const result = await extract({ ### LLM Configuration -The library supports three ways to configure the LLM, in order of priority: - -1. **Custom LLM instance** (`llm`) — Pass any [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) directly. This gives you full control over the model, parameters, and provider. -2. **Built-in provider shortcuts** (`provider` + API key) — Use the built-in OpenAI or Google Gemini providers with just an API key and optional model name. -3. **Environment variables** (`GOOGLE_API_KEY` or `OPENAI_API_KEY`) — Falls back to env vars when using built-in providers without explicit keys. +Pass a [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance via the `llm` option. Install the LangChain integration for your provider (e.g. `@langchain/openai`, `@langchain/google-genai`, `@langchain/anthropic`) and configure API keys on the model instance. ### `extract(options: ExtractorOptions): Promise>` @@ -399,16 +391,11 @@ Main function to extract structured data from content. | Option | Type | Description | Default | |--------|------|-------------|---------| +| `llm` | `BaseChatModel` | A [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance (ChatOpenAI, ChatGoogleGenerativeAI, ChatAnthropic, etc.) | Required | | `content` | `string` | HTML, markdown, or plain text content to extract from | Required | | `format` | `ContentFormat` | Content format (HTML, MARKDOWN, or TXT) | Required | | `schema` | `z.ZodTypeAny` | Zod schema defining the structure to extract | Required | -| `llm` | `BaseChatModel` | A [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance. When provided, `provider`, `modelName`, `temperature`, and API key options are ignored. | `undefined` | | `prompt` | `string` | Custom prompt to guide the extraction process | Internal default prompt | -| `provider` | `LLMProvider` | LLM provider (GOOGLE_GEMINI or OPENAI). Ignored when `llm` is provided. | `LLMProvider.GOOGLE_GEMINI` | -| `modelName` | `string` | Model name to use. Ignored when `llm` is provided. | Provider-specific default, Google Gemini 2.5 flash or OpenAI GPT-4o mini | -| `googleApiKey` | `string` | Google Gemini API key (if using Google Gemini provider). Ignored when `llm` is provided. | From env `GOOGLE_API_KEY` | -| `openaiApiKey` | `string` | OpenAI API key (if using OpenAI provider). Ignored when `llm` is provided. | From env `OPENAI_API_KEY` | -| `temperature` | `number` | Temperature for the LLM (0-1). Ignored when `llm` is provided. | `0` | | `htmlExtractionOptions` | `HTMLExtractionOptions` | HTML-specific options for content extraction [see below](#htmlextractionoptions) | `{}` | | `sourceUrl` | `string` | URL of the HTML content, required when format is HTML to properly handle relative URLs | Required for HTML format | | `maxInputTokens` | `number` | Maximum number of input tokens to send to the LLM. Uses a rough conversion of 4 characters per token. When specified, content will be truncated if the total prompt size exceeds this limit. | `undefined` | @@ -696,6 +683,7 @@ const schema = z.object({ }); const result = await extract({ + llm: myLLM, content: markdownContent, format: ContentFormat.MARKDOWN, schema, diff --git a/package.json b/package.json index ccca8e0..1ab543a 100644 --- a/package.json +++ b/package.json @@ -56,8 +56,6 @@ "homepage": "https://github.com/lightfeed/extractor#readme", "dependencies": { "@langchain/core": "^1.1.31", - "@langchain/google-genai": "^2.1.24", - "@langchain/openai": "^1.2.12", "cheerio": "^1.0.0", "jsonrepair": "^3.12.0", "langchain": "^1.2.30", @@ -68,6 +66,8 @@ "zod": "^3.24.3" }, "devDependencies": { + "@langchain/google-genai": "^2.1.24", + "@langchain/openai": "^1.2.12", "@types/jest": "^29.5.12", "@types/node": "^22.15.3", "@types/turndown": "^5.0.5", diff --git a/src/dev/runLocalTest.ts b/src/dev/runLocalTest.ts index e6b373a..1f51da9 100644 --- a/src/dev/runLocalTest.ts +++ b/src/dev/runLocalTest.ts @@ -2,11 +2,30 @@ import * as fs from "fs"; import * as path from "path"; import { config } from "dotenv"; import { z } from "zod"; -import { extract, ContentFormat, LLMProvider } from "../index"; +import { ChatOpenAI } from "@langchain/openai"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat } from "../index"; // Load environment variables from .env file config({ path: path.resolve(process.cwd(), ".env") }); +type Provider = "gemini" | "openai"; + +function createLLM(provider: Provider) { + if (provider === "gemini") { + return new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }); + } + return new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName: "gpt-4o-mini", + temperature: 0, + }); +} + // Helper to load HTML test fixtures function loadFixture(filename: string): string { return fs.readFileSync( @@ -67,34 +86,25 @@ const productSchemaOpenAI = z.object({ }); // Test functions -async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) { +async function testBlogExtraction(provider: Provider = "gemini") { console.log(`Testing blog post extraction with ${provider}...`); try { const html = loadFixture("blog-post.html"); - // Check for required API key - if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) { + if (provider === "gemini" && !process.env.GOOGLE_API_KEY) { console.error("Error: GOOGLE_API_KEY environment variable is required"); process.exit(1); - } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) { + } else if (provider === "openai" && !process.env.OPENAI_API_KEY) { console.error("Error: OPENAI_API_KEY environment variable is required"); process.exit(1); } - const apiKey = - provider === LLMProvider.GOOGLE_GEMINI - ? process.env.GOOGLE_API_KEY - : process.env.OPENAI_API_KEY; - const result = await extract({ + llm: createLLM(provider), content: html, format: ContentFormat.HTML, - schema: - provider === LLMProvider.GOOGLE_GEMINI ? blogSchema : blogSchemaOpenAI, - provider, - googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined, - openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined, + schema: provider === "gemini" ? blogSchema : blogSchemaOpenAI, htmlExtractionOptions: { extractMainHtml: false, }, @@ -113,36 +123,25 @@ async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) { } } -async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) { +async function testProductExtraction(provider: Provider = "gemini") { console.log(`Testing product listing extraction with ${provider}...`); try { const html = loadFixture("product-list.html"); - // Check for required API key - if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) { + if (provider === "gemini" && !process.env.GOOGLE_API_KEY) { console.error("Error: GOOGLE_API_KEY environment variable is required"); process.exit(1); - } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) { + } else if (provider === "openai" && !process.env.OPENAI_API_KEY) { console.error("Error: OPENAI_API_KEY environment variable is required"); process.exit(1); } - const apiKey = - provider === LLMProvider.GOOGLE_GEMINI - ? process.env.GOOGLE_API_KEY - : process.env.OPENAI_API_KEY; - const result = await extract({ + llm: createLLM(provider), content: html, format: ContentFormat.HTML, - schema: - provider === LLMProvider.GOOGLE_GEMINI - ? productSchema - : productSchemaOpenAI, - provider, - googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined, - openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined, + schema: provider === "gemini" ? productSchema : productSchemaOpenAI, htmlExtractionOptions: { extractMainHtml: true, }, @@ -163,38 +162,32 @@ async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) { // Run tests based on command line arguments async function main() { - // Parse arguments: content type and provider const args = process.argv.slice(2); - const contentType = args[0] || "all"; // 'blog', 'product', or 'all' - const provider = - args[1]?.toUpperCase() === "OPENAI" - ? LLMProvider.OPENAI - : args[1]?.toUpperCase() === "GEMINI" - ? LLMProvider.GOOGLE_GEMINI - : "all"; // 'OPENAI', 'GEMINI', or 'all' + const contentType = args[0] || "all"; + const providerArg = args[1]?.toUpperCase(); + const provider: Provider | "all" = + providerArg === "OPENAI" ? "openai" : providerArg === "GEMINI" ? "gemini" : "all"; console.log("API Keys available:"); console.log(`- GOOGLE_API_KEY: ${process.env.GOOGLE_API_KEY ? "Yes" : "No"}`); console.log(`- OPENAI_API_KEY: ${process.env.OPENAI_API_KEY ? "Yes" : "No"}`); console.log(""); - // Run blog tests if (contentType === "blog" || contentType === "all") { - if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") { - await testBlogExtraction(LLMProvider.GOOGLE_GEMINI); + if (provider === "gemini" || provider === "all") { + await testBlogExtraction("gemini"); } - if (provider === LLMProvider.OPENAI || provider === "all") { - await testBlogExtraction(LLMProvider.OPENAI); + if (provider === "openai" || provider === "all") { + await testBlogExtraction("openai"); } } - // Run product tests if (contentType === "product" || contentType === "all") { - if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") { - await testProductExtraction(LLMProvider.GOOGLE_GEMINI); + if (provider === "gemini" || provider === "all") { + await testProductExtraction("gemini"); } - if (provider === LLMProvider.OPENAI || provider === "all") { - await testProductExtraction(LLMProvider.OPENAI); + if (provider === "openai" || provider === "all") { + await testProductExtraction("openai"); } } } diff --git a/src/dev/testBrowserExtraction.ts b/src/dev/testBrowserExtraction.ts index 672816a..bcc2aca 100644 --- a/src/dev/testBrowserExtraction.ts +++ b/src/dev/testBrowserExtraction.ts @@ -1,4 +1,5 @@ -import { extract, ContentFormat, LLMProvider, Browser } from "../index"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat, Browser } from "../index"; import { z } from "zod"; import * as path from "path"; import { config } from "dotenv"; @@ -65,12 +66,15 @@ async function testProductCatalogExtraction() { console.log("\n🧠 Extracting product data using LLM..."); const result = await extract({ + llm: new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }), content: html, format: ContentFormat.HTML, sourceUrl: testUrl, schema: productCatalogSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, htmlExtractionOptions: { extractMainHtml: true, includeImages: true, diff --git a/src/dev/testUsage.ts b/src/dev/testUsage.ts index d5c15ff..86887c6 100644 --- a/src/dev/testUsage.ts +++ b/src/dev/testUsage.ts @@ -1,7 +1,8 @@ import { config } from "dotenv"; import * as path from "path"; import { z } from "zod"; -import { extract, ContentFormat, LLMProvider } from "../index"; +import { ChatOpenAI } from "@langchain/openai"; +import { extract, ContentFormat } from "../index"; // Load environment variables from .env file config({ path: path.resolve(process.cwd(), ".env") }); @@ -32,11 +33,14 @@ This is a test of the usage tracking system. try { // Run extraction const result = await extract({ + llm: new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName: "gpt-4o-mini", + temperature: 0, + }), content: markdown, format: ContentFormat.MARKDOWN, schema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Log the results diff --git a/src/example.ts b/src/example.ts index 470c715..2e694e2 100644 --- a/src/example.ts +++ b/src/example.ts @@ -1,4 +1,5 @@ -import { extract, ContentFormat, LLMProvider } from "./index"; +import { ChatOpenAI } from "@langchain/openai"; +import { extract, ContentFormat } from "./index"; import { z } from "zod"; import { config } from "dotenv"; import * as path from "path"; @@ -11,8 +12,8 @@ config({ path: path.resolve(process.cwd(), ".env") }); async function example() { try { // Check if API key is available - if (!process.env.GOOGLE_API_KEY) { - console.error("Error: GOOGLE_API_KEY environment variable is required"); + if (!process.env.OPENAI_API_KEY) { + console.error("Error: OPENAI_API_KEY environment variable is required"); return; } @@ -46,12 +47,14 @@ async function example() { // Extract data from HTML const result = await extract({ + llm: new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName: "gpt-4o-mini", + temperature: 0, + }), content: htmlContent, format: ContentFormat.HTML, schema, - // Using Google Gemini by default - openaiApiKey: process.env.OPENAI_API_KEY, - provider: LLMProvider.OPENAI, sourceUrl, }); diff --git a/src/extractors.ts b/src/extractors.ts index 683518a..24bb4c5 100644 --- a/src/extractors.ts +++ b/src/extractors.ts @@ -1,8 +1,6 @@ -import { ChatOpenAI } from "@langchain/openai"; -import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; import type { BaseChatModel } from "@langchain/core/language_models/chat_models"; import { z } from "zod"; -import { LLMProvider, Usage, ContentFormat } from "./types"; +import { Usage, ContentFormat } from "./types"; import { AIMessage } from "@langchain/core/messages"; import { safeSanitizedParser, @@ -40,35 +38,6 @@ export function getUsage(output: LLMResult): Usage { return usage; } -/** - * Create LLM instance based on provider and configuration - */ -export function createLLM( - provider: LLMProvider, - modelName: string, - apiKey: string, - temperature: number = 0, -) { - switch (provider) { - case LLMProvider.OPENAI: - return new ChatOpenAI({ - apiKey, - modelName, - temperature, - }); - - case LLMProvider.GOOGLE_GEMINI: - return new ChatGoogleGenerativeAI({ - apiKey, - model: modelName, - temperature, - }); - - default: - throw new Error(`Unsupported LLM provider: ${provider}`); - } -} - interface ExtractionPromptOptions { format: string; content: string; diff --git a/src/index.ts b/src/index.ts index 0136b49..8865e00 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,69 +1,21 @@ import { z } from "zod"; -import type { BaseChatModel } from "@langchain/core/language_models/chat_models"; import { htmlToMarkdown } from "./converters"; -import { createLLM, extractWithLLM } from "./extractors"; +import { extractWithLLM } from "./extractors"; import { ContentFormat, - LLMProvider, ExtractorOptions, ExtractorResult, HTMLExtractionOptions, } from "./types"; -// Default model names -const DEFAULT_MODELS = { - [LLMProvider.GOOGLE_GEMINI]: "gemini-2.5-flash", - [LLMProvider.OPENAI]: "gpt-4o-mini", -}; - -/** - * Resolve the LLM to use: either a user-provided instance or one created from provider config. - */ -function resolveLLM( - options: ExtractorOptions, -): BaseChatModel { - if (options.llm) { - return options.llm; - } - - const provider = options.provider ?? LLMProvider.GOOGLE_GEMINI; - let apiKey: string; - - if (provider === LLMProvider.GOOGLE_GEMINI) { - apiKey = options.googleApiKey ?? process.env.GOOGLE_API_KEY ?? ""; - if (!apiKey) { - throw new Error( - "Google API key is required. Provide googleApiKey option or set GOOGLE_API_KEY environment variable." - ); - } - } else if (provider === LLMProvider.OPENAI) { - apiKey = options.openaiApiKey ?? process.env.OPENAI_API_KEY ?? ""; - if (!apiKey) { - throw new Error( - "OpenAI API key is required. Provide openaiApiKey option or set OPENAI_API_KEY environment variable." - ); - } - } else { - throw new Error(`Unsupported LLM provider: ${provider}`); - } - - const modelName = options.modelName ?? DEFAULT_MODELS[provider]; - return createLLM(provider, modelName, apiKey, options.temperature ?? 0); -} - /** * Extract structured data from HTML, markdown, or plain text content using an LLM * * @param options Configuration options for extraction + * @param options.llm A LangChain chat model instance (ChatOpenAI, ChatAnthropic, etc.) * @param options.content HTML, markdown, or plain text content to extract from * @param options.format Content format (HTML, MARKDOWN, or TXT) * @param options.schema Zod schema defining the structure to extract - * @param options.llm A LangChain chat model instance. When provided, provider/modelName/apiKey options are ignored. - * @param options.provider LLM provider (GOOGLE_GEMINI or OPENAI). Ignored when llm is provided. - * @param options.modelName Model name to use (provider-specific). Ignored when llm is provided. - * @param options.googleApiKey Google API key (if using Google Gemini provider). Ignored when llm is provided. - * @param options.openaiApiKey OpenAI API key (if using OpenAI provider). Ignored when llm is provided. - * @param options.temperature Temperature for the LLM (0-1). Ignored when llm is provided. * @param options.prompt Custom prompt to guide the extraction process * @param options.sourceUrl URL of the HTML content (required for HTML format) * @param options.htmlExtractionOptions HTML-specific options for content extraction @@ -74,7 +26,6 @@ function resolveLLM( export async function extract( options: ExtractorOptions ): Promise>> { - const llm = resolveLLM(options); // Validate sourceUrl for HTML format if (options.format === ContentFormat.HTML && !options.sourceUrl) { @@ -100,7 +51,7 @@ export async function extract( const { data, usage } = await extractWithLLM( content, options.schema, - llm, + options.llm, options.prompt, formatToUse.toString(), options.maxInputTokens, diff --git a/src/types.ts b/src/types.ts index e0ec0fd..3642a0b 100644 --- a/src/types.ts +++ b/src/types.ts @@ -11,14 +11,6 @@ export enum ContentFormat { TXT = "txt", } -/** - * Supported LLM providers - */ -export enum LLMProvider { - OPENAI = "openai", - GOOGLE_GEMINI = "google_gemini", -} - /** * Proxy configuration for network requests */ @@ -129,32 +121,16 @@ export interface ExtractorOptions { /** * A LangChain chat model instance to use for extraction. - * When provided, `provider`, `modelName`, and API key options are ignored. * Accepts any LangChain chat model (ChatOpenAI, ChatAnthropic, ChatGoogleGenerativeAI, etc.). * * @example * ```typescript - * import { ChatAnthropic } from "@langchain/anthropic"; - * const llm = new ChatAnthropic({ model: "claude-sonnet-4-20250514" }); + * import { ChatOpenAI } from "@langchain/openai"; + * const llm = new ChatOpenAI({ model: "gpt-4o-mini" }); * const result = await extract({ llm, content, format, schema }); * ``` */ - llm?: BaseChatModel; - - /** LLM Provider (OpenAI or Google Gemini). Ignored when `llm` is provided. */ - provider?: LLMProvider; - - /** Model name to use. Ignored when `llm` is provided. */ - modelName?: string; - - /** OpenAI API key. Ignored when `llm` is provided. */ - openaiApiKey?: string; - - /** Google API key. Ignored when `llm` is provided. */ - googleApiKey?: string; - - /** Temperature for the LLM (0-1), defaults to 0. Ignored when `llm` is provided. */ - temperature?: number; + llm: BaseChatModel; /** HTML-specific extraction options (only applies when format is HTML) */ htmlExtractionOptions?: HTMLExtractionOptions; diff --git a/tests/integration/browser-extraction.test.ts b/tests/integration/browser-extraction.test.ts index 6230796..f62125f 100644 --- a/tests/integration/browser-extraction.test.ts +++ b/tests/integration/browser-extraction.test.ts @@ -1,6 +1,15 @@ -import { extract, ContentFormat, LLMProvider, Browser } from "../../src/index"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { extract, ContentFormat, Browser } from "../../src/index"; import { z } from "zod"; +function createGeminiLLM() { + return new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }); +} + const testSchema = z.object({ title: z.string(), description: z.string().optional(), @@ -33,12 +42,11 @@ describe("Browser + Extraction Integration Tests", () => { // Extract data from the loaded HTML const result = await extract({ + llm: createGeminiLLM(), content: html, format: ContentFormat.HTML, sourceUrl: testUrl, schema: testSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, }); expect(result.data).toBeDefined(); diff --git a/tests/integration/extract.test.ts b/tests/integration/extract.test.ts index f944306..26d3054 100644 --- a/tests/integration/extract.test.ts +++ b/tests/integration/extract.test.ts @@ -1,14 +1,31 @@ import * as fs from "fs"; import * as path from "path"; import { z } from "zod"; +import { ChatOpenAI } from "@langchain/openai"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; import { extract, ContentFormat, - LLMProvider, ExtractorResult, } from "../../src"; import { htmlToMarkdown } from "../../src/converters"; +function createGeminiLLM() { + return new ChatGoogleGenerativeAI({ + apiKey: process.env.GOOGLE_API_KEY, + model: "gemini-2.5-flash", + temperature: 0, + }); +} + +function createOpenAILLM(modelName = "gpt-4o-mini") { + return new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName, + temperature: 0, + }); +} + // Read the sample HTML files const blogPostHtml = fs.readFileSync( path.resolve(__dirname, "../fixtures/blog-post.html"), @@ -77,11 +94,10 @@ describe("Extract Integration Tests", () => { describe("Blog Post Extraction", () => { test("should extract blog post data using Google Gemini default model", async () => { const result = await extract({ + llm: createGeminiLLM(), content: blogPostHtml, format: ContentFormat.HTML, schema: blogSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, sourceUrl: "https://example.com/blog/async-await", }); @@ -90,11 +106,10 @@ describe("Extract Integration Tests", () => { test("should extract blog post data using OpenAI default model", async () => { const result = await extract({ + llm: createOpenAILLM(), content: blogPostHtml, format: ContentFormat.HTML, schema: blogSchemaOpenAI, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, sourceUrl: "https://example.com/blog/async-await", }); @@ -223,11 +238,10 @@ describe("Extract Integration Tests", () => { describe("Product List Extraction", () => { test("should extract product list data using Google Gemini", async () => { const result = await extract({ + llm: createGeminiLLM(), content: productListHtml, format: ContentFormat.HTML, schema: productSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, sourceUrl: "https://example.com/products", htmlExtractionOptions: { extractMainHtml: true, @@ -239,11 +253,10 @@ describe("Extract Integration Tests", () => { test("should extract product list data using OpenAI", async () => { const result = await extract({ + llm: createOpenAILLM(), content: productListHtml, format: ContentFormat.HTML, schema: productSchemaOpenAI, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, sourceUrl: "https://example.com/products", htmlExtractionOptions: { extractMainHtml: true, @@ -268,9 +281,7 @@ describe("Extract Integration Tests", () => { // a value that is not expected by the schema. price: z.number().describe("Use 'N/A' if not available").nullable(), }), - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, - modelName: "gpt-3.5-turbo", + llm: createOpenAILLM("gpt-3.5-turbo"), }); expect(result.data).toEqual( expect.objectContaining({ @@ -299,8 +310,7 @@ describe("Extract Integration Tests", () => { // to fail in some cases to return the structured output. content: z.string().optional(), }), - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, + llm: createGeminiLLM(), sourceUrl: "https://example.com/blog/async-await", }); expect(result.data).toBeDefined(); @@ -319,11 +329,10 @@ describe("Extract Integration Tests", () => { }); const result = await extract({ + llm: createOpenAILLM(), content: markdownContent, format: ContentFormat.MARKDOWN, schema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Verify the extracted data @@ -347,11 +356,10 @@ describe("Extract Integration Tests", () => { }); const result = await extract({ + llm: createOpenAILLM(), content: markdownContent, format: ContentFormat.MARKDOWN, schema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Verify the extracted data @@ -378,11 +386,10 @@ describe("Extract Integration Tests", () => { }; const result = await extract({ + llm: createGeminiLLM(), content: blogPostHtml, format: ContentFormat.HTML, schema: blogSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, sourceUrl: "https://example.com/blog/async-await", extractionContext: partialData, }); @@ -400,11 +407,10 @@ describe("Extract Integration Tests", () => { }; const result = await extract({ + llm: createOpenAILLM(), content: blogPostHtml, format: ContentFormat.HTML, schema: blogSchemaOpenAI, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, sourceUrl: "https://example.com/blog/async-await", extractionContext: partialData, }); @@ -436,11 +442,10 @@ describe("Extract Integration Tests", () => { }; const result = await extract({ + llm: createGeminiLLM(), content: productListHtml, format: ContentFormat.HTML, schema: productSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, sourceUrl: "https://example.com/products", extractionContext: partialData, prompt: @@ -596,11 +601,10 @@ describe("Image Extraction Integration Tests", () => { // Test with OpenAI test("should extract images using OpenAI when includeImages is true", async () => { const result = await extract({ + llm: createOpenAILLM(), content: articleWithImages, format: ContentFormat.HTML, schema: articleSchemaOpenAI, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, htmlExtractionOptions: { includeImages: true, }, @@ -613,11 +617,10 @@ describe("Image Extraction Integration Tests", () => { // Test with Google Gemini test("should extract images using Google Gemini when includeImages is true", async () => { const result = await extract({ + llm: createGeminiLLM(), content: articleWithImages, format: ContentFormat.HTML, schema: articleSchema, - provider: LLMProvider.GOOGLE_GEMINI, - googleApiKey: process.env.GOOGLE_API_KEY, htmlExtractionOptions: { includeImages: true, }, diff --git a/tests/integration/processedContent.test.ts b/tests/integration/processedContent.test.ts index 3cc1392..663db63 100644 --- a/tests/integration/processedContent.test.ts +++ b/tests/integration/processedContent.test.ts @@ -1,5 +1,14 @@ import { z } from "zod"; -import { extract, ContentFormat, LLMProvider } from "../../src"; +import { ChatOpenAI } from "@langchain/openai"; +import { extract, ContentFormat } from "../../src"; + +function createOpenAILLM() { + return new ChatOpenAI({ + apiKey: process.env.OPENAI_API_KEY, + modelName: "gpt-4o-mini", + temperature: 0, + }); +} describe("ProcessedContent Integration Tests", () => { const simpleSchema = z.object({ @@ -25,11 +34,10 @@ describe("ProcessedContent Integration Tests", () => { "Title: Simple Test\n\nThis is a test of plain text extraction."; const result = await extract({ + llm: createOpenAILLM(), content: plainTextContent, format: ContentFormat.TXT, schema: simpleSchema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Verify the processedContent is the same as the original content @@ -46,11 +54,10 @@ describe("ProcessedContent Integration Tests", () => { "# Simple Test\n\nThis is a test of markdown extraction."; const result = await extract({ + llm: createOpenAILLM(), content: markdownContent, format: ContentFormat.MARKDOWN, schema: simpleSchema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, }); // Verify the processedContent is the same as the original content @@ -67,11 +74,10 @@ describe("ProcessedContent Integration Tests", () => { "

Simple Test

This is a test of HTML extraction.

"; const result = await extract({ + llm: createOpenAILLM(), content: htmlContent, format: ContentFormat.HTML, schema: simpleSchema, - provider: LLMProvider.OPENAI, - openaiApiKey: process.env.OPENAI_API_KEY, sourceUrl: "https://example.com", }); diff --git a/tests/unit/extractors.test.ts b/tests/unit/extractors.test.ts index b249b26..f5c9272 100644 --- a/tests/unit/extractors.test.ts +++ b/tests/unit/extractors.test.ts @@ -1,47 +1,26 @@ import { getUsage, - createLLM, extractWithLLM, truncateContent, generateExtractionPrompt, } from "../../src/extractors"; -import { LLMProvider, ContentFormat } from "../../src/types"; +import { ContentFormat } from "../../src/types"; import { z } from "zod"; -// Mock the LLM providers -jest.mock("@langchain/openai", () => ({ - ChatOpenAI: jest.fn().mockImplementation(() => ({ - constructor: { name: "ChatOpenAI" }, +function createMockLLM() { + return { withStructuredOutput: jest.fn().mockImplementation(() => ({ invoke: jest.fn().mockResolvedValue({ parsed: { title: "Test Title", content: "Test Content" }, raw: { tool_calls: [ - { - args: { title: "Test Title", content: "Test Content" }, - }, + { args: { title: "Test Title", content: "Test Content" } }, ], }, }), })), - })), -})); - -jest.mock("@langchain/google-genai", () => ({ - ChatGoogleGenerativeAI: jest.fn().mockImplementation(() => ({ - constructor: { name: "ChatGoogleGenerativeAI" }, - withStructuredOutput: jest.fn().mockImplementation(() => ({ - invoke: jest.fn().mockResolvedValue({ - parsed: { title: "Test Title", content: "Test Content" }, - raw: { - lc_kwargs: { - content: '{"title":"Test Title","content":"Test Content"}', - }, - }, - }), - })), - })), -})); + }; +} describe("extractors", () => { const mockSchema = z.object({ @@ -50,7 +29,6 @@ describe("extractors", () => { }); const mockContent = "Test content"; - const mockApiKey = "test-api-key"; beforeEach(() => { jest.clearAllMocks(); @@ -95,57 +73,9 @@ describe("extractors", () => { }); }); - describe("createLLM", () => { - it("should create ChatOpenAI instance for OPENAI provider", () => { - const llm = createLLM( - LLMProvider.OPENAI, - "gpt-4o-mini", - "fake-api-key", - 0 - ); - - expect(llm).toBeDefined(); - expect(llm.constructor.name).toBe("ChatOpenAI"); - }); - - it("should create ChatGoogleGenerativeAI instance for GOOGLE_GEMINI provider", () => { - const llm = createLLM( - LLMProvider.GOOGLE_GEMINI, - "gemini-2.5-flash", - "fake-api-key", - 0 - ); - - expect(llm).toBeDefined(); - expect(llm.constructor.name).toBe("ChatGoogleGenerativeAI"); - }); - - it("should throw error for unsupported provider", () => { - expect(() => { - // @ts-ignore - Testing invalid provider - createLLM("unsupported-provider", "model", "api-key", 0); - }).toThrow("Unsupported LLM provider"); - }); - }); - describe("extractWithLLM", () => { - it("should extract data using OpenAI", async () => { - const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0); - const result = await extractWithLLM(mockContent, mockSchema, llm); - - expect(result.data).toEqual({ - title: "Test Title", - content: "Test Content", - }); - }); - - it("should extract data using Google Gemini", async () => { - const llm = createLLM( - LLMProvider.GOOGLE_GEMINI, - "gemini-2.5-flash", - mockApiKey, - 0 - ); + it("should extract data using provided LLM", async () => { + const llm = createMockLLM() as any; const result = await extractWithLLM(mockContent, mockSchema, llm); expect(result.data).toEqual({ @@ -155,7 +85,7 @@ describe("extractors", () => { }); it("should handle custom prompts", async () => { - const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0); + const llm = createMockLLM() as any; const customPrompt = "Extract the main topic and summary"; const result = await extractWithLLM( mockContent, @@ -171,7 +101,7 @@ describe("extractors", () => { }); it("should handle different content formats", async () => { - const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0); + const llm = createMockLLM() as any; const result = await extractWithLLM( mockContent, mockSchema, @@ -187,7 +117,7 @@ describe("extractors", () => { }); it("should handle extraction context", async () => { - const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0); + const llm = createMockLLM() as any; const extractionContext = { title: "Existing Title", content: "", From 5d1bd21089ee1ad3f8e846302f6387c4594c37b4 Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 10 Mar 2026 23:29:47 -0700 Subject: [PATCH 3/3] Update package.json to add @langchain/core as a peer dependency and adjust README installation instructions for LangChain integration packages. --- README.md | 19 +++++++++++-------- package.json | 6 ++++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 148e726..698caad 100644 --- a/README.md +++ b/README.md @@ -51,20 +51,23 @@ Lightfeed Extractor is a Typescript library built for robust web data extraction ## Installation ```bash -npm install @lightfeed/extractor @langchain/core +npm install @lightfeed/extractor @langchain/openai ``` -Install the LangChain integration for your chosen LLM provider: +Install the LangChain integration package for your chosen provider. `@langchain/core` is a peer dependency — it's shared automatically: ```bash -# For OpenAI -npm install @langchain/openai +# OpenAI +npm install @lightfeed/extractor @langchain/openai -# For Google Gemini -npm install @langchain/google-genai +# Google Gemini +npm install @lightfeed/extractor @langchain/google-genai -# For Anthropic, Ollama, etc. -npm install @langchain/anthropic # or @langchain/ollama, @langchain/mistralai, etc. +# Anthropic +npm install @lightfeed/extractor @langchain/anthropic + +# Ollama (local models) +npm install @lightfeed/extractor @langchain/ollama ``` ## Usage diff --git a/package.json b/package.json index 1ab543a..774356e 100644 --- a/package.json +++ b/package.json @@ -54,11 +54,12 @@ "url": "https://github.com/lightfeed/extractor/issues" }, "homepage": "https://github.com/lightfeed/extractor#readme", + "peerDependencies": { + "@langchain/core": ">=1.1.31" + }, "dependencies": { - "@langchain/core": "^1.1.31", "cheerio": "^1.0.0", "jsonrepair": "^3.12.0", - "langchain": "^1.2.30", "playwright": "npm:rebrowser-playwright-core@1.49.1", "turndown": "^7.2.0", "xmldom": "^0.6.0", @@ -66,6 +67,7 @@ "zod": "^3.24.3" }, "devDependencies": { + "@langchain/core": "^1.1.31", "@langchain/google-genai": "^2.1.24", "@langchain/openai": "^1.2.12", "@types/jest": "^29.5.12",