From e9bdebe134d8c68d1d458b238c6d697ae9aa19b4 Mon Sep 17 00:00:00 2001
From: Andrew Zhong <axzhong3@gmail.com>
Date: Tue, 10 Mar 2026 23:12:58 -0700
Subject: [PATCH 1/3] Enhance extraction functionality to support any LangChain
 chat model. Introduced `llm` option in `ExtractorOptions` to allow users to
 pass custom models, overriding provider-specific settings. Updated related
 functions and README for clarity on usage.

---
 README.md                     | 75 +++++++++++++++++++++++++++--------
 package.json                  |  5 ++-
 src/extractors.ts             |  7 +---
 src/index.ts                  | 73 +++++++++++++++++++---------------
 src/types.ts                  | 25 +++++++++---
 tests/unit/extractors.test.ts | 37 ++++++-----------
 6 files changed, 139 insertions(+), 83 deletions(-)

diff --git a/README.md b/README.md
index 255a186..ba5563c 100644
--- a/README.md
+++ b/README.md
@@ -241,9 +241,55 @@ console.log(result.data);
 // }
 ```
 
-### Customizing LLM Provider and Managing Token Limits
+### Bring Your Own LLM (Any LangChain Model)
 
-You can customize LLM and manage token limits to control costs and ensure your content fits within the model's maximum context window:
+You can pass **any LangChain chat model** directly via the `llm` option. This lets you use any provider supported by LangChain — Anthropic, Mistral, Cohere, Ollama, Azure OpenAI, AWS Bedrock, and more — without being limited to the built-in OpenAI and Google Gemini providers.
+
+```typescript
+import { extract, ContentFormat } from "@lightfeed/extractor";
+import { ChatAnthropic } from "@langchain/anthropic";
+
+const llm = new ChatAnthropic({
+  model: "claude-sonnet-4-20250514",
+  apiKey: process.env.ANTHROPIC_API_KEY,
+});
+
+const result = await extract({
+  llm,
+  content: markdownContent,
+  format: ContentFormat.MARKDOWN,
+  schema: mySchema,
+});
+```
+
+This works with any LangChain-compatible chat model:
+
+```typescript
+// Ollama (local models)
+import { ChatOllama } from "@langchain/ollama";
+const llm = new ChatOllama({ model: "llama3" });
+
+// Mistral
+import { ChatMistralAI } from "@langchain/mistralai";
+const llm = new ChatMistralAI({ model: "mistral-large-latest" });
+
+// Azure OpenAI
+import { AzureChatOpenAI } from "@langchain/openai";
+const llm = new AzureChatOpenAI({
+  azureOpenAIApiDeploymentName: "my-deployment",
+});
+
+// AWS Bedrock
+import { ChatBedrockConverse } from "@langchain/aws";
+const llm = new ChatBedrockConverse({ model: "anthropic.claude-3-sonnet-20240229-v1:0" });
+```
+
+> [!NOTE]
+> When using the `llm` option, the `provider`, `modelName`, `temperature`, and API key options (`googleApiKey`, `openaiApiKey`) are ignored — configure those directly on the LangChain model instance. You only need to install the LangChain integration package for the provider you want to use (e.g., `@langchain/anthropic`, `@langchain/ollama`).
+
+### Customizing Built-in LLM Provider and Managing Token Limits
+
+If you prefer not to manage LangChain instances directly, you can use the built-in provider shortcuts for OpenAI and Google Gemini:
 
 ```typescript
 // Extract from Markdown with token limit
@@ -337,17 +383,13 @@ const result = await extract({
 
 ## LLM Extraction Function
 
-### LLM API Keys
+### LLM Configuration
 
-The library currently supports Google Gemini and OpenAI ChatGPT models. It will check for LLM API keys in the following order:
+The library supports three ways to configure the LLM, in order of priority:
 
-1. Directly provided API key parameter (`googleApiKey` or `openaiApiKey`)
-2. Environment variables (`GOOGLE_API_KEY` or `OPENAI_API_KEY`)
-
-While the library can use environment variables, it's recommended to explicitly provide API keys in production code for better control and transparency.
-
-> [!NOTE]
-> Want support for additional LLM providers? Please [create an issue](https://github.com/lightfeed/extractor/issues/new/choose) and let us know which providers you'd like to see supported.
+1. **Custom LLM instance** (`llm`) — Pass any [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) directly. This gives you full control over the model, parameters, and provider.
+2. **Built-in provider shortcuts** (`provider` + API key) — Use the built-in OpenAI or Google Gemini providers with just an API key and optional model name.
+3. **Environment variables** (`GOOGLE_API_KEY` or `OPENAI_API_KEY`) — Falls back to env vars when using built-in providers without explicit keys.
 
 ### `extract<T>(options: ExtractorOptions<T>): Promise<ExtractorResult<T>>`
 
@@ -360,12 +402,13 @@ Main function to extract structured data from content.
 | `content` | `string` | HTML, markdown, or plain text content to extract from | Required |
 | `format` | `ContentFormat` | Content format (HTML, MARKDOWN, or TXT) | Required |
 | `schema` | `z.ZodTypeAny` | Zod schema defining the structure to extract | Required |
+| `llm` | `BaseChatModel` | A [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance. When provided, `provider`, `modelName`, `temperature`, and API key options are ignored. | `undefined` |
 | `prompt` | `string` | Custom prompt to guide the extraction process | Internal default prompt |
-| `provider` | `LLMProvider` | LLM provider (GOOGLE_GEMINI or OPENAI) | `LLMProvider.GOOGLE_GEMINI` |
-| `modelName` | `string` | Model name to use | Provider-specific default, Google Gemini 2.5 flash or OpenAI GPT-4o mini  |
-| `googleApiKey` | `string` | Google Gemini API key (if using Google Gemini provider) | From env `GOOGLE_API_KEY` |
-| `openaiApiKey` | `string` | OpenAI API key (if using OpenAI provider) | From env `OPENAI_API_KEY` |
-| `temperature` | `number` | Temperature for the LLM (0-1) | `0` |
+| `provider` | `LLMProvider` | LLM provider (GOOGLE_GEMINI or OPENAI). Ignored when `llm` is provided. | `LLMProvider.GOOGLE_GEMINI` |
+| `modelName` | `string` | Model name to use. Ignored when `llm` is provided. | Provider-specific default, Google Gemini 2.5 flash or OpenAI GPT-4o mini  |
+| `googleApiKey` | `string` | Google Gemini API key (if using Google Gemini provider). Ignored when `llm` is provided. | From env `GOOGLE_API_KEY` |
+| `openaiApiKey` | `string` | OpenAI API key (if using OpenAI provider). Ignored when `llm` is provided. | From env `OPENAI_API_KEY` |
+| `temperature` | `number` | Temperature for the LLM (0-1). Ignored when `llm` is provided. | `0` |
 | `htmlExtractionOptions` | `HTMLExtractionOptions` | HTML-specific options for content extraction [see below](#htmlextractionoptions) | `{}` |
 | `sourceUrl` | `string` | URL of the HTML content, required when format is HTML to properly handle relative URLs | Required for HTML format |
 | `maxInputTokens` | `number` | Maximum number of input tokens to send to the LLM. Uses a rough conversion of 4 characters per token. When specified, content will be truncated if the total prompt size exceeds this limit. | `undefined` |
diff --git a/package.json b/package.json
index 3d9fa01..ccca8e0 100644
--- a/package.json
+++ b/package.json
@@ -42,8 +42,11 @@
     "html",
     "markdown",
     "structured-data",
+    "langchain",
     "openai",
-    "gemini"
+    "gemini",
+    "anthropic",
+    "ollama"
   ],
   "author": "Lightfeed",
   "license": "Apache-2.0",
diff --git a/src/extractors.ts b/src/extractors.ts
index 901e188..683518a 100644
--- a/src/extractors.ts
+++ b/src/extractors.ts
@@ -1,5 +1,6 @@
 import { ChatOpenAI } from "@langchain/openai";
 import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
+import type { BaseChatModel } from "@langchain/core/language_models/chat_models";
 import { z } from "zod";
 import { LLMProvider, Usage, ContentFormat } from "./types";
 import { AIMessage } from "@langchain/core/messages";
@@ -181,16 +182,12 @@ export function truncateContent({
 export async function extractWithLLM<T extends z.ZodTypeAny>(
   content: string,
   schema: T,
-  provider: LLMProvider,
-  modelName: string,
-  apiKey: string,
-  temperature: number = 0,
+  llm: BaseChatModel,
   customPrompt?: string,
   format: string = ContentFormat.MARKDOWN,
   maxInputTokens?: number,
   extractionContext?: Record<string, any>,
 ): Promise<{ data: z.infer<T>; usage: Usage }> {
-  const llm = createLLM(provider, modelName, apiKey, temperature);
   let usage: Usage = {};
 
   // Truncate content if maxInputTokens is specified
diff --git a/src/index.ts b/src/index.ts
index 08cf600..0136b49 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,6 +1,7 @@
 import { z } from "zod";
+import type { BaseChatModel } from "@langchain/core/language_models/chat_models";
 import { htmlToMarkdown } from "./converters";
-import { extractWithLLM } from "./extractors";
+import { createLLM, extractWithLLM } from "./extractors";
 import {
   ContentFormat,
   LLMProvider,
@@ -16,28 +17,15 @@ const DEFAULT_MODELS = {
 };
 
 /**
- * Extract structured data from HTML, markdown, or plain text content using an LLM
- *
- * @param options Configuration options for extraction
- * @param options.content HTML, markdown, or plain text content to extract from
- * @param options.format Content format (HTML, MARKDOWN, or TXT)
- * @param options.schema Zod schema defining the structure to extract
- * @param options.provider LLM provider (GOOGLE_GEMINI or OPENAI)
- * @param options.modelName Model name to use (provider-specific)
- * @param options.googleApiKey Google API key (if using Google Gemini provider)
- * @param options.openaiApiKey OpenAI API key (if using OpenAI provider)
- * @param options.temperature Temperature for the LLM (0-1)
- * @param options.prompt Custom prompt to guide the extraction process
- * @param options.sourceUrl URL of the HTML content (required for HTML format)
- * @param options.htmlExtractionOptions HTML-specific options for content extraction
- * @param options.maxInputTokens Maximum number of input tokens to send to the LLM
- * @param options.extractionContext Extraction context that provides additional information for the extraction process (partial data, metadata, etc.)
- * @returns The extracted data, original content, and usage statistics
+ * Resolve the LLM to use: either a user-provided instance or one created from provider config.
  */
-export async function extract<T extends z.ZodTypeAny>(
-  options: ExtractorOptions<T>
-): Promise<ExtractorResult<z.infer<T>>> {
-  // Validate required parameters
+function resolveLLM<T extends z.ZodTypeAny>(
+  options: ExtractorOptions<T>,
+): BaseChatModel {
+  if (options.llm) {
+    return options.llm;
+  }
+
   const provider = options.provider ?? LLMProvider.GOOGLE_GEMINI;
   let apiKey: string;
 
@@ -59,6 +47,35 @@ export async function extract<T extends z.ZodTypeAny>(
     throw new Error(`Unsupported LLM provider: ${provider}`);
   }
 
+  const modelName = options.modelName ?? DEFAULT_MODELS[provider];
+  return createLLM(provider, modelName, apiKey, options.temperature ?? 0);
+}
+
+/**
+ * Extract structured data from HTML, markdown, or plain text content using an LLM
+ *
+ * @param options Configuration options for extraction
+ * @param options.content HTML, markdown, or plain text content to extract from
+ * @param options.format Content format (HTML, MARKDOWN, or TXT)
+ * @param options.schema Zod schema defining the structure to extract
+ * @param options.llm A LangChain chat model instance. When provided, provider/modelName/apiKey options are ignored.
+ * @param options.provider LLM provider (GOOGLE_GEMINI or OPENAI). Ignored when llm is provided.
+ * @param options.modelName Model name to use (provider-specific). Ignored when llm is provided.
+ * @param options.googleApiKey Google API key (if using Google Gemini provider). Ignored when llm is provided.
+ * @param options.openaiApiKey OpenAI API key (if using OpenAI provider). Ignored when llm is provided.
+ * @param options.temperature Temperature for the LLM (0-1). Ignored when llm is provided.
+ * @param options.prompt Custom prompt to guide the extraction process
+ * @param options.sourceUrl URL of the HTML content (required for HTML format)
+ * @param options.htmlExtractionOptions HTML-specific options for content extraction
+ * @param options.maxInputTokens Maximum number of input tokens to send to the LLM
+ * @param options.extractionContext Extraction context that provides additional information for the extraction process (partial data, metadata, etc.)
+ * @returns The extracted data, original content, and usage statistics
+ */
+export async function extract<T extends z.ZodTypeAny>(
+  options: ExtractorOptions<T>
+): Promise<ExtractorResult<z.infer<T>>> {
+  const llm = resolveLLM(options);
+
   // Validate sourceUrl for HTML format
   if (options.format === ContentFormat.HTML && !options.sourceUrl) {
     throw new Error(
@@ -66,9 +83,6 @@ export async function extract<T extends z.ZodTypeAny>(
     );
   }
 
-  // Get model name (use defaults if not provided)
-  const modelName = options.modelName ?? DEFAULT_MODELS[provider];
-
   // Convert HTML to markdown if needed
   let content = options.content;
   let formatToUse = options.format;
@@ -79,7 +93,6 @@ export async function extract<T extends z.ZodTypeAny>(
       options.htmlExtractionOptions,
       options.sourceUrl
     );
-    // For the LLM, the content is now markdown
     formatToUse = ContentFormat.MARKDOWN;
   }
 
@@ -87,17 +100,13 @@ export async function extract<T extends z.ZodTypeAny>(
   const { data, usage } = await extractWithLLM(
     content,
     options.schema,
-    provider,
-    modelName,
-    apiKey,
-    options.temperature ?? 0,
+    llm,
     options.prompt,
-    formatToUse.toString(), // Pass the correct format based on actual content
+    formatToUse.toString(),
     options.maxInputTokens,
     options.extractionContext
   );
 
-  // Return the full result
   return {
     data,
     processedContent: content,
diff --git a/src/types.ts b/src/types.ts
index 9d828ba..e0ec0fd 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1,4 +1,5 @@
 import { z } from "zod";
+import type { BaseChatModel } from "@langchain/core/language_models/chat_models";
 import type { Browser, LaunchOptions, ConnectOverCDPOptions } from "playwright";
 
 /**
@@ -126,19 +127,33 @@ export interface ExtractorOptions<T extends z.ZodTypeAny> {
   /** Schema for structured extraction */
   schema: T;
 
-  /** LLM Provider (OpenAI or Google Gemini) */
+  /**
+   * A LangChain chat model instance to use for extraction.
+   * When provided, `provider`, `modelName`, and API key options are ignored.
+   * Accepts any LangChain chat model (ChatOpenAI, ChatAnthropic, ChatGoogleGenerativeAI, etc.).
+   *
+   * @example
+   * ```typescript
+   * import { ChatAnthropic } from "@langchain/anthropic";
+   * const llm = new ChatAnthropic({ model: "claude-sonnet-4-20250514" });
+   * const result = await extract({ llm, content, format, schema });
+   * ```
+   */
+  llm?: BaseChatModel;
+
+  /** LLM Provider (OpenAI or Google Gemini). Ignored when `llm` is provided. */
   provider?: LLMProvider;
 
-  /** Model name to use */
+  /** Model name to use. Ignored when `llm` is provided. */
   modelName?: string;
 
-  /** OpenAI API key */
+  /** OpenAI API key. Ignored when `llm` is provided. */
   openaiApiKey?: string;
 
-  /** Google API key */
+  /** Google API key. Ignored when `llm` is provided. */
   googleApiKey?: string;
 
-  /** Temperature for the LLM (0-1), defaults to 0 */
+  /** Temperature for the LLM (0-1), defaults to 0. Ignored when `llm` is provided. */
   temperature?: number;
 
   /** HTML-specific extraction options (only applies when format is HTML) */
diff --git a/tests/unit/extractors.test.ts b/tests/unit/extractors.test.ts
index 7678087..b249b26 100644
--- a/tests/unit/extractors.test.ts
+++ b/tests/unit/extractors.test.ts
@@ -130,13 +130,8 @@ describe("extractors", () => {
 
   describe("extractWithLLM", () => {
     it("should extract data using OpenAI", async () => {
-      const result = await extractWithLLM(
-        mockContent,
-        mockSchema,
-        LLMProvider.OPENAI,
-        "gpt-4o-mini",
-        mockApiKey
-      );
+      const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0);
+      const result = await extractWithLLM(mockContent, mockSchema, llm);
 
       expect(result.data).toEqual({
         title: "Test Title",
@@ -145,13 +140,13 @@ describe("extractors", () => {
     });
 
     it("should extract data using Google Gemini", async () => {
-      const result = await extractWithLLM(
-        mockContent,
-        mockSchema,
+      const llm = createLLM(
         LLMProvider.GOOGLE_GEMINI,
         "gemini-2.5-flash",
-        mockApiKey
+        mockApiKey,
+        0
       );
+      const result = await extractWithLLM(mockContent, mockSchema, llm);
 
       expect(result.data).toEqual({
         title: "Test Title",
@@ -160,14 +155,12 @@ describe("extractors", () => {
     });
 
     it("should handle custom prompts", async () => {
+      const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0);
       const customPrompt = "Extract the main topic and summary";
       const result = await extractWithLLM(
         mockContent,
         mockSchema,
-        LLMProvider.OPENAI,
-        "gpt-4o-mini",
-        mockApiKey,
-        0,
+        llm,
         customPrompt
       );
 
@@ -178,13 +171,11 @@ describe("extractors", () => {
     });
 
     it("should handle different content formats", async () => {
+      const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0);
       const result = await extractWithLLM(
         mockContent,
         mockSchema,
-        LLMProvider.OPENAI,
-        "gpt-4o-mini",
-        mockApiKey,
-        0,
+        llm,
         undefined,
         ContentFormat.TXT
       );
@@ -196,18 +187,16 @@ describe("extractors", () => {
     });
 
     it("should handle extraction context", async () => {
+      const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0);
       const extractionContext = {
         title: "Existing Title",
-        content: "", // Empty field that should be filled
+        content: "",
       };
 
       const result = await extractWithLLM(
         mockContent,
         mockSchema,
-        LLMProvider.OPENAI,
-        "gpt-4o-mini",
-        mockApiKey,
-        0,
+        llm,
         undefined,
         ContentFormat.TXT,
         undefined,

From 7db859502b0ba9bb38a0578dbe5013121450f6d7 Mon Sep 17 00:00:00 2001
From: Andrew Zhong <axzhong3@gmail.com>
Date: Tue, 10 Mar 2026 23:23:41 -0700
Subject: [PATCH 2/3] Refactor LLM provider handling by removing `LLMProvider`
 enum and related logic. Updated `ExtractorOptions` to require a `llm`
 instance directly, simplifying the API. Adjusted tests and documentation to
 reflect these changes.

---
 README.md                                    | 122 +++++++++----------
 package.json                                 |   4 +-
 src/dev/runLocalTest.ts                      |  91 +++++++-------
 src/dev/testBrowserExtraction.ts             |  10 +-
 src/dev/testUsage.ts                         |  10 +-
 src/example.ts                               |  15 ++-
 src/extractors.ts                            |  33 +----
 src/index.ts                                 |  55 +--------
 src/types.ts                                 |  30 +----
 tests/integration/browser-extraction.test.ts |  14 ++-
 tests/integration/extract.test.ts            |  59 ++++-----
 tests/integration/processedContent.test.ts   |  20 +--
 tests/unit/extractors.test.ts                |  92 ++------------
 13 files changed, 195 insertions(+), 360 deletions(-)

diff --git a/README.md b/README.md
index ba5563c..148e726 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,20 @@ Lightfeed Extractor is a Typescript library built for robust web data extraction
 ## Installation
 
 ```bash
-npm install @lightfeed/extractor
+npm install @lightfeed/extractor @langchain/core
+```
+
+Install the LangChain integration for your chosen LLM provider:
+
+```bash
+# For OpenAI
+npm install @langchain/openai
+
+# For Google Gemini
+npm install @langchain/google-genai
+
+# For Anthropic, Ollama, etc.
+npm install @langchain/anthropic   # or @langchain/ollama, @langchain/mistralai, etc.
 ```
 
 ## Usage
@@ -61,7 +74,8 @@ npm install @lightfeed/extractor
 This example demonstrates extracting structured product data from a real e-commerce website using a local headed Playwright browser. For production environments, you can use a Playwright browser in [serverless](#serverless-browser) or [remote](#remote-browser) mode.
 
 ```typescript
-import { extract, ContentFormat, LLMProvider, Browser } from "@lightfeed/extractor";
+import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
+import { extract, ContentFormat, Browser } from "@lightfeed/extractor";
 import { z } from "zod";
 
 // Define schema for product catalog extraction
@@ -114,12 +128,15 @@ try {
   // Extract structured product data
   console.log("Extracting product data using LLM...");
   const result = await extract({
+    llm: new ChatGoogleGenerativeAI({
+      apiKey: process.env.GOOGLE_API_KEY,
+      model: "gemini-2.5-flash",
+      temperature: 0,
+    }),
     content: html,
     format: ContentFormat.HTML,
     sourceUrl: pageUrl,
     schema: productCatalogSchema,
-    provider: LLMProvider.GOOGLE_GEMINI,
-    googleApiKey: process.env.GOOGLE_API_KEY, // Use environment variable
     htmlExtractionOptions: {
       extractMainHtml: true,
       includeImages: true,
@@ -164,15 +181,21 @@ try {
 
 ### Extracting from Markdown or Plain Text
 
-You can also extract structured data directly from HTML, Markdown or text string:
+You can also extract structured data directly from an HTML, Markdown, or plain-text string. Pass any [LangChain chat model](https://js.langchain.com/docs/integrations/chat/):
 
 ```typescript
+import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
+import { extract, ContentFormat } from "@lightfeed/extractor";
+
 const result = await extract({
+  llm: new ChatGoogleGenerativeAI({
+    apiKey: process.env.GOOGLE_API_KEY,
+    model: "gemini-2.5-flash",
+    temperature: 0,
+  }),
   content: markdownContent,
-  // Specify that content is Markdown. In addition to HTML and Markdown, you can also extract plain text by ContentFormat.TXT
   format: ContentFormat.MARKDOWN,
   schema: mySchema,
-  googleApiKey: "your-google-gemini-api-key",
 });
 ```
 
@@ -182,13 +205,12 @@ You can provide a custom prompt to guide the extraction process:
 
 ```typescript
 const result = await extract({
+  llm: myLLM,
   content: htmlContent,
   format: ContentFormat.HTML,
   schema: mySchema,
   sourceUrl: "https://example.com/products",
-  // In custom prompt, defined what data should be retrieved
   prompt: "Extract ONLY products that are on sale or have special discounts. Include their original prices, discounted prices, and product URL.",
-  googleApiKey: "your-google-gemini-api-key",
 });
 ```
 
@@ -222,12 +244,12 @@ const schema = z.object({
 });
 
 const result = await extract({
+  llm: myLLM,
   content: htmlContent,
   format: ContentFormat.HTML,
   schema: schema,
   sourceUrl: "https://acme.com/products/smart-security-camera",
   extractionContext: extractionContext,
-  googleApiKey: "your-google-gemini-api-key",
 });
 
 // The LLM will use the context to extract store name (acme) and consider the location
@@ -241,73 +263,44 @@ console.log(result.data);
 // }
 ```
 
-### Bring Your Own LLM (Any LangChain Model)
+### Using Any LangChain Model
 
-You can pass **any LangChain chat model** directly via the `llm` option. This lets you use any provider supported by LangChain — Anthropic, Mistral, Cohere, Ollama, Azure OpenAI, AWS Bedrock, and more — without being limited to the built-in OpenAI and Google Gemini providers.
+Pass **any LangChain chat model** via the `llm` option. Use OpenAI, Google Gemini, Anthropic, Mistral, Ollama, Azure OpenAI, AWS Bedrock, or any [LangChain-supported provider](https://js.langchain.com/docs/integrations/chat/):
 
 ```typescript
-import { extract, ContentFormat } from "@lightfeed/extractor";
-import { ChatAnthropic } from "@langchain/anthropic";
+// OpenAI
+import { ChatOpenAI } from "@langchain/openai";
+const llm = new ChatOpenAI({ modelName: "gpt-4o-mini", apiKey: process.env.OPENAI_API_KEY });
 
-const llm = new ChatAnthropic({
-  model: "claude-sonnet-4-20250514",
-  apiKey: process.env.ANTHROPIC_API_KEY,
-});
+// Google Gemini
+import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
+const llm = new ChatGoogleGenerativeAI({ model: "gemini-2.5-flash", apiKey: process.env.GOOGLE_API_KEY });
 
-const result = await extract({
-  llm,
-  content: markdownContent,
-  format: ContentFormat.MARKDOWN,
-  schema: mySchema,
-});
-```
-
-This works with any LangChain-compatible chat model:
+// Anthropic
+import { ChatAnthropic } from "@langchain/anthropic";
+const llm = new ChatAnthropic({ model: "claude-sonnet-4-20250514", apiKey: process.env.ANTHROPIC_API_KEY });
 
-```typescript
-// Ollama (local models)
+// Ollama (local)
 import { ChatOllama } from "@langchain/ollama";
 const llm = new ChatOllama({ model: "llama3" });
-
-// Mistral
-import { ChatMistralAI } from "@langchain/mistralai";
-const llm = new ChatMistralAI({ model: "mistral-large-latest" });
-
-// Azure OpenAI
-import { AzureChatOpenAI } from "@langchain/openai";
-const llm = new AzureChatOpenAI({
-  azureOpenAIApiDeploymentName: "my-deployment",
-});
-
-// AWS Bedrock
-import { ChatBedrockConverse } from "@langchain/aws";
-const llm = new ChatBedrockConverse({ model: "anthropic.claude-3-sonnet-20240229-v1:0" });
 ```
 
-> [!NOTE]
-> When using the `llm` option, the `provider`, `modelName`, `temperature`, and API key options (`googleApiKey`, `openaiApiKey`) are ignored — configure those directly on the LangChain model instance. You only need to install the LangChain integration package for the provider you want to use (e.g., `@langchain/anthropic`, `@langchain/ollama`).
-
-### Customizing Built-in LLM Provider and Managing Token Limits
+### Managing Token Limits
 
-If you prefer not to manage LangChain instances directly, you can use the built-in provider shortcuts for OpenAI and Google Gemini:
+Use `maxInputTokens` to truncate content when it exceeds the model's context window:
 
 ```typescript
-// Extract from Markdown with token limit
 const result = await extract({
+  llm: new ChatOpenAI({ modelName: "gpt-4o-mini", apiKey: "..." }),
   content: markdownContent,
   format: ContentFormat.MARKDOWN,
   schema,
-  // Provide model provider and model name
-  provider: LLMProvider.OPENAI,
-  modelName: "gpt-4o-mini",
-  openaiApiKey: "your-openai-api-key",
-  // Limit to roughly 128K tokens (max input for gpt-4o-mini)
-  maxInputTokens: 128000,
+  maxInputTokens: 128000, // Roughly 128K tokens (4 chars/token)
 });
 ```
 
 > [!WARNING]
-> For OpenAI models, optional schema is not supported. You need to change `.optional()` to `.nullable()`.
+> For OpenAI models, optional schema fields are not supported. Use `.nullable()` instead of `.optional()`.
 
 ### Extracting from Main HTML
 
@@ -315,6 +308,7 @@ For blog posts or articles with lots of navigation elements, headers, and footer
 
 ```typescript
 const result = await extract({
+  llm: myLLM,
   content: htmlContent,
   format: ContentFormat.HTML,
   schema: mySchema,
@@ -350,6 +344,7 @@ const productListSchema = z.object({
 });
 
 const result = await extract({
+  llm: myLLM,
   content: htmlContent,
   format: ContentFormat.HTML,
   schema: mySchema,
@@ -366,6 +361,7 @@ The library can clean URLs to remove tracking parameters and unnecessary compone
 
 ```typescript
 const result = await extract({
+  llm: myLLM,
   content: htmlContent,
   format: ContentFormat.HTML,
   schema: mySchema,
@@ -385,11 +381,7 @@ const result = await extract({
 
 ### LLM Configuration
 
-The library supports three ways to configure the LLM, in order of priority:
-
-1. **Custom LLM instance** (`llm`) — Pass any [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) directly. This gives you full control over the model, parameters, and provider.
-2. **Built-in provider shortcuts** (`provider` + API key) — Use the built-in OpenAI or Google Gemini providers with just an API key and optional model name.
-3. **Environment variables** (`GOOGLE_API_KEY` or `OPENAI_API_KEY`) — Falls back to env vars when using built-in providers without explicit keys.
+Pass a [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance via the `llm` option. Install the LangChain integration for your provider (e.g. `@langchain/openai`, `@langchain/google-genai`, `@langchain/anthropic`) and configure API keys on the model instance.
 
 ### `extract<T>(options: ExtractorOptions<T>): Promise<ExtractorResult<T>>`
 
@@ -399,16 +391,11 @@ Main function to extract structured data from content.
 
 | Option | Type | Description | Default |
 |--------|------|-------------|---------|
+| `llm` | `BaseChatModel` | A [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance (ChatOpenAI, ChatGoogleGenerativeAI, ChatAnthropic, etc.) | Required |
 | `content` | `string` | HTML, markdown, or plain text content to extract from | Required |
 | `format` | `ContentFormat` | Content format (HTML, MARKDOWN, or TXT) | Required |
 | `schema` | `z.ZodTypeAny` | Zod schema defining the structure to extract | Required |
-| `llm` | `BaseChatModel` | A [LangChain chat model](https://js.langchain.com/docs/integrations/chat/) instance. When provided, `provider`, `modelName`, `temperature`, and API key options are ignored. | `undefined` |
 | `prompt` | `string` | Custom prompt to guide the extraction process | Internal default prompt |
-| `provider` | `LLMProvider` | LLM provider (GOOGLE_GEMINI or OPENAI). Ignored when `llm` is provided. | `LLMProvider.GOOGLE_GEMINI` |
-| `modelName` | `string` | Model name to use. Ignored when `llm` is provided. | Provider-specific default, Google Gemini 2.5 flash or OpenAI GPT-4o mini  |
-| `googleApiKey` | `string` | Google Gemini API key (if using Google Gemini provider). Ignored when `llm` is provided. | From env `GOOGLE_API_KEY` |
-| `openaiApiKey` | `string` | OpenAI API key (if using OpenAI provider). Ignored when `llm` is provided. | From env `OPENAI_API_KEY` |
-| `temperature` | `number` | Temperature for the LLM (0-1). Ignored when `llm` is provided. | `0` |
 | `htmlExtractionOptions` | `HTMLExtractionOptions` | HTML-specific options for content extraction [see below](#htmlextractionoptions) | `{}` |
 | `sourceUrl` | `string` | URL of the HTML content, required when format is HTML to properly handle relative URLs | Required for HTML format |
 | `maxInputTokens` | `number` | Maximum number of input tokens to send to the LLM. Uses a rough conversion of 4 characters per token. When specified, content will be truncated if the total prompt size exceeds this limit. | `undefined` |
@@ -696,6 +683,7 @@ const schema = z.object({
 });
 
 const result = await extract({
+  llm: myLLM,
   content: markdownContent,
   format: ContentFormat.MARKDOWN,
   schema,
diff --git a/package.json b/package.json
index ccca8e0..1ab543a 100644
--- a/package.json
+++ b/package.json
@@ -56,8 +56,6 @@
   "homepage": "https://github.com/lightfeed/extractor#readme",
   "dependencies": {
     "@langchain/core": "^1.1.31",
-    "@langchain/google-genai": "^2.1.24",
-    "@langchain/openai": "^1.2.12",
     "cheerio": "^1.0.0",
     "jsonrepair": "^3.12.0",
     "langchain": "^1.2.30",
@@ -68,6 +66,8 @@
     "zod": "^3.24.3"
   },
   "devDependencies": {
+    "@langchain/google-genai": "^2.1.24",
+    "@langchain/openai": "^1.2.12",
     "@types/jest": "^29.5.12",
     "@types/node": "^22.15.3",
     "@types/turndown": "^5.0.5",
diff --git a/src/dev/runLocalTest.ts b/src/dev/runLocalTest.ts
index e6b373a..1f51da9 100644
--- a/src/dev/runLocalTest.ts
+++ b/src/dev/runLocalTest.ts
@@ -2,11 +2,30 @@ import * as fs from "fs";
 import * as path from "path";
 import { config } from "dotenv";
 import { z } from "zod";
-import { extract, ContentFormat, LLMProvider } from "../index";
+import { ChatOpenAI } from "@langchain/openai";
+import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
+import { extract, ContentFormat } from "../index";
 
 // Load environment variables from .env file
 config({ path: path.resolve(process.cwd(), ".env") });
 
+type Provider = "gemini" | "openai";
+
+function createLLM(provider: Provider) {
+  if (provider === "gemini") {
+    return new ChatGoogleGenerativeAI({
+      apiKey: process.env.GOOGLE_API_KEY,
+      model: "gemini-2.5-flash",
+      temperature: 0,
+    });
+  }
+  return new ChatOpenAI({
+    apiKey: process.env.OPENAI_API_KEY,
+    modelName: "gpt-4o-mini",
+    temperature: 0,
+  });
+}
+
 // Helper to load HTML test fixtures
 function loadFixture(filename: string): string {
   return fs.readFileSync(
@@ -67,34 +86,25 @@ const productSchemaOpenAI = z.object({
 });
 
 // Test functions
-async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
+async function testBlogExtraction(provider: Provider = "gemini") {
   console.log(`Testing blog post extraction with ${provider}...`);
 
   try {
     const html = loadFixture("blog-post.html");
 
-    // Check for required API key
-    if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) {
+    if (provider === "gemini" && !process.env.GOOGLE_API_KEY) {
       console.error("Error: GOOGLE_API_KEY environment variable is required");
       process.exit(1);
-    } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) {
+    } else if (provider === "openai" && !process.env.OPENAI_API_KEY) {
       console.error("Error: OPENAI_API_KEY environment variable is required");
       process.exit(1);
     }
 
-    const apiKey =
-      provider === LLMProvider.GOOGLE_GEMINI
-        ? process.env.GOOGLE_API_KEY
-        : process.env.OPENAI_API_KEY;
-
     const result = await extract({
+      llm: createLLM(provider),
       content: html,
       format: ContentFormat.HTML,
-      schema:
-        provider === LLMProvider.GOOGLE_GEMINI ? blogSchema : blogSchemaOpenAI,
-      provider,
-      googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined,
-      openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined,
+      schema: provider === "gemini" ? blogSchema : blogSchemaOpenAI,
       htmlExtractionOptions: {
         extractMainHtml: false,
       },
@@ -113,36 +123,25 @@ async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
   }
 }
 
-async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
+async function testProductExtraction(provider: Provider = "gemini") {
   console.log(`Testing product listing extraction with ${provider}...`);
 
   try {
     const html = loadFixture("product-list.html");
 
-    // Check for required API key
-    if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) {
+    if (provider === "gemini" && !process.env.GOOGLE_API_KEY) {
       console.error("Error: GOOGLE_API_KEY environment variable is required");
       process.exit(1);
-    } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) {
+    } else if (provider === "openai" && !process.env.OPENAI_API_KEY) {
       console.error("Error: OPENAI_API_KEY environment variable is required");
       process.exit(1);
     }
 
-    const apiKey =
-      provider === LLMProvider.GOOGLE_GEMINI
-        ? process.env.GOOGLE_API_KEY
-        : process.env.OPENAI_API_KEY;
-
     const result = await extract({
+      llm: createLLM(provider),
       content: html,
       format: ContentFormat.HTML,
-      schema:
-        provider === LLMProvider.GOOGLE_GEMINI
-          ? productSchema
-          : productSchemaOpenAI,
-      provider,
-      googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined,
-      openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined,
+      schema: provider === "gemini" ? productSchema : productSchemaOpenAI,
       htmlExtractionOptions: {
         extractMainHtml: true,
       },
@@ -163,38 +162,32 @@ async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
 
 // Run tests based on command line arguments
 async function main() {
-  // Parse arguments: content type and provider
   const args = process.argv.slice(2);
-  const contentType = args[0] || "all"; // 'blog', 'product', or 'all'
-  const provider =
-    args[1]?.toUpperCase() === "OPENAI"
-      ? LLMProvider.OPENAI
-      : args[1]?.toUpperCase() === "GEMINI"
-      ? LLMProvider.GOOGLE_GEMINI
-      : "all"; // 'OPENAI', 'GEMINI', or 'all'
+  const contentType = args[0] || "all";
+  const providerArg = args[1]?.toUpperCase();
+  const provider: Provider | "all" =
+    providerArg === "OPENAI" ? "openai" : providerArg === "GEMINI" ? "gemini" : "all";
 
   console.log("API Keys available:");
   console.log(`- GOOGLE_API_KEY: ${process.env.GOOGLE_API_KEY ? "Yes" : "No"}`);
   console.log(`- OPENAI_API_KEY: ${process.env.OPENAI_API_KEY ? "Yes" : "No"}`);
   console.log("");
 
-  // Run blog tests
   if (contentType === "blog" || contentType === "all") {
-    if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") {
-      await testBlogExtraction(LLMProvider.GOOGLE_GEMINI);
+    if (provider === "gemini" || provider === "all") {
+      await testBlogExtraction("gemini");
     }
-    if (provider === LLMProvider.OPENAI || provider === "all") {
-      await testBlogExtraction(LLMProvider.OPENAI);
+    if (provider === "openai" || provider === "all") {
+      await testBlogExtraction("openai");
     }
   }
 
-  // Run product tests
   if (contentType === "product" || contentType === "all") {
-    if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") {
-      await testProductExtraction(LLMProvider.GOOGLE_GEMINI);
+    if (provider === "gemini" || provider === "all") {
+      await testProductExtraction("gemini");
     }
-    if (provider === LLMProvider.OPENAI || provider === "all") {
-      await testProductExtraction(LLMProvider.OPENAI);
+    if (provider === "openai" || provider === "all") {
+      await testProductExtraction("openai");
     }
   }
 }
diff --git a/src/dev/testBrowserExtraction.ts b/src/dev/testBrowserExtraction.ts
index 672816a..bcc2aca 100644
--- a/src/dev/testBrowserExtraction.ts
+++ b/src/dev/testBrowserExtraction.ts
@@ -1,4 +1,5 @@
-import { extract, ContentFormat, LLMProvider, Browser } from "../index";
+import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
+import { extract, ContentFormat, Browser } from "../index";
 import { z } from "zod";
 import * as path from "path";
 import { config } from "dotenv";
@@ -65,12 +66,15 @@ async function testProductCatalogExtraction() {
     console.log("\n🧠 Extracting product data using LLM...");
 
     const result = await extract({
+      llm: new ChatGoogleGenerativeAI({
+        apiKey: process.env.GOOGLE_API_KEY,
+        model: "gemini-2.5-flash",
+        temperature: 0,
+      }),
       content: html,
       format: ContentFormat.HTML,
       sourceUrl: testUrl,
       schema: productCatalogSchema,
-      provider: LLMProvider.GOOGLE_GEMINI,
-      googleApiKey: process.env.GOOGLE_API_KEY,
       htmlExtractionOptions: {
         extractMainHtml: true,
         includeImages: true,
diff --git a/src/dev/testUsage.ts b/src/dev/testUsage.ts
index d5c15ff..86887c6 100644
--- a/src/dev/testUsage.ts
+++ b/src/dev/testUsage.ts
@@ -1,7 +1,8 @@
 import { config } from "dotenv";
 import * as path from "path";
 import { z } from "zod";
-import { extract, ContentFormat, LLMProvider } from "../index";
+import { ChatOpenAI } from "@langchain/openai";
+import { extract, ContentFormat } from "../index";
 
 // Load environment variables from .env file
 config({ path: path.resolve(process.cwd(), ".env") });
@@ -32,11 +33,14 @@ This is a test of the usage tracking system.
   try {
     // Run extraction
     const result = await extract({
+      llm: new ChatOpenAI({
+        apiKey: process.env.OPENAI_API_KEY,
+        modelName: "gpt-4o-mini",
+        temperature: 0,
+      }),
       content: markdown,
       format: ContentFormat.MARKDOWN,
       schema,
-      provider: LLMProvider.OPENAI,
-      openaiApiKey: process.env.OPENAI_API_KEY,
     });
 
     // Log the results
diff --git a/src/example.ts b/src/example.ts
index 470c715..2e694e2 100644
--- a/src/example.ts
+++ b/src/example.ts
@@ -1,4 +1,5 @@
-import { extract, ContentFormat, LLMProvider } from "./index";
+import { ChatOpenAI } from "@langchain/openai";
+import { extract, ContentFormat } from "./index";
 import { z } from "zod";
 import { config } from "dotenv";
 import * as path from "path";
@@ -11,8 +12,8 @@ config({ path: path.resolve(process.cwd(), ".env") });
 async function example() {
   try {
     // Check if API key is available
-    if (!process.env.GOOGLE_API_KEY) {
-      console.error("Error: GOOGLE_API_KEY environment variable is required");
+    if (!process.env.OPENAI_API_KEY) {
+      console.error("Error: OPENAI_API_KEY environment variable is required");
       return;
     }
 
@@ -46,12 +47,14 @@ async function example() {
 
     // Extract data from HTML
     const result = await extract({
+      llm: new ChatOpenAI({
+        apiKey: process.env.OPENAI_API_KEY,
+        modelName: "gpt-4o-mini",
+        temperature: 0,
+      }),
       content: htmlContent,
       format: ContentFormat.HTML,
       schema,
-      // Using Google Gemini by default
-      openaiApiKey: process.env.OPENAI_API_KEY,
-      provider: LLMProvider.OPENAI,
       sourceUrl,
     });
 
diff --git a/src/extractors.ts b/src/extractors.ts
index 683518a..24bb4c5 100644
--- a/src/extractors.ts
+++ b/src/extractors.ts
@@ -1,8 +1,6 @@
-import { ChatOpenAI } from "@langchain/openai";
-import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
 import type { BaseChatModel } from "@langchain/core/language_models/chat_models";
 import { z } from "zod";
-import { LLMProvider, Usage, ContentFormat } from "./types";
+import { Usage, ContentFormat } from "./types";
 import { AIMessage } from "@langchain/core/messages";
 import {
   safeSanitizedParser,
@@ -40,35 +38,6 @@ export function getUsage(output: LLMResult): Usage {
   return usage;
 }
 
-/**
- * Create LLM instance based on provider and configuration
- */
-export function createLLM(
-  provider: LLMProvider,
-  modelName: string,
-  apiKey: string,
-  temperature: number = 0,
-) {
-  switch (provider) {
-    case LLMProvider.OPENAI:
-      return new ChatOpenAI({
-        apiKey,
-        modelName,
-        temperature,
-      });
-
-    case LLMProvider.GOOGLE_GEMINI:
-      return new ChatGoogleGenerativeAI({
-        apiKey,
-        model: modelName,
-        temperature,
-      });
-
-    default:
-      throw new Error(`Unsupported LLM provider: ${provider}`);
-  }
-}
-
 interface ExtractionPromptOptions {
   format: string;
   content: string;
diff --git a/src/index.ts b/src/index.ts
index 0136b49..8865e00 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,69 +1,21 @@
 import { z } from "zod";
-import type { BaseChatModel } from "@langchain/core/language_models/chat_models";
 import { htmlToMarkdown } from "./converters";
-import { createLLM, extractWithLLM } from "./extractors";
+import { extractWithLLM } from "./extractors";
 import {
   ContentFormat,
-  LLMProvider,
   ExtractorOptions,
   ExtractorResult,
   HTMLExtractionOptions,
 } from "./types";
 
-// Default model names
-const DEFAULT_MODELS = {
-  [LLMProvider.GOOGLE_GEMINI]: "gemini-2.5-flash",
-  [LLMProvider.OPENAI]: "gpt-4o-mini",
-};
-
-/**
- * Resolve the LLM to use: either a user-provided instance or one created from provider config.
- */
-function resolveLLM<T extends z.ZodTypeAny>(
-  options: ExtractorOptions<T>,
-): BaseChatModel {
-  if (options.llm) {
-    return options.llm;
-  }
-
-  const provider = options.provider ?? LLMProvider.GOOGLE_GEMINI;
-  let apiKey: string;
-
-  if (provider === LLMProvider.GOOGLE_GEMINI) {
-    apiKey = options.googleApiKey ?? process.env.GOOGLE_API_KEY ?? "";
-    if (!apiKey) {
-      throw new Error(
-        "Google API key is required. Provide googleApiKey option or set GOOGLE_API_KEY environment variable."
-      );
-    }
-  } else if (provider === LLMProvider.OPENAI) {
-    apiKey = options.openaiApiKey ?? process.env.OPENAI_API_KEY ?? "";
-    if (!apiKey) {
-      throw new Error(
-        "OpenAI API key is required. Provide openaiApiKey option or set OPENAI_API_KEY environment variable."
-      );
-    }
-  } else {
-    throw new Error(`Unsupported LLM provider: ${provider}`);
-  }
-
-  const modelName = options.modelName ?? DEFAULT_MODELS[provider];
-  return createLLM(provider, modelName, apiKey, options.temperature ?? 0);
-}
-
 /**
  * Extract structured data from HTML, markdown, or plain text content using an LLM
  *
  * @param options Configuration options for extraction
+ * @param options.llm Required. The LangChain chat model instance used for extraction (e.g. ChatOpenAI, ChatAnthropic)
  * @param options.content HTML, markdown, or plain text content to extract from
  * @param options.format Content format (HTML, MARKDOWN, or TXT)
  * @param options.schema Zod schema defining the structure to extract
- * @param options.llm A LangChain chat model instance. When provided, provider/modelName/apiKey options are ignored.
- * @param options.provider LLM provider (GOOGLE_GEMINI or OPENAI). Ignored when llm is provided.
- * @param options.modelName Model name to use (provider-specific). Ignored when llm is provided.
- * @param options.googleApiKey Google API key (if using Google Gemini provider). Ignored when llm is provided.
- * @param options.openaiApiKey OpenAI API key (if using OpenAI provider). Ignored when llm is provided.
- * @param options.temperature Temperature for the LLM (0-1). Ignored when llm is provided.
  * @param options.prompt Custom prompt to guide the extraction process
  * @param options.sourceUrl URL of the HTML content (required for HTML format)
  * @param options.htmlExtractionOptions HTML-specific options for content extraction
@@ -74,7 +26,6 @@ function resolveLLM<T extends z.ZodTypeAny>(
 export async function extract<T extends z.ZodTypeAny>(
   options: ExtractorOptions<T>
 ): Promise<ExtractorResult<z.infer<T>>> {
-  const llm = resolveLLM(options);
 
   // Validate sourceUrl for HTML format
   if (options.format === ContentFormat.HTML && !options.sourceUrl) {
@@ -100,7 +51,7 @@ export async function extract<T extends z.ZodTypeAny>(
   const { data, usage } = await extractWithLLM(
     content,
     options.schema,
-    llm,
+    options.llm,
     options.prompt,
     formatToUse.toString(),
     options.maxInputTokens,
diff --git a/src/types.ts b/src/types.ts
index e0ec0fd..3642a0b 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -11,14 +11,6 @@ export enum ContentFormat {
   TXT = "txt",
 }
 
-/**
- * Supported LLM providers
- */
-export enum LLMProvider {
-  OPENAI = "openai",
-  GOOGLE_GEMINI = "google_gemini",
-}
-
 /**
  * Proxy configuration for network requests
  */
@@ -129,32 +121,16 @@ export interface ExtractorOptions<T extends z.ZodTypeAny> {
 
   /**
    * A LangChain chat model instance to use for extraction.
-   * When provided, `provider`, `modelName`, and API key options are ignored.
    * Accepts any LangChain chat model (ChatOpenAI, ChatAnthropic, ChatGoogleGenerativeAI, etc.).
    *
    * @example
    * ```typescript
-   * import { ChatAnthropic } from "@langchain/anthropic";
-   * const llm = new ChatAnthropic({ model: "claude-sonnet-4-20250514" });
+   * import { ChatOpenAI } from "@langchain/openai";
+   * const llm = new ChatOpenAI({ model: "gpt-4o-mini" });
    * const result = await extract({ llm, content, format, schema });
    * ```
    */
-  llm?: BaseChatModel;
-
-  /** LLM Provider (OpenAI or Google Gemini). Ignored when `llm` is provided. */
-  provider?: LLMProvider;
-
-  /** Model name to use. Ignored when `llm` is provided. */
-  modelName?: string;
-
-  /** OpenAI API key. Ignored when `llm` is provided. */
-  openaiApiKey?: string;
-
-  /** Google API key. Ignored when `llm` is provided. */
-  googleApiKey?: string;
-
-  /** Temperature for the LLM (0-1), defaults to 0. Ignored when `llm` is provided. */
-  temperature?: number;
+  llm: BaseChatModel;
 
   /** HTML-specific extraction options (only applies when format is HTML) */
   htmlExtractionOptions?: HTMLExtractionOptions;
diff --git a/tests/integration/browser-extraction.test.ts b/tests/integration/browser-extraction.test.ts
index 6230796..f62125f 100644
--- a/tests/integration/browser-extraction.test.ts
+++ b/tests/integration/browser-extraction.test.ts
@@ -1,6 +1,15 @@
-import { extract, ContentFormat, LLMProvider, Browser } from "../../src/index";
+import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
+import { extract, ContentFormat, Browser } from "../../src/index";
 import { z } from "zod";
 
+function createGeminiLLM() {
+  return new ChatGoogleGenerativeAI({
+    apiKey: process.env.GOOGLE_API_KEY,
+    model: "gemini-2.5-flash",
+    temperature: 0,
+  });
+}
+
 const testSchema = z.object({
   title: z.string(),
   description: z.string().optional(),
@@ -33,12 +42,11 @@ describe("Browser + Extraction Integration Tests", () => {
 
         // Extract data from the loaded HTML
         const result = await extract({
+          llm: createGeminiLLM(),
           content: html,
           format: ContentFormat.HTML,
           sourceUrl: testUrl,
           schema: testSchema,
-          provider: LLMProvider.GOOGLE_GEMINI,
-          googleApiKey: process.env.GOOGLE_API_KEY,
         });
 
         expect(result.data).toBeDefined();
diff --git a/tests/integration/extract.test.ts b/tests/integration/extract.test.ts
index f944306..26d3054 100644
--- a/tests/integration/extract.test.ts
+++ b/tests/integration/extract.test.ts
@@ -1,14 +1,31 @@
 import * as fs from "fs";
 import * as path from "path";
 import { z } from "zod";
+import { ChatOpenAI } from "@langchain/openai";
+import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
 import {
   extract,
   ContentFormat,
-  LLMProvider,
   ExtractorResult,
 } from "../../src";
 import { htmlToMarkdown } from "../../src/converters";
 
+function createGeminiLLM() {
+  return new ChatGoogleGenerativeAI({
+    apiKey: process.env.GOOGLE_API_KEY,
+    model: "gemini-2.5-flash",
+    temperature: 0,
+  });
+}
+
+function createOpenAILLM(modelName = "gpt-4o-mini") {
+  return new ChatOpenAI({
+    apiKey: process.env.OPENAI_API_KEY,
+    modelName,
+    temperature: 0,
+  });
+}
+
 // Read the sample HTML files
 const blogPostHtml = fs.readFileSync(
   path.resolve(__dirname, "../fixtures/blog-post.html"),
@@ -77,11 +94,10 @@ describe("Extract Integration Tests", () => {
   describe("Blog Post Extraction", () => {
     test("should extract blog post data using Google Gemini default model", async () => {
       const result = await extract({
+        llm: createGeminiLLM(),
         content: blogPostHtml,
         format: ContentFormat.HTML,
         schema: blogSchema,
-        provider: LLMProvider.GOOGLE_GEMINI,
-        googleApiKey: process.env.GOOGLE_API_KEY,
         sourceUrl: "https://example.com/blog/async-await",
       });
 
@@ -90,11 +106,10 @@ describe("Extract Integration Tests", () => {
 
     test("should extract blog post data using OpenAI default model", async () => {
       const result = await extract({
+        llm: createOpenAILLM(),
         content: blogPostHtml,
         format: ContentFormat.HTML,
         schema: blogSchemaOpenAI,
-        provider: LLMProvider.OPENAI,
-        openaiApiKey: process.env.OPENAI_API_KEY,
         sourceUrl: "https://example.com/blog/async-await",
       });
 
@@ -223,11 +238,10 @@ describe("Extract Integration Tests", () => {
   describe("Product List Extraction", () => {
     test("should extract product list data using Google Gemini", async () => {
       const result = await extract({
+        llm: createGeminiLLM(),
         content: productListHtml,
         format: ContentFormat.HTML,
         schema: productSchema,
-        provider: LLMProvider.GOOGLE_GEMINI,
-        googleApiKey: process.env.GOOGLE_API_KEY,
         sourceUrl: "https://example.com/products",
         htmlExtractionOptions: {
           extractMainHtml: true,
@@ -239,11 +253,10 @@ describe("Extract Integration Tests", () => {
 
     test("should extract product list data using OpenAI", async () => {
       const result = await extract({
+        llm: createOpenAILLM(),
         content: productListHtml,
         format: ContentFormat.HTML,
         schema: productSchemaOpenAI,
-        provider: LLMProvider.OPENAI,
-        openaiApiKey: process.env.OPENAI_API_KEY,
         sourceUrl: "https://example.com/products",
         htmlExtractionOptions: {
           extractMainHtml: true,
@@ -268,9 +281,7 @@ describe("Extract Integration Tests", () => {
           // a value that is not expected by the schema.
           price: z.number().describe("Use 'N/A' if not available").nullable(),
         }),
-        provider: LLMProvider.OPENAI,
-        openaiApiKey: process.env.OPENAI_API_KEY,
-        modelName: "gpt-3.5-turbo",
+        llm: createOpenAILLM("gpt-3.5-turbo"),
       });
       expect(result.data).toEqual(
         expect.objectContaining({
@@ -299,8 +310,7 @@ describe("Extract Integration Tests", () => {
           // to fail in some cases to return the structured output.
           content: z.string().optional(),
         }),
-        provider: LLMProvider.GOOGLE_GEMINI,
-        googleApiKey: process.env.GOOGLE_API_KEY,
+        llm: createGeminiLLM(),
         sourceUrl: "https://example.com/blog/async-await",
       });
       expect(result.data).toBeDefined();
@@ -319,11 +329,10 @@ describe("Extract Integration Tests", () => {
       });
 
       const result = await extract({
+        llm: createOpenAILLM(),
         content: markdownContent,
         format: ContentFormat.MARKDOWN,
         schema,
-        provider: LLMProvider.OPENAI,
-        openaiApiKey: process.env.OPENAI_API_KEY,
       });
 
       // Verify the extracted data
@@ -347,11 +356,10 @@ describe("Extract Integration Tests", () => {
       });
 
       const result = await extract({
+        llm: createOpenAILLM(),
         content: markdownContent,
         format: ContentFormat.MARKDOWN,
         schema,
-        provider: LLMProvider.OPENAI,
-        openaiApiKey: process.env.OPENAI_API_KEY,
       });
 
       // Verify the extracted data
@@ -378,11 +386,10 @@ describe("Extract Integration Tests", () => {
       };
 
       const result = await extract({
+        llm: createGeminiLLM(),
         content: blogPostHtml,
         format: ContentFormat.HTML,
         schema: blogSchema,
-        provider: LLMProvider.GOOGLE_GEMINI,
-        googleApiKey: process.env.GOOGLE_API_KEY,
         sourceUrl: "https://example.com/blog/async-await",
         extractionContext: partialData,
       });
@@ -400,11 +407,10 @@ describe("Extract Integration Tests", () => {
       };
 
       const result = await extract({
+        llm: createOpenAILLM(),
         content: blogPostHtml,
         format: ContentFormat.HTML,
         schema: blogSchemaOpenAI,
-        provider: LLMProvider.OPENAI,
-        openaiApiKey: process.env.OPENAI_API_KEY,
         sourceUrl: "https://example.com/blog/async-await",
         extractionContext: partialData,
       });
@@ -436,11 +442,10 @@ describe("Extract Integration Tests", () => {
       };
 
       const result = await extract({
+        llm: createGeminiLLM(),
         content: productListHtml,
         format: ContentFormat.HTML,
         schema: productSchema,
-        provider: LLMProvider.GOOGLE_GEMINI,
-        googleApiKey: process.env.GOOGLE_API_KEY,
         sourceUrl: "https://example.com/products",
         extractionContext: partialData,
         prompt:
@@ -596,11 +601,10 @@ describe("Image Extraction Integration Tests", () => {
   // Test with OpenAI
   test("should extract images using OpenAI when includeImages is true", async () => {
     const result = await extract({
+      llm: createOpenAILLM(),
       content: articleWithImages,
       format: ContentFormat.HTML,
       schema: articleSchemaOpenAI,
-      provider: LLMProvider.OPENAI,
-      openaiApiKey: process.env.OPENAI_API_KEY,
       htmlExtractionOptions: {
         includeImages: true,
       },
@@ -613,11 +617,10 @@ describe("Image Extraction Integration Tests", () => {
   // Test with Google Gemini
   test("should extract images using Google Gemini when includeImages is true", async () => {
     const result = await extract({
+      llm: createGeminiLLM(),
       content: articleWithImages,
       format: ContentFormat.HTML,
       schema: articleSchema,
-      provider: LLMProvider.GOOGLE_GEMINI,
-      googleApiKey: process.env.GOOGLE_API_KEY,
       htmlExtractionOptions: {
         includeImages: true,
       },
diff --git a/tests/integration/processedContent.test.ts b/tests/integration/processedContent.test.ts
index 3cc1392..663db63 100644
--- a/tests/integration/processedContent.test.ts
+++ b/tests/integration/processedContent.test.ts
@@ -1,5 +1,14 @@
 import { z } from "zod";
-import { extract, ContentFormat, LLMProvider } from "../../src";
+import { ChatOpenAI } from "@langchain/openai";
+import { extract, ContentFormat } from "../../src";
+
+function createOpenAILLM() {
+  return new ChatOpenAI({
+    apiKey: process.env.OPENAI_API_KEY,
+    modelName: "gpt-4o-mini",
+    temperature: 0,
+  });
+}
 
 describe("ProcessedContent Integration Tests", () => {
   const simpleSchema = z.object({
@@ -25,11 +34,10 @@ describe("ProcessedContent Integration Tests", () => {
       "Title: Simple Test\n\nThis is a test of plain text extraction.";
 
     const result = await extract({
+      llm: createOpenAILLM(),
       content: plainTextContent,
       format: ContentFormat.TXT,
       schema: simpleSchema,
-      provider: LLMProvider.OPENAI,
-      openaiApiKey: process.env.OPENAI_API_KEY,
     });
 
     // Verify the processedContent is the same as the original content
@@ -46,11 +54,10 @@ describe("ProcessedContent Integration Tests", () => {
       "# Simple Test\n\nThis is a test of markdown extraction.";
 
     const result = await extract({
+      llm: createOpenAILLM(),
       content: markdownContent,
       format: ContentFormat.MARKDOWN,
       schema: simpleSchema,
-      provider: LLMProvider.OPENAI,
-      openaiApiKey: process.env.OPENAI_API_KEY,
     });
 
     // Verify the processedContent is the same as the original content
@@ -67,11 +74,10 @@ describe("ProcessedContent Integration Tests", () => {
       "<h1>Simple Test</h1><p>This is a test of HTML extraction.</p>";
 
     const result = await extract({
+      llm: createOpenAILLM(),
       content: htmlContent,
       format: ContentFormat.HTML,
       schema: simpleSchema,
-      provider: LLMProvider.OPENAI,
-      openaiApiKey: process.env.OPENAI_API_KEY,
       sourceUrl: "https://example.com",
     });
 
diff --git a/tests/unit/extractors.test.ts b/tests/unit/extractors.test.ts
index b249b26..f5c9272 100644
--- a/tests/unit/extractors.test.ts
+++ b/tests/unit/extractors.test.ts
@@ -1,47 +1,26 @@
 import {
   getUsage,
-  createLLM,
   extractWithLLM,
   truncateContent,
   generateExtractionPrompt,
 } from "../../src/extractors";
-import { LLMProvider, ContentFormat } from "../../src/types";
+import { ContentFormat } from "../../src/types";
 import { z } from "zod";
 
-// Mock the LLM providers
-jest.mock("@langchain/openai", () => ({
-  ChatOpenAI: jest.fn().mockImplementation(() => ({
-    constructor: { name: "ChatOpenAI" },
+function createMockLLM() {
+  return {
     withStructuredOutput: jest.fn().mockImplementation(() => ({
       invoke: jest.fn().mockResolvedValue({
         parsed: { title: "Test Title", content: "Test Content" },
         raw: {
           tool_calls: [
-            {
-              args: { title: "Test Title", content: "Test Content" },
-            },
+            { args: { title: "Test Title", content: "Test Content" } },
           ],
         },
       }),
     })),
-  })),
-}));
-
-jest.mock("@langchain/google-genai", () => ({
-  ChatGoogleGenerativeAI: jest.fn().mockImplementation(() => ({
-    constructor: { name: "ChatGoogleGenerativeAI" },
-    withStructuredOutput: jest.fn().mockImplementation(() => ({
-      invoke: jest.fn().mockResolvedValue({
-        parsed: { title: "Test Title", content: "Test Content" },
-        raw: {
-          lc_kwargs: {
-            content: '{"title":"Test Title","content":"Test Content"}',
-          },
-        },
-      }),
-    })),
-  })),
-}));
+  };
+}
 
 describe("extractors", () => {
   const mockSchema = z.object({
@@ -50,7 +29,6 @@ describe("extractors", () => {
   });
 
   const mockContent = "Test content";
-  const mockApiKey = "test-api-key";
 
   beforeEach(() => {
     jest.clearAllMocks();
@@ -95,57 +73,9 @@ describe("extractors", () => {
     });
   });
 
-  describe("createLLM", () => {
-    it("should create ChatOpenAI instance for OPENAI provider", () => {
-      const llm = createLLM(
-        LLMProvider.OPENAI,
-        "gpt-4o-mini",
-        "fake-api-key",
-        0
-      );
-
-      expect(llm).toBeDefined();
-      expect(llm.constructor.name).toBe("ChatOpenAI");
-    });
-
-    it("should create ChatGoogleGenerativeAI instance for GOOGLE_GEMINI provider", () => {
-      const llm = createLLM(
-        LLMProvider.GOOGLE_GEMINI,
-        "gemini-2.5-flash",
-        "fake-api-key",
-        0
-      );
-
-      expect(llm).toBeDefined();
-      expect(llm.constructor.name).toBe("ChatGoogleGenerativeAI");
-    });
-
-    it("should throw error for unsupported provider", () => {
-      expect(() => {
-        // @ts-ignore - Testing invalid provider
-        createLLM("unsupported-provider", "model", "api-key", 0);
-      }).toThrow("Unsupported LLM provider");
-    });
-  });
-
   describe("extractWithLLM", () => {
-    it("should extract data using OpenAI", async () => {
-      const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0);
-      const result = await extractWithLLM(mockContent, mockSchema, llm);
-
-      expect(result.data).toEqual({
-        title: "Test Title",
-        content: "Test Content",
-      });
-    });
-
-    it("should extract data using Google Gemini", async () => {
-      const llm = createLLM(
-        LLMProvider.GOOGLE_GEMINI,
-        "gemini-2.5-flash",
-        mockApiKey,
-        0
-      );
+    it("should extract data using provided LLM", async () => {
+      const llm = createMockLLM() as any;
       const result = await extractWithLLM(mockContent, mockSchema, llm);
 
       expect(result.data).toEqual({
@@ -155,7 +85,7 @@ describe("extractors", () => {
     });
 
     it("should handle custom prompts", async () => {
-      const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0);
+      const llm = createMockLLM() as any;
       const customPrompt = "Extract the main topic and summary";
       const result = await extractWithLLM(
         mockContent,
@@ -171,7 +101,7 @@ describe("extractors", () => {
     });
 
     it("should handle different content formats", async () => {
-      const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0);
+      const llm = createMockLLM() as any;
       const result = await extractWithLLM(
         mockContent,
         mockSchema,
@@ -187,7 +117,7 @@ describe("extractors", () => {
     });
 
     it("should handle extraction context", async () => {
-      const llm = createLLM(LLMProvider.OPENAI, "gpt-4o-mini", mockApiKey, 0);
+      const llm = createMockLLM() as any;
       const extractionContext = {
         title: "Existing Title",
         content: "",

From 5d1bd21089ee1ad3f8e846302f6387c4594c37b4 Mon Sep 17 00:00:00 2001
From: Andrew Zhong <axzhong3@gmail.com>
Date: Tue, 10 Mar 2026 23:29:47 -0700
Subject: [PATCH 3/3] Update package.json to add @langchain/core as a peer
 dependency and adjust README installation instructions for LangChain
 integration packages.

---
 README.md    | 19 +++++++++++--------
 package.json |  6 ++++--
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 148e726..698caad 100644
--- a/README.md
+++ b/README.md
@@ -51,20 +51,23 @@ Lightfeed Extractor is a Typescript library built for robust web data extraction
 ## Installation
 
 ```bash
-npm install @lightfeed/extractor @langchain/core
+npm install @lightfeed/extractor @langchain/openai
 ```
 
-Install the LangChain integration for your chosen LLM provider:
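+Install the extractor together with the LangChain integration package for your chosen provider. `@langchain/core` is a peer dependency — npm 7+ installs it automatically, and the extractor and the integration package share a single copy: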
 
 ```bash
-# For OpenAI
-npm install @langchain/openai
+# OpenAI
+npm install @lightfeed/extractor @langchain/openai
 
-# For Google Gemini
-npm install @langchain/google-genai
+# Google Gemini
+npm install @lightfeed/extractor @langchain/google-genai
 
-# For Anthropic, Ollama, etc.
-npm install @langchain/anthropic   # or @langchain/ollama, @langchain/mistralai, etc.
+# Anthropic
+npm install @lightfeed/extractor @langchain/anthropic
+
+# Ollama (local models)
+npm install @lightfeed/extractor @langchain/ollama
 ```
 
 ## Usage
diff --git a/package.json b/package.json
index 1ab543a..774356e 100644
--- a/package.json
+++ b/package.json
@@ -54,11 +54,12 @@
     "url": "https://github.com/lightfeed/extractor/issues"
   },
   "homepage": "https://github.com/lightfeed/extractor#readme",
+  "peerDependencies": {
+    "@langchain/core": ">=1.1.31"
+  },
   "dependencies": {
-    "@langchain/core": "^1.1.31",
     "cheerio": "^1.0.0",
     "jsonrepair": "^3.12.0",
-    "langchain": "^1.2.30",
     "playwright": "npm:rebrowser-playwright-core@1.49.1",
     "turndown": "^7.2.0",
     "xmldom": "^0.6.0",
@@ -66,6 +67,7 @@
     "zod": "^3.24.3"
   },
   "devDependencies": {
+    "@langchain/core": "^1.1.31",
     "@langchain/google-genai": "^2.1.24",
     "@langchain/openai": "^1.2.12",
     "@types/jest": "^29.5.12",