TanStack · tombeckenham · May 22, 2026 · May 22, 2026
diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md
@@ -0,0 +1,22 @@
+---
+'@tanstack/ai': minor
+'@tanstack/ai-openai': minor
+'@tanstack/ai-gemini': minor
+'@tanstack/ai-fal': minor
+'@tanstack/ai-grok': patch
+'@tanstack/ai-openrouter': patch
+'@tanstack/ai-event-client': patch
+---
+
+Add `imageInputs`, `videoInputs`, and `audioInputs` to `generateImage()` and `generateVideo()` for image-conditioned generation, image-to-image, multi-reference, image-to-video, and edit / inpaint flows. Each input part may carry a `metadata.role` hint (`'reference' | 'mask' | 'control' | 'start_frame' | 'end_frame' | 'character'`) that adapters use to route to the provider-specific field.
+
+Provider behavior in this release:
+
+- **OpenAI image** — `gpt-image-1` / `gpt-image-1-mini` route to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` throws a clear not-supported error.
+- **OpenAI video** — Sora-2 / Sora-2-Pro accept a single `input_reference` image; passing more than one throws.
+- **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") receive inputs as multimodal parts in `contents`. Imagen throws (text-only).
+- **fal.ai** — Inputs map to fal field names: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Video adapter additionally honors `role: 'start_frame'` / `'end_frame'`.
+- **Grok**, **OpenRouter** — Throw with a link to issue #618 (full support pending dedicated Imagine / multimodal injection work).
+- **Anthropic** — Unchanged (no image generation API).
+
+Closes #618.
diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md
@@ -82,6 +82,9 @@ All image adapters support these common options:
 | `prompt` | `string` | Text description of the image to generate (required) |
 | `numberOfImages` | `number` | Number of images to generate |
 | `size` | `string` | Size of the generated image in WIDTHxHEIGHT format |
+| `imageInputs?` | `ImagePart[]` | Image conditioning inputs for image-to-image, reference-guided, edit, or multi-reference generation. See [Image-Conditioned Generation](#image-conditioned-generation) below. |
+| `videoInputs?` | `VideoPart[]` | Video conditioning inputs. Provider support is limited; most adapters throw. |
+| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs. Provider support is limited; most adapters throw. |
 | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) |
 
 ### Size Options
@@ -132,6 +135,114 @@ const result = await generateImage({
 })
 ```
 
+## Image-Conditioned Generation
+
+`generateImage()` accepts an optional `imageInputs` field for image-to-image,
+reference-guided, multi-reference, and edit / inpaint flows. The field reuses
+the same `ImagePart` shape used elsewhere for multimodal content:
+
+```typescript
+import { generateImage, type ImagePart } from '@tanstack/ai'
+import { openaiImage } from '@tanstack/ai-openai'
+
+const reference: ImagePart = {
+  type: 'image',
+  source: { type: 'url', value: 'https://example.com/product.png' },
+}
+
+await generateImage({
+  adapter: openaiImage('gpt-image-1'),
+  prompt: 'Turn this into a cinematic product photo',
+  imageInputs: [reference],
+})
+```
+
+### Source format
+
+`ImagePart.source` is a discriminated union supporting both URLs and inline
+base64 data — pass whichever you have:
+
+```typescript
+// URL source
+{ type: 'image', source: { type: 'url', value: 'https://example.com/img.png' } }
+
+// Inline base64 data (mimeType required)
+{ type: 'image', source: { type: 'data', value: base64String, mimeType: 'image/png' } }
+```
+
+OpenAI's edit endpoint requires file uploads; the adapter fetches URL sources
+and converts base64 to a `File` automatically.
+
+### Role hints via `metadata.role`
+
+When a generation has multiple inputs with different roles (mask vs reference
+vs start/end frame), set `metadata.role` on each part. Adapters route by role
+to the provider-specific field; parts without a role fall back to positional
+mapping.
+
+| Role            | Maps to                                                                                |
+| --------------- | -------------------------------------------------------------------------------------- |
+| `'reference'`   | fal `reference_image_urls`; Gemini multimodal part; positional fallback                |
+| `'character'`   | Same as `'reference'`; Veo `referenceImages` slot (once a Veo adapter is available)    |
+| `'mask'`        | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url`                                  |
+| `'control'`     | fal `control_image_url` (ControlNet / depth / pose conditioning)                       |
+| `'start_frame'` | fal `start_image_url`; Veo `image` once available (used by `generateVideo`)            |
+| `'end_frame'`   | fal `end_image_url`; Veo `lastFrame` once available (used by `generateVideo`)          |
+
+#### Inpaint / edit with a mask
+
+```typescript
+await generateImage({
+  adapter: openaiImage('gpt-image-1'),
+  prompt: 'Replace the masked region with a tree',
+  imageInputs: [
+    {
+      type: 'image',
+      source: { type: 'url', value: photoUrl },
+    },
+    {
+      type: 'image',
+      source: { type: 'url', value: maskUrl },
+      metadata: { role: 'mask' },
+    },
+  ],
+})
+```
+
+#### Multi-reference composition
+
+```typescript
+const product: ImagePart = {
+  type: 'image',
+  source: { type: 'url', value: 'https://example.com/product.png' },
+}
+
+const style: ImagePart = {
+  type: 'image',
+  source: { type: 'url', value: 'https://example.com/style.png' },
+}
+
+await generateImage({
+  adapter: geminiImage('gemini-3.1-flash-image-preview'),
+  prompt: 'Generate a new image of the product using the style of the second reference',
+  imageInputs: [product, style],
+})
+```
+
+### Provider support
+
+| Provider     | Behavior                                                                                                  |
+| ------------ | --------------------------------------------------------------------------------------------------------- |
+| **OpenAI**   | `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.<br>`dall-e-2` → `images.edit()` with 1 source image only.<br>`dall-e-3` → throws (no edit support). |
+| **Gemini**   | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → inputs become multimodal parts in `contents`. Up to ~14 input images.<br>Imagen models → throws (text-to-image only). |
+| **fal.ai**   | 1 input → `image_url`; multiple → `image_urls`. `role: 'mask'` → `mask_url`. `role: 'control'` → `control_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. |
+| **Grok**     | Throws — the current adapter wraps Grok's OpenAI-compat endpoint, which doesn't expose image inputs. xAI's native Imagine API support is tracked as a follow-up.                                                                                                          |
+| **OpenRouter** | Throws — multimodal injection into the chat-completions pathway is tracked as a follow-up.                                                                                                              |
+| **Anthropic** | n/a — no image generation API.                                                                                                                                                                          |
+
+Adapters that don't support image-conditioned generation throw a clear
+runtime error so calls fail fast rather than silently dropping the inputs.
+
 ## Model Options
 
 ### OpenAI Model Options

diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md
@@ -372,8 +372,76 @@ And returns:
 | `prompt` | `string` | Text description of the video to generate (required) |
 | `size` | `string` | Video resolution in WIDTHxHEIGHT format |
 | `duration` | `number` | Video duration in seconds (maps to `seconds` parameter in API) |
+| `imageInputs?` | `ImagePart[]` | Image conditioning inputs — starting frame, end frame, character / reference images. See [Image-to-Video](#image-to-video) below. |
+| `videoInputs?` | `VideoPart[]` | Video conditioning inputs for video-to-video / source clip flows. Provider support varies. |
+| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs for lipsync / voice cloning flows. Provider support varies. |
 | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) |
 
+## Image-to-Video
+
+`generateVideo()` accepts `imageInputs` for starting-frame, ending-frame,
+and reference-image conditioned video generation:
+
+```typescript
+import { generateVideo, type ImagePart } from '@tanstack/ai'
+import { openaiVideo } from '@tanstack/ai-openai'
+
+const startingFrame: ImagePart = {
+  type: 'image',
+  source: {
+    type: 'data',
+    value: base64Image,
+    mimeType: 'image/png',
+  },
+}
+
+const { jobId } = await generateVideo({
+  adapter: openaiVideo('sora-2'),
+  prompt: 'Animate this still into a slow cinematic push-in with subtle motion',
+  imageInputs: [startingFrame],
+})
+```
+
+### Role hints
+
+Each `ImagePart` can carry an optional `metadata.role` hint that the
+adapter uses to route the input to the provider-specific field:
+
+| Role            | Maps to                                                       |
+| --------------- | ------------------------------------------------------------- |
+| `'start_frame'` | fal `start_image_url` (positional default for the first input) |
+| `'end_frame'`   | fal `end_image_url` (Veo `lastFrame` when available)           |
+| `'reference'`   | fal `reference_image_urls` (Veo `referenceImages`)             |
+| `'character'`   | Same as `'reference'` — character consistency images           |
+
+```typescript
+import { falVideo } from '@tanstack/ai-fal'
+
+await generateVideo({
+  adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'),
+  prompt: 'Slow cinematic push-in then a hard cut',
+  imageInputs: [
+    { type: 'image', source: { type: 'url', value: firstFrameUrl } },
+    {
+      type: 'image',
+      source: { type: 'url', value: lastFrameUrl },
+      metadata: { role: 'end_frame' },
+    },
+  ],
+})
+```
+
+### Provider support
+
+| Provider     | Image-to-Video Behavior                                                                                  |
+| ------------ | -------------------------------------------------------------------------------------------------------- |
+| **OpenAI**   | Sora-2 / Sora-2-Pro → first input goes to `input_reference`. Single image only — throws if more than one. |
+| **fal.ai**   | Single input → `image_url` (start frame). `role: 'end_frame'` → `end_image_url`. `role: 'start_frame'` → `start_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions`. |
+| **Gemini**   | Veo adapter not yet implemented — `imageInputs` will be supported when Veo lands.                         |
+
+Adapters whose underlying API can't accept image inputs throw a clear
+runtime error so calls fail fast.
+
 ### Supported Sizes
 
 Based on [OpenAI API docs](https://platform.openai.com/docs/api-reference/videos/create):

diff --git a/packages/typescript/ai-event-client/src/index.ts b/packages/typescript/ai-event-client/src/index.ts
@@ -428,6 +428,12 @@ export interface ImageRequestStartedEvent extends BaseEventContext {
   prompt: string
   numberOfImages?: number
   size?: string
+  /** Count of image conditioning inputs (image-to-image, mask, reference). */
+  imageInputCount?: number
+  /** Count of video conditioning inputs (video-to-video). */
+  videoInputCount?: number
+  /** Count of audio conditioning inputs (lipsync, voice reference). */
+  audioInputCount?: number
 }
 
 /** Emitted when an image request completes. */

diff --git a/packages/typescript/ai-fal/src/adapters/image.ts b/packages/typescript/ai-fal/src/adapters/image.ts
@@ -2,6 +2,7 @@ import { fal } from '@fal-ai/client'
 import { BaseImageAdapter } from '@tanstack/ai/adapters'
 import { configureFalClient, generateId as utilGenerateId } from '../utils'
 import { mapSizeToFalFormat } from '../image/image-provider-options'
+import { mapImageInputsToFalFields } from '../image/image-inputs'
 import type { OutputType, Result } from '@fal-ai/client'
 import type { FalClientConfig } from '../utils'
 import type {
@@ -63,6 +64,17 @@ export class FalImageAdapter<TModel extends FalModel> extends BaseImageAdapter<
       model: this.model,
     })
 
+    if (options.videoInputs?.length) {
+      throw new Error(
+        `fal.generateImages does not support videoInputs on model ${this.model}.`,
+      )
+    }
+    if (options.audioInputs?.length) {
+      throw new Error(
+        `fal.generateImages does not support audioInputs on model ${this.model}.`,
+      )
+    }
+
     try {
       const input = this.buildInput(options)
       const result = await fal.subscribe(this.model, { input })
@@ -83,9 +95,14 @@ export class FalImageAdapter<TModel extends FalModel> extends BaseImageAdapter<
     >,
   ): FalModelInput<TModel> {
     const sizeParams = mapSizeToFalFormat(options.size)
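+    // Order matters — later spreads win. modelOptions is spread first, then
+    // size, then derived image-input fields, then prompt / num_images, so a
+    // derived mask_url / control_image_url / reference_image_urls replaces the
+    // same key from modelOptions. NOTE(review): if user overrides should win,
+    // spread inputFields before modelOptions instead.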
+    const inputFields = mapImageInputsToFalFields(options.imageInputs)
     const input = {
       ...options.modelOptions,
       ...sizeParams,
+      ...inputFields,
       prompt: options.prompt,
       num_images: options.numberOfImages,
     } as FalModelInput<TModel>

diff --git a/packages/typescript/ai-fal/src/adapters/video.ts b/packages/typescript/ai-fal/src/adapters/video.ts
@@ -2,6 +2,8 @@ import { fal } from '@fal-ai/client'
 import { BaseVideoAdapter } from '@tanstack/ai/adapters'
 import { configureFalClient, generateId as utilGenerateId } from '../utils'
 import { mapVideoSizeToFalFormat } from '../video/video-provider-options'
+import { mapImageInputsToFalVideoFields } from '../image/image-inputs'
+import type { AudioPart, MediaInputMetadata, VideoPart } from '@tanstack/ai'
 import type {
   VideoGenerationOptions,
   VideoJobResult,
@@ -16,6 +18,63 @@ import type {
 } from '../model-meta'
 import type { FalClientConfig } from '../utils'
 
+/**
+ * Map video conditioning inputs onto fal field names.
+ * Video-to-video endpoints on fal almost universally use `video_url`; the
+ * occasional model takes `video_urls` (rare). Mirror the image-input logic
+ * positionally with a `reference` role escape hatch via `reference_video_urls`.
+ *
+ * Routing, as implemented below:
+ * - parts whose `metadata.role` is `'reference'` or `'character'` are collected
+ *   into `reference_video_urls`;
+ * - every other part (no role, or any other role) is treated as a source:
+ *   exactly one source becomes `video_url`, two or more become `video_urls`.
+ *
+ * @param videoInputs - Video parts to map; `undefined` or an empty array
+ *   yields an empty object.
+ * @returns A plain object holding only the fal fields that were populated.
+ */
+function mapVideoInputsToFalFields(
+  videoInputs?: ReadonlyArray<VideoPart<MediaInputMetadata>>,
+): Record<string, unknown> {
+  // Nothing to map: contribute no fields to the request input.
+  if (!videoInputs || videoInputs.length === 0) return {}
+  const references: Array<string> = []
+  const sources: Array<string> = []
+  for (const part of videoInputs) {
+    // URL sources pass through unchanged; inline data becomes a data URI.
+    const url = videoPartToUrl(part)
+    if (
+      part.metadata?.role === 'reference' ||
+      part.metadata?.role === 'character'
+    ) {
+      references.push(url)
+    } else {
+      sources.push(url)
+    }
+  }
+  const out: Record<string, unknown> = {}
+  if (references.length > 0) out.reference_video_urls = references
+  // Singular vs. plural field name depends on how many sources were given.
+  if (sources.length === 1) {
+    out.video_url = sources[0]
+  } else if (sources.length > 1) {
+    out.video_urls = sources
+  }
+  return out
+}
+
+/**
+ * Map audio conditioning inputs onto fal's `audio_url` field.
+ *
+ * @param audioInputs - Audio parts to map; `undefined` or an empty array
+ *   yields an empty object.
+ * @returns `{ audio_url }` for a single input, where the value is the URL
+ *   itself for `url` sources or a `data:<mimeType>;base64,<value>` URI
+ *   otherwise.
+ * @throws Error when more than one audio input is supplied.
+ */
+function mapAudioInputsToFalFields(
+  audioInputs?: ReadonlyArray<AudioPart<MediaInputMetadata>>,
+): Record<string, unknown> {
+  if (!audioInputs || audioInputs.length === 0) return {}
+  // Only a single `audio_url` is emitted, so reject ambiguous multi-input calls.
+  if (audioInputs.length > 1) {
+    throw new Error(
+      `fal: multiple audioInputs are not supported (received ${audioInputs.length}).`,
+    )
+  }
+  // Length is exactly 1 here, so index 0 is present.
+  const part = audioInputs[0]!
+  return {
+    audio_url:
+      part.source.type === 'url'
+        ? part.source.value
+        : `data:${part.source.mimeType};base64,${part.source.value}`,
+  }
+}
+
+/**
+ * Convert a video part's source into a single string for a fal URL field.
+ *
+ * @param part - Video part whose `source` is inspected.
+ * @returns The source value as-is for `url` sources; otherwise a
+ *   `data:<mimeType>;base64,<value>` URI built from the source.
+ */
+function videoPartToUrl(part: VideoPart<MediaInputMetadata>): string {
+  return part.source.type === 'url'
+    ? part.source.value
+    : `data:${part.source.mimeType};base64,${part.source.value}`
+}
+
 type FalQueueStatus = 'IN_QUEUE' | 'IN_PROGRESS' | 'COMPLETED'
 
 interface FalStatusResponse {
@@ -80,7 +139,16 @@ export class FalVideoAdapter<TModel extends FalModel> extends BaseVideoAdapter<
       FalModelVideoSize<TModel>
     >,
   ): Promise<VideoJobResult> {
-    const { prompt, size, duration, modelOptions, logger } = options
+    const {
+      prompt,
+      size,
+      duration,
+      modelOptions,
+      logger,
+      imageInputs,
+      videoInputs,
+      audioInputs,
+    } = options
 
     logger.request(`activity=generateVideo provider=fal model=${this.model}`, {
       provider: 'fal',
@@ -89,10 +157,16 @@ export class FalVideoAdapter<TModel extends FalModel> extends BaseVideoAdapter<
 
     try {
       const sizeParams = mapVideoSizeToFalFormat(size)
+      const inputImageFields = mapImageInputsToFalVideoFields(imageInputs)
+      const videoFields = mapVideoInputsToFalFields(videoInputs)
+      const audioFields = mapAudioInputsToFalFields(audioInputs)
 
       const input = {
         ...modelOptions,
         ...sizeParams,
+        ...inputImageFields,
+        ...videoFields,
+        ...audioFields,
         prompt,
         ...(duration ? { duration } : {}),
       } as FalModelInput<TModel>