diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md
new file mode 100644
index 000000000..3620076c0
--- /dev/null
+++ b/.changeset/image-and-video-inputs.md
@@ -0,0 +1,22 @@
+---
+'@tanstack/ai': minor
+'@tanstack/ai-openai': minor
+'@tanstack/ai-gemini': minor
+'@tanstack/ai-fal': minor
+'@tanstack/ai-grok': patch
+'@tanstack/ai-openrouter': patch
+'@tanstack/ai-event-client': patch
+---
+
+Add `imageInputs`, `videoInputs`, and `audioInputs` to `generateImage()` and `generateVideo()` for image-conditioned generation, image-to-image, multi-reference, image-to-video, and edit / inpaint flows. Each input part may carry a `metadata.role` hint (`'reference' | 'mask' | 'control' | 'start_frame' | 'end_frame' | 'character'`) that adapters use to route to the provider-specific field.
+
+Provider behavior in this release:
+
+- **OpenAI image** — `gpt-image-1` / `gpt-image-1-mini` route to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` throws a clear not-supported error.
+- **OpenAI video** — Sora-2 / Sora-2-Pro accept a single `input_reference` image; passing more than one throws.
+- **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") receive inputs as multimodal parts in `contents`. Imagen throws (text-only).
+- **fal.ai** — Inputs map to fal field names: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Video adapter additionally honors `role: 'start_frame'` / `'end_frame'`.
+- **Grok**, **OpenRouter** — Throw with a link to issue #618 (full support pending dedicated Imagine / multimodal injection work).
+- **Anthropic** — Unchanged (no image generation API).
+
+Closes #618.
diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md
index 6c3c6e115..5ba27fe94 100644
--- a/docs/media/image-generation.md
+++ b/docs/media/image-generation.md
@@ -82,6 +82,9 @@ All image adapters support these common options:
| `prompt` | `string` | Text description of the image to generate (required) |
| `numberOfImages` | `number` | Number of images to generate |
| `size` | `string` | Size of the generated image in WIDTHxHEIGHT format |
+| `imageInputs?` | `ImagePart[]` | Image conditioning inputs for image-to-image, reference-guided, edit, or multi-reference generation. See [Image-Conditioned Generation](#image-conditioned-generation) below. |
+| `videoInputs?` | `VideoPart[]` | Video conditioning inputs. Provider support is limited; most adapters throw. |
+| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs. Provider support is limited; most adapters throw. |
| `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) |
### Size Options
@@ -132,6 +135,114 @@ const result = await generateImage({
})
```
+## Image-Conditioned Generation
+
+`generateImage()` accepts an optional `imageInputs` field for image-to-image,
+reference-guided, multi-reference, and edit / inpaint flows. The field reuses
+the same `ImagePart` shape used elsewhere for multimodal content:
+
+```typescript
+import { generateImage, type ImagePart } from '@tanstack/ai'
+import { openaiImage } from '@tanstack/ai-openai'
+
+const reference: ImagePart = {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/product.png' },
+}
+
+await generateImage({
+ adapter: openaiImage('gpt-image-1'),
+ prompt: 'Turn this into a cinematic product photo',
+ imageInputs: [reference],
+})
+```
+
+### Source format
+
+`ImagePart.source` is a discriminated union supporting both URLs and inline
+base64 data — pass whichever you have:
+
+```typescript
+// URL source
+{ type: 'image', source: { type: 'url', value: 'https://example.com/img.png' } }
+
+// Inline base64 data (mimeType required)
+{ type: 'image', source: { type: 'data', value: base64String, mimeType: 'image/png' } }
+```
+
+OpenAI's edit endpoint requires file uploads; the adapter fetches URL sources
+and converts base64 to a `File` automatically.
+
+### Role hints via `metadata.role`
+
+When a generation has multiple inputs with different roles (mask vs reference
+vs start/end frame), set `metadata.role` on each part. Adapters route by role
+to the provider-specific field; parts without a role fall back to positional
+mapping.
+
+| Role | Maps to |
+| --------------- | -------------------------------------------------------------------------------------- |
+| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional fallback |
+| `'character'` | Same as `'reference'`; Veo `referenceImages` slot |
+| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` |
+| `'control'` | fal `control_image_url` (ControlNet / depth / pose conditioning) |
+| `'start_frame'` | fal `start_image_url`; Veo `image` (used by `generateVideo`) |
+| `'end_frame'` | fal `end_image_url`; Veo `lastFrame` (used by `generateVideo`) |
+
+#### Inpaint / edit with a mask
+
+```typescript
+await generateImage({
+ adapter: openaiImage('gpt-image-1'),
+ prompt: 'Replace the masked region with a tree',
+ imageInputs: [
+ {
+ type: 'image',
+ source: { type: 'url', value: photoUrl },
+ },
+ {
+ type: 'image',
+ source: { type: 'url', value: maskUrl },
+ metadata: { role: 'mask' },
+ },
+ ],
+})
+```
+
+#### Multi-reference composition
+
+```typescript
+const product: ImagePart = {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/product.png' },
+}
+
+const style: ImagePart = {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/style.png' },
+}
+
+await generateImage({
+ adapter: geminiImage('gemini-3.1-flash-image-preview'),
+ prompt: 'Generate a new image of the product using the style of the second reference',
+ imageInputs: [product, style],
+})
+```
+
+### Provider support
+
+| Provider | Behavior |
+| ------------ | --------------------------------------------------------------------------------------------------------- |
+| **OpenAI** | `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.<br>`dall-e-2` → `images.edit()` with 1 source image only.<br>`dall-e-3` → throws (no edit support). |
+| **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → inputs become multimodal parts in `contents`. Up to ~14 input images.<br>Imagen models → throws (text-to-image only). |
+| **fal.ai** | 1 input → `image_url`; multiple → `image_urls`. `role: 'mask'` → `mask_url`. `role: 'control'` → `control_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. |
+| **Grok** | Throws — the current adapter wraps Grok's OpenAI-compat endpoint, which doesn't expose image inputs. xAI's native Imagine API support is tracked as a follow-up. |
+| **OpenRouter** | Throws — multimodal injection into the chat-completions pathway is tracked as a follow-up. |
+| **Anthropic** | n/a — no image generation API. |
+
+Adapters that don't support image-conditioned generation throw a clear
+runtime error so calls fail fast rather than silently dropping the inputs.
+
## Model Options
### OpenAI Model Options
diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md
index b42e88b6b..1872a39be 100644
--- a/docs/media/video-generation.md
+++ b/docs/media/video-generation.md
@@ -372,8 +372,76 @@ And returns:
| `prompt` | `string` | Text description of the video to generate (required) |
| `size` | `string` | Video resolution in WIDTHxHEIGHT format |
| `duration` | `number` | Video duration in seconds (maps to `seconds` parameter in API) |
+| `imageInputs?` | `ImagePart[]` | Image conditioning inputs — starting frame, end frame, character / reference images. See [Image-to-Video](#image-to-video) below. |
+| `videoInputs?` | `VideoPart[]` | Video conditioning inputs for video-to-video / source clip flows. Provider support varies. |
+| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs for lipsync / voice cloning flows. Provider support varies. |
| `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) |
+## Image-to-Video
+
+`generateVideo()` accepts `imageInputs` for starting-frame, ending-frame,
+and reference-image conditioned video generation:
+
+```typescript
+import { generateVideo, type ImagePart } from '@tanstack/ai'
+import { openaiVideo } from '@tanstack/ai-openai'
+
+const startingFrame: ImagePart = {
+ type: 'image',
+ source: {
+ type: 'data',
+ value: base64Image,
+ mimeType: 'image/png',
+ },
+}
+
+const { jobId } = await generateVideo({
+ adapter: openaiVideo('sora-2'),
+ prompt: 'Animate this still into a slow cinematic push-in with subtle motion',
+ imageInputs: [startingFrame],
+})
+```
+
+### Role hints
+
+Each `ImagePart` can carry an optional `metadata.role` hint that the
+adapter uses to route the input to the provider-specific field:
+
+| Role | Maps to |
+| --------------- | ------------------------------------------------------------- |
+| `'start_frame'` | fal `start_image_url` (inputs without a role go to `image_url` / `image_urls` instead) |
+| `'end_frame'` | fal `end_image_url` (Veo `lastFrame` when available) |
+| `'reference'` | fal `reference_image_urls` (Veo `referenceImages`) |
+| `'character'` | Same as `'reference'` — character consistency images |
+
+```typescript
+import { falVideo } from '@tanstack/ai-fal'
+
+await generateVideo({
+ adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'),
+ prompt: 'Slow cinematic push-in then a hard cut',
+ imageInputs: [
+ { type: 'image', source: { type: 'url', value: firstFrameUrl } },
+ {
+ type: 'image',
+ source: { type: 'url', value: lastFrameUrl },
+ metadata: { role: 'end_frame' },
+ },
+ ],
+})
+```
+
+### Provider support
+
+| Provider | Image-to-Video Behavior |
+| ------------ | -------------------------------------------------------------------------------------------------------- |
+| **OpenAI** | Sora-2 / Sora-2-Pro → first input goes to `input_reference`. Single image only — throws if more than one. |
+| **fal.ai** | Single input → `image_url` (start frame). `role: 'end_frame'` → `end_image_url`. `role: 'start_frame'` → `start_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions`. |
+| **Gemini** | Veo adapter not yet implemented — `imageInputs` will be supported when Veo lands. |
+
+Adapters whose underlying API can't accept image inputs throw a clear
+runtime error so calls fail fast.
+
### Supported Sizes
Based on [OpenAI API docs](https://platform.openai.com/docs/api-reference/videos/create):
diff --git a/packages/typescript/ai-event-client/src/index.ts b/packages/typescript/ai-event-client/src/index.ts
index 17fa1d6e0..acfc953c4 100644
--- a/packages/typescript/ai-event-client/src/index.ts
+++ b/packages/typescript/ai-event-client/src/index.ts
@@ -428,6 +428,12 @@ export interface ImageRequestStartedEvent extends BaseEventContext {
prompt: string
numberOfImages?: number
size?: string
+ /** Count of image conditioning inputs (image-to-image, mask, reference). */
+ imageInputCount?: number
+ /** Count of video conditioning inputs (video-to-video). */
+ videoInputCount?: number
+ /** Count of audio conditioning inputs (lipsync, voice reference). */
+ audioInputCount?: number
}
/** Emitted when an image request completes. */
diff --git a/packages/typescript/ai-fal/src/adapters/image.ts b/packages/typescript/ai-fal/src/adapters/image.ts
index 11dcbaeb2..7fbb14a08 100644
--- a/packages/typescript/ai-fal/src/adapters/image.ts
+++ b/packages/typescript/ai-fal/src/adapters/image.ts
@@ -2,6 +2,7 @@ import { fal } from '@fal-ai/client'
import { BaseImageAdapter } from '@tanstack/ai/adapters'
import { configureFalClient, generateId as utilGenerateId } from '../utils'
import { mapSizeToFalFormat } from '../image/image-provider-options'
+import { mapImageInputsToFalFields } from '../image/image-inputs'
import type { OutputType, Result } from '@fal-ai/client'
import type { FalClientConfig } from '../utils'
import type {
@@ -63,6 +64,17 @@ export class FalImageAdapter extends BaseImageAdapter<
model: this.model,
})
+ if (options.videoInputs?.length) {
+ throw new Error(
+ `fal.generateImages does not support videoInputs on model ${this.model}.`,
+ )
+ }
+ if (options.audioInputs?.length) {
+ throw new Error(
+ `fal.generateImages does not support audioInputs on model ${this.model}.`,
+ )
+ }
+
try {
const input = this.buildInput(options)
const result = await fal.subscribe(this.model, { input })
@@ -83,9 +95,14 @@ export class FalImageAdapter extends BaseImageAdapter<
>,
): FalModelInput {
const sizeParams = mapSizeToFalFormat(options.size)
+ // Order matters: in an object spread the LAST write to a key wins, so
+ // modelOptions (spread first) is overridden by size, then by the derived
+ // image-input fields (mask_url / control_image_url / ...), then prompt / num_images.
+ const inputFields = mapImageInputsToFalFields(options.imageInputs)
const input = {
...options.modelOptions,
...sizeParams,
+ ...inputFields,
prompt: options.prompt,
num_images: options.numberOfImages,
} as FalModelInput
diff --git a/packages/typescript/ai-fal/src/adapters/video.ts b/packages/typescript/ai-fal/src/adapters/video.ts
index 662b4f60f..6f3dbb1ef 100644
--- a/packages/typescript/ai-fal/src/adapters/video.ts
+++ b/packages/typescript/ai-fal/src/adapters/video.ts
@@ -2,6 +2,8 @@ import { fal } from '@fal-ai/client'
import { BaseVideoAdapter } from '@tanstack/ai/adapters'
import { configureFalClient, generateId as utilGenerateId } from '../utils'
import { mapVideoSizeToFalFormat } from '../video/video-provider-options'
+import { mapImageInputsToFalVideoFields } from '../image/image-inputs'
+import type { AudioPart, MediaInputMetadata, VideoPart } from '@tanstack/ai'
import type {
VideoGenerationOptions,
VideoJobResult,
@@ -16,6 +18,63 @@ import type {
} from '../model-meta'
import type { FalClientConfig } from '../utils'
+/**
+ * Map video conditioning inputs onto fal field names.
+ * Video-to-video endpoints on fal almost universally use `video_url`; the
+ * occasional model takes `video_urls` (rare). Mirrors the image-input logic:
+ *
+ * - parts whose `metadata.role` is `'reference'` or `'character'` are
+ *   collected into `reference_video_urls` (always an array)
+ * - every other part (no role, or any other role) counts as a source:
+ *   exactly 1 → `video_url`, more than 1 → `video_urls`
+ *
+ * Each part is converted with `videoPartToUrl`, so inline data is sent as a
+ * `data:` URI. Returns `{}` when `videoInputs` is missing or empty.
+ */
+function mapVideoInputsToFalFields(
+ videoInputs?: ReadonlyArray>,
+): Record {
+ if (!videoInputs || videoInputs.length === 0) return {}
+ const references: Array = []
+ const sources: Array = []
+ for (const part of videoInputs) {
+ const url = videoPartToUrl(part)
+ if (
+ part.metadata?.role === 'reference' ||
+ part.metadata?.role === 'character'
+ ) {
+ references.push(url)
+ } else {
+ sources.push(url)
+ }
+ }
+ const out: Record = {}
+ if (references.length > 0) out.reference_video_urls = references
+ if (sources.length === 1) {
+ out.video_url = sources[0]
+ } else if (sources.length > 1) {
+ out.video_urls = sources
+ }
+ return out
+}
+
+function mapAudioInputsToFalFields( // maps a single audio input onto fal's `audio_url` field
+ audioInputs?: ReadonlyArray>,
+): Record {
+ if (!audioInputs || audioInputs.length === 0) return {} // nothing to map
+ if (audioInputs.length > 1) {
+ throw new Error(
+ `fal: multiple audioInputs are not supported (received ${audioInputs.length}).`,
+ )
+ }
+ const part = audioInputs[0]! // exactly one element: the empty and >1 cases were handled above
+ return {
+ audio_url:
+ part.source.type === 'url' // URL sources pass through; inline data becomes a data: URI
+ ? part.source.value
+ : `data:${part.source.mimeType};base64,${part.source.value}`,
+ }
+}
+
+function videoPartToUrl(part: VideoPart): string { // URL sources pass through; inline data becomes a `data:<mimeType>;base64,<value>` URI
+ return part.source.type === 'url'
+ ? part.source.value
+ : `data:${part.source.mimeType};base64,${part.source.value}`
+}
+
type FalQueueStatus = 'IN_QUEUE' | 'IN_PROGRESS' | 'COMPLETED'
interface FalStatusResponse {
@@ -80,7 +139,16 @@ export class FalVideoAdapter extends BaseVideoAdapter<
FalModelVideoSize
>,
): Promise {
- const { prompt, size, duration, modelOptions, logger } = options
+ const {
+ prompt,
+ size,
+ duration,
+ modelOptions,
+ logger,
+ imageInputs,
+ videoInputs,
+ audioInputs,
+ } = options
logger.request(`activity=generateVideo provider=fal model=${this.model}`, {
provider: 'fal',
@@ -89,10 +157,16 @@ export class FalVideoAdapter extends BaseVideoAdapter<
try {
const sizeParams = mapVideoSizeToFalFormat(size)
+ const inputImageFields = mapImageInputsToFalVideoFields(imageInputs)
+ const videoFields = mapVideoInputsToFalFields(videoInputs)
+ const audioFields = mapAudioInputsToFalFields(audioInputs)
const input = {
...modelOptions,
...sizeParams,
+ ...inputImageFields,
+ ...videoFields,
+ ...audioFields,
prompt,
...(duration ? { duration } : {}),
} as FalModelInput
diff --git a/packages/typescript/ai-fal/src/image/image-inputs.ts b/packages/typescript/ai-fal/src/image/image-inputs.ts
new file mode 100644
index 000000000..0a5a06ca3
--- /dev/null
+++ b/packages/typescript/ai-fal/src/image/image-inputs.ts
@@ -0,0 +1,165 @@
+import type { ImagePart, MediaInputMetadata } from '@tanstack/ai'
+
+/**
+ * Map TanStack `imageInputs` onto fal.ai endpoint fields.
+ *
+ * fal endpoints use different field names for image-conditioned generation
+ * (~80% use `image_url` for single; the rest use `image_urls`,
+ * `reference_image_urls`, `mask_url`, `control_image_url`, etc.). Without
+ * per-endpoint metadata we apply this heuristic:
+ *
+ * - parts with `metadata.role === 'mask'` → `mask_url` (single)
+ * - parts with `metadata.role === 'control'` → `control_image_url` (single)
+ * - parts with `metadata.role === 'reference'` → `reference_image_urls` (array)
+ * - parts with `metadata.role === 'character'` → `reference_image_urls` (array)
+ * - remaining parts (no role, `'start_frame'` / `'end_frame'`, or any other role):
+ *   - exactly 1 part → `image_url`
+ *   - >1 parts → `image_urls`
+ *
+ * Precedence: the fal image adapter spreads `modelOptions` BEFORE the fields
+ * returned here, so a derived field replaces a same-named `modelOptions` key
+ * (not the other way round). When the heuristic doesn't match an endpoint,
+ * omit `imageInputs` and pass the endpoint's fields through `modelOptions`
+ * directly.
+ *
+ * This mapping is interim and will be replaced by a per-endpoint mapping
+ * sourced from the `@fal-ai/schemas` library once it lands.
+ *
+ * @param imageInputs - Image parts to map; data sources are sent as data URIs.
+ * @returns The derived fal fields, or `{}` when `imageInputs` is missing or empty.
+ * @throws Error when more than one part has role `'mask'`, or more than one
+ *   has role `'control'`.
+ */
+export function mapImageInputsToFalFields(
+ imageInputs?: ReadonlyArray>,
+): Record {
+ if (!imageInputs || imageInputs.length === 0) return {}
+
+ const fields: Record = {}
+
+ const masks: Array = []
+ const controls: Array = []
+ const references: Array = []
+ const sources: Array = []
+
+ for (const part of imageInputs) {
+ const url = imagePartToUrl(part)
+ const role = part.metadata?.role
+ switch (role) {
+ case 'mask':
+ masks.push(url)
+ break
+ case 'control':
+ controls.push(url)
+ break
+ case 'reference':
+ case 'character':
+ references.push(url)
+ break
+ case 'start_frame':
+ case 'end_frame':
+ // Frame roles aren't meaningful for image generation; treat as the
+ // primary source. Video adapter handles start/end framing.
+ sources.push(url)
+ break
+ default:
+ sources.push(url)
+ }
+ }
+
+ if (masks.length > 1) {
+ throw new Error(
+ `fal: only one input with metadata.role === 'mask' is supported per request (received ${masks.length}).`,
+ )
+ }
+ if (controls.length > 1) {
+ throw new Error(
+ `fal: only one input with metadata.role === 'control' is supported per request (received ${controls.length}).`,
+ )
+ }
+
+ if (masks[0]) fields.mask_url = masks[0]
+ if (controls[0]) fields.control_image_url = controls[0]
+ if (references.length > 0) fields.reference_image_urls = references
+
+ if (sources.length === 1) {
+ fields.image_url = sources[0]
+ } else if (sources.length > 1) {
+ fields.image_urls = sources
+ }
+
+ return fields
+}
+
+/**
+ * Map TanStack `imageInputs` onto fal.ai video-endpoint fields.
+ *
+ * Video endpoints often expose a start frame as `image_url` (76% of i2v
+ * models) plus an optional `end_image_url`. Multi-reference video models
+ * (Kling O3, Seedance reference-to-video) use `reference_image_urls` or
+ * `image_urls`. Mapping:
+ *
+ * - `metadata.role === 'start_frame'` → `start_image_url`
+ * - `metadata.role === 'end_frame'` → `end_image_url`
+ * - `metadata.role === 'reference' | 'character'` → `reference_image_urls`
+ * - remaining parts (no role, or any other role — `'mask'` and `'control'`
+ *   are not special-cased here):
+ *   - exactly 1 part → `image_url`
+ *   - >1 parts → `image_urls`
+ *
+ * @param imageInputs - Image parts to map; data sources are sent as data URIs.
+ * @returns The derived fal fields, or `{}` when `imageInputs` is missing or empty.
+ * @throws Error when more than one part has role `'start_frame'`, or more
+ *   than one has role `'end_frame'`.
+ */
+export function mapImageInputsToFalVideoFields(
+ imageInputs?: ReadonlyArray>,
+): Record {
+ if (!imageInputs || imageInputs.length === 0) return {}
+
+ const fields: Record = {}
+
+ const startFrames: Array = []
+ const endFrames: Array = []
+ const references: Array = []
+ const sources: Array = []
+
+ for (const part of imageInputs) {
+ const url = imagePartToUrl(part)
+ const role = part.metadata?.role
+ switch (role) {
+ case 'start_frame':
+ startFrames.push(url)
+ break
+ case 'end_frame':
+ endFrames.push(url)
+ break
+ case 'reference':
+ case 'character':
+ references.push(url)
+ break
+ default:
+ sources.push(url)
+ }
+ }
+
+ if (startFrames.length > 1) {
+ throw new Error(
+ `fal: only one input with metadata.role === 'start_frame' is supported (received ${startFrames.length}).`,
+ )
+ }
+ if (endFrames.length > 1) {
+ throw new Error(
+ `fal: only one input with metadata.role === 'end_frame' is supported (received ${endFrames.length}).`,
+ )
+ }
+
+ if (startFrames[0]) fields.start_image_url = startFrames[0]
+ if (endFrames[0]) fields.end_image_url = endFrames[0]
+ if (references.length > 0) fields.reference_image_urls = references
+
+ if (sources.length === 1) {
+ fields.image_url = sources[0]
+ } else if (sources.length > 1) {
+ fields.image_urls = sources
+ }
+
+ return fields
+}
+
+/**
+ * Convert a TanStack ImagePart into a string suitable for fal's URL-based
+ * input fields. URL sources pass through unchanged; data sources are emitted
+ * as a `data:<mimeType>;base64,<value>` URI, which fal endpoints accept on
+ * the wire.
+ */
+function imagePartToUrl(part: ImagePart): string {
+ if (part.source.type === 'url') return part.source.value
+ return `data:${part.source.mimeType};base64,${part.source.value}`
+}
diff --git a/packages/typescript/ai-fal/tests/image-inputs.test.ts b/packages/typescript/ai-fal/tests/image-inputs.test.ts
new file mode 100644
index 000000000..0ed534080
--- /dev/null
+++ b/packages/typescript/ai-fal/tests/image-inputs.test.ts
@@ -0,0 +1,140 @@
+import { describe, expect, it } from 'vitest'
+import {
+ mapImageInputsToFalFields,
+ mapImageInputsToFalVideoFields,
+} from '../src/image/image-inputs'
+import type { ImagePart, MediaInputMetadata } from '@tanstack/ai'
+
+function urlPart( // test helper: builds a URL-sourced ImagePart
+ value: string,
+ metadata?: MediaInputMetadata,
+): ImagePart {
+ return {
+ type: 'image',
+ source: { type: 'url', value },
+ ...(metadata && { metadata }), // only attach `metadata` when one was passed
+ }
+}
+
+describe('mapImageInputsToFalFields', () => { // image mapping: positional sources plus mask / control / reference roles
+ it('returns an empty object when imageInputs is missing or empty', () => {
+ expect(mapImageInputsToFalFields(undefined)).toEqual({})
+ expect(mapImageInputsToFalFields([])).toEqual({})
+ })
+
+ it('routes a single source to image_url', () => {
+ expect(
+ mapImageInputsToFalFields([urlPart('https://example.com/a.png')]),
+ ).toEqual({ image_url: 'https://example.com/a.png' })
+ })
+
+ it('routes multiple sources to image_urls', () => {
+ expect(
+ mapImageInputsToFalFields([
+ urlPart('https://example.com/a.png'),
+ urlPart('https://example.com/b.png'),
+ ]),
+ ).toEqual({
+ image_urls: ['https://example.com/a.png', 'https://example.com/b.png'],
+ })
+ })
+
+ it('routes role=mask to mask_url alongside the source image_url', () => {
+ expect(
+ mapImageInputsToFalFields([
+ urlPart('https://example.com/img.png'),
+ urlPart('https://example.com/mask.png', { role: 'mask' }),
+ ]),
+ ).toEqual({
+ image_url: 'https://example.com/img.png',
+ mask_url: 'https://example.com/mask.png',
+ })
+ })
+
+ it('routes role=reference to reference_image_urls', () => { // also covers role=character, which shares the array
+ expect(
+ mapImageInputsToFalFields([
+ urlPart('https://example.com/product.png'),
+ urlPart('https://example.com/style.png', { role: 'reference' }),
+ urlPart('https://example.com/character.png', { role: 'character' }),
+ ]),
+ ).toEqual({
+ image_url: 'https://example.com/product.png',
+ reference_image_urls: [
+ 'https://example.com/style.png',
+ 'https://example.com/character.png',
+ ],
+ })
+ })
+
+ it('routes role=control to control_image_url', () => {
+ expect(
+ mapImageInputsToFalFields([
+ urlPart('https://example.com/img.png'),
+ urlPart('https://example.com/depth.png', { role: 'control' }),
+ ]),
+ ).toEqual({
+ image_url: 'https://example.com/img.png',
+ control_image_url: 'https://example.com/depth.png',
+ })
+ })
+
+ it('encodes data sources as data URIs', () => {
+ expect(
+ mapImageInputsToFalFields([
+ {
+ type: 'image',
+ source: { type: 'data', value: 'aGVsbG8=', mimeType: 'image/png' },
+ },
+ ]),
+ ).toEqual({ image_url: 'data:image/png;base64,aGVsbG8=' })
+ })
+
+ it('throws when more than one mask is provided', () => { // NOTE(review): the matching "more than one control" throw has no test here
+ expect(() =>
+ mapImageInputsToFalFields([
+ urlPart('https://example.com/m1.png', { role: 'mask' }),
+ urlPart('https://example.com/m2.png', { role: 'mask' }),
+ ]),
+ ).toThrow(/only one input with metadata.role === 'mask'/)
+ })
+})
+
+describe('mapImageInputsToFalVideoFields', () => { // video mapping: start / end frames, references, positional sources
+ it('returns empty for missing/empty inputs', () => {
+ expect(mapImageInputsToFalVideoFields(undefined)).toEqual({})
+ expect(mapImageInputsToFalVideoFields([])).toEqual({})
+ })
+
+ it('routes a single positional source to image_url (start frame)', () => {
+ expect(
+ mapImageInputsToFalVideoFields([
+ urlPart('https://example.com/start.png'),
+ ]),
+ ).toEqual({ image_url: 'https://example.com/start.png' })
+ })
+
+ it('routes role=start_frame to start_image_url and role=end_frame to end_image_url', () => {
+ expect(
+ mapImageInputsToFalVideoFields([
+ urlPart('https://example.com/a.png', { role: 'start_frame' }),
+ urlPart('https://example.com/z.png', { role: 'end_frame' }),
+ ]),
+ ).toEqual({
+ start_image_url: 'https://example.com/a.png',
+ end_image_url: 'https://example.com/z.png',
+ })
+ })
+
+ it('routes role=reference to reference_image_urls', () => { // NOTE(review): the duplicate start_frame / end_frame throws have no test here
+ expect(
+ mapImageInputsToFalVideoFields([
+ urlPart('https://example.com/start.png'),
+ urlPart('https://example.com/character.png', { role: 'reference' }),
+ ]),
+ ).toEqual({
+ image_url: 'https://example.com/start.png',
+ reference_image_urls: ['https://example.com/character.png'],
+ })
+ })
+})
diff --git a/packages/typescript/ai-gemini/src/adapters/image.ts b/packages/typescript/ai-gemini/src/adapters/image.ts
index 612385e16..90063cfe3 100644
--- a/packages/typescript/ai-gemini/src/adapters/image.ts
+++ b/packages/typescript/ai-gemini/src/adapters/image.ts
@@ -1,4 +1,5 @@
import { BaseImageAdapter } from '@tanstack/ai/adapters'
+import { arrayBufferToBase64 } from '@tanstack/ai-utils'
import {
createGeminiClient,
generateId,
@@ -21,13 +22,17 @@ import type {
GeneratedImage,
ImageGenerationOptions,
ImageGenerationResult,
+ ImagePart,
+ MediaInputMetadata,
} from '@tanstack/ai'
import type {
+ Content,
GenerateContentConfig,
GenerateContentResponse,
GenerateImagesConfig,
GenerateImagesResponse,
GoogleGenAI,
+ Part,
} from '@google/genai'
import type { GeminiClientConfig } from '../utils'
@@ -94,10 +99,29 @@ export class GeminiImageAdapter<
try {
validatePrompt({ prompt, model })
+ if (options.videoInputs?.length) {
+ throw new Error(
+ `${this.name}.generateImages does not support videoInputs (model: ${model}).`,
+ )
+ }
+ if (options.audioInputs?.length) {
+ throw new Error(
+ `${this.name}.generateImages does not support audioInputs (model: ${model}).`,
+ )
+ }
+
if (this.isGeminiImageModel(model)) {
return await this.generateWithGeminiApi(options)
}
+ // Imagen does not accept image inputs — it's strictly text-to-image.
+ if (options.imageInputs?.length) {
+ throw new Error(
+ `${this.name}: model "${model}" (Imagen) does not support imageInputs. ` +
+ `Use a Gemini-native image model (e.g. gemini-2.5-flash-image, "nano-banana") for image-conditioned generation.`,
+ )
+ }
+
// Imagen models path (generateImages API)
validateImageSize(model, options.size)
validateNumberOfImages(model, options.numberOfImages)
@@ -127,7 +151,8 @@ export class GeminiImageAdapter<
private async generateWithGeminiApi(
options: ImageGenerationOptions,
): Promise {
- const { model, prompt, size, numberOfImages, modelOptions } = options
+ const { model, prompt, size, numberOfImages, modelOptions, imageInputs } =
+ options
const parsedSize = size ? parseNativeImageSize(size) : undefined
@@ -169,15 +194,81 @@ export class GeminiImageAdapter<
}),
}
+ const contents = await this.buildContents(augmentedPrompt, imageInputs)
+
const response = await this.client.models.generateContent({
model,
- contents: augmentedPrompt,
+ contents,
config,
})
return this.transformGeminiResponse(model, response)
}
+ /**
+ * Build the `contents` payload passed to `generateContent`.
+ *
+ * When `imageInputs` is missing or empty the prompt string is returned
+ * unchanged. Otherwise the result is a one-element array holding a single
+ * `role: 'user'` content whose `parts` are every converted image part (in
+ * input order) followed by the text prompt last (Gemini conventionally
+ * treats the trailing text as the instruction).
+ *
+ * Inputs are converted concurrently with `Promise.all`, so a failure while
+ * converting any one input rejects the whole call.
+ */
+ private async buildContents(
+ prompt: string,
+ imageInputs?: ReadonlyArray>,
+ ): Promise> {
+ if (!imageInputs || imageInputs.length === 0) {
+ return prompt
+ }
+ const imageParts: Array = await Promise.all(
+ imageInputs.map((part) => this.imagePartToGeminiPart(part)),
+ )
+ const parts: Array = [...imageParts, { text: prompt }]
+ return [{ role: 'user', parts }]
+ }
+
+ private async imagePartToGeminiPart( // converts one ImagePart into a Gemini part (`inlineData` or `fileData`)
+ part: ImagePart,
+ ): Promise {
+ if (part.source.type === 'data') {
+ return {
+ inlineData: {
+ mimeType: part.source.mimeType || 'image/png', // falls back to PNG when no mimeType is set
+ data: part.source.value,
+ },
+ }
+ }
+ // URL sources: `gs://` URIs and generativelanguage.googleapis.com URLs are
+ // passed through as `fileData`; any other URL is fetched and inlined as base64.
+ if (
+ part.source.value.startsWith('gs://') ||
+ /^https?:\/\/generativelanguage\.googleapis\.com\//.test(
+ part.source.value,
+ )
+ ) {
+ return {
+ fileData: {
+ fileUri: part.source.value,
+ ...(part.source.mimeType && { mimeType: part.source.mimeType }), // mimeType is only sent when the caller supplied one
+ },
+ }
+ }
+ const response = await fetch(part.source.value) // network round-trip; non-OK responses throw below
+ if (!response.ok) {
+ throw new Error(
+ `Failed to fetch image input (${response.status} ${response.statusText}): ${part.source.value}`,
+ )
+ }
+ const blob = await response.blob()
+ const buffer = await blob.arrayBuffer()
+ const base64 = arrayBufferToBase64(buffer)
+ return {
+ inlineData: {
+ mimeType: part.source.mimeType || blob.type || 'image/png', // caller's mimeType, then the fetched blob's type, then PNG
+ data: base64,
+ },
+ }
+ }
+
private transformGeminiResponse(
model: string,
response: GenerateContentResponse,
diff --git a/packages/typescript/ai-grok/src/adapters/image.ts b/packages/typescript/ai-grok/src/adapters/image.ts
index 35f8cd224..081ed610c 100644
--- a/packages/typescript/ai-grok/src/adapters/image.ts
+++ b/packages/typescript/ai-grok/src/adapters/image.ts
@@ -61,6 +61,18 @@ export class GrokImageAdapter<
): Promise {
const { model, prompt, numberOfImages, size, modelOptions } = options
+ if (
+ options.imageInputs?.length ||
+ options.videoInputs?.length ||
+ options.audioInputs?.length
+ ) {
+ throw new Error(
+ `grok.generateImages does not yet support imageInputs / videoInputs / audioInputs. ` +
+ `Image-conditioned generation requires the xAI Imagine API, which the current adapter ` +
+ `does not target (it uses the OpenAI-compat endpoint). Track progress at https://github.com/TanStack/ai/issues/618.`,
+ )
+ }
+
validatePrompt({ prompt, model })
validateImageSize(model, size)
validateNumberOfImages(model, numberOfImages)
diff --git a/packages/typescript/ai-openai/src/adapters/image.ts b/packages/typescript/ai-openai/src/adapters/image.ts
index 1380c7a26..334001832 100644
--- a/packages/typescript/ai-openai/src/adapters/image.ts
+++ b/packages/typescript/ai-openai/src/adapters/image.ts
@@ -3,6 +3,7 @@ import { BaseImageAdapter } from '@tanstack/ai/adapters'
import { toRunErrorPayload } from '@tanstack/ai/adapter-internals'
import { generateId } from '@tanstack/ai-utils'
import { getOpenAIApiKeyFromEnv } from '../utils/client'
+import { imagePartToFile } from '../image/image-input-to-file'
import {
validateImageSize,
validateNumberOfImages,
@@ -12,6 +13,8 @@ import type {
GeneratedImage,
ImageGenerationOptions,
ImageGenerationResult,
+ ImagePart,
+ MediaInputMetadata,
} from '@tanstack/ai'
import type OpenAI_SDK from 'openai'
import type { OpenAIImageModel } from '../model-meta'
@@ -22,6 +25,15 @@ import type {
} from '../image/image-provider-options'
import type { OpenAIClientConfig } from '../utils/client'
+// Per OpenAI docs: dall-e-2 accepts 1 image to `images.edit()`; gpt-image-1
+// and gpt-image-1-mini accept up to 16; dall-e-3 does not support edit at all.
+const EDIT_MAX_IMAGES: Record = {
+ 'dall-e-2': 1,
+ 'gpt-image-1': 16,
+ 'gpt-image-1-mini': 16,
+ 'dall-e-3': 0, // 0 is the "unsupported" sentinel: editImages throws before building a request
+}
+
/**
* Configuration for OpenAI image adapter
*/
@@ -59,12 +71,44 @@ export class OpenAIImageAdapter<
async generateImages(
options: ImageGenerationOptions,
): Promise {
- const { model, prompt, numberOfImages, size, modelOptions } = options
+ const {
+ model,
+ prompt,
+ numberOfImages,
+ size,
+ modelOptions,
+ imageInputs,
+ videoInputs,
+ audioInputs,
+ } = options
validatePrompt({ prompt, model })
validateImageSize(model, size)
validateNumberOfImages(model, numberOfImages)
+ if (videoInputs?.length) {
+ throw new Error(
+ `${this.name}.generateImages does not support videoInputs (model: ${model}).`,
+ )
+ }
+ if (audioInputs?.length) {
+ throw new Error(
+ `${this.name}.generateImages does not support audioInputs (model: ${model}).`,
+ )
+ }
+
+ if (imageInputs && imageInputs.length > 0) {
+ return this.editImages({
+ model: model as OpenAIImageModel,
+ prompt,
+ numberOfImages,
+ size,
+ modelOptions,
+ imageInputs,
+ logger: options.logger,
+ })
+ }
+
// With exactOptionalPropertyTypes, vendor SDK request shapes reject
// `T | undefined` in optional fields. Build the request incrementally and
// only set `size` when it's actually defined.
@@ -138,6 +182,129 @@ export class OpenAIImageAdapter<
throw error
}
}
+
+ /**
+ * Image-conditioned generation via OpenAI's `images.edit()` endpoint.
+ * dall-e-2 accepts 1 input image; gpt-image-1 / gpt-image-1-mini accept up
+ * to 16; dall-e-3 rejects entirely. A part with `metadata.role === 'mask'`
+ * is routed to the SDK's `mask` field (PNG with alpha channel).
+ *
+ * Steps, in order:
+ * 1. Throw if the model's `EDIT_MAX_IMAGES` entry is 0.
+ * 2. Split `imageInputs` into mask parts (`metadata.role === 'mask'`) and
+ *    source parts (every other part, with or without a role). Throw when
+ *    there is more than one mask, no source part, or more source parts than
+ *    the model's limit.
+ * 3. Convert every part to a `File` with `imagePartToFile`. This happens
+ *    before the `try` block, so a failed conversion / fetch propagates to
+ *    the caller without being reported through `logger.errors`.
+ * 4. Build the request. `modelOptions` is spread after `model`, `prompt`,
+ *    `image`, `n` and `stream`, so matching keys in `modelOptions` win over
+ *    those values; `size` and `mask` are assigned afterwards and therefore
+ *    win over `modelOptions`.
+ * 5. Call `client.images.edit()` and map each response item to `b64Json`
+ *    (preferred) or `url`; `usage` is included only when the response
+ *    carries it.
+ *
+ * Errors thrown inside the `try` block are logged via `logger.errors` and
+ * rethrown unchanged.
+ */
+ private async editImages(args: {
+ model: OpenAIImageModel
+ prompt: string
+ numberOfImages?: number
+ size?: string
+ modelOptions?: OpenAIImageProviderOptions
+ imageInputs: ReadonlyArray>
+ logger: ImageGenerationOptions['logger']
+ }): Promise {
+ const { model, prompt, numberOfImages, size, modelOptions, logger } = args
+ const maxImages = EDIT_MAX_IMAGES[model]
+ if (maxImages === 0) {
+ throw new Error(
+ `${this.name}: model "${model}" does not support imageInputs. ` +
+ `Use gpt-image-1, gpt-image-1-mini, or dall-e-2 for image-conditioned generation.`,
+ )
+ }
+
+ const maskParts = args.imageInputs.filter(
+ (part) => part.metadata?.role === 'mask',
+ )
+ const sourceParts = args.imageInputs.filter(
+ (part) => part.metadata?.role !== 'mask',
+ )
+
+ if (maskParts.length > 1) {
+ throw new Error(
+ `${this.name}: only one input with metadata.role === 'mask' is supported per request.`,
+ )
+ }
+ if (sourceParts.length === 0) {
+ throw new Error(
+ `${this.name}: imageInputs contained only mask parts; at least one source image is required.`,
+ )
+ }
+ if (sourceParts.length > maxImages) {
+ throw new Error(
+ `${this.name}: model "${model}" accepts at most ${maxImages} source image(s); received ${sourceParts.length}.`,
+ )
+ }
+
+ const sourceFiles = await Promise.all(
+ sourceParts.map((part, i) => imagePartToFile(part, `source-${i}`)),
+ )
+ const maskFile = maskParts[0]
+ ? await imagePartToFile(maskParts[0], 'mask')
+ : undefined
+
+ // `modelOptions` is typed across all four image models (including dall-e-3's
+ // `quality: 'hd' | 'standard'` which isn't valid for edit). dall-e-3 has
+ // already been rejected above, so any remaining quality value is valid for
+ // the edit endpoint — cast the spread to clear the union mismatch.
+ const request: OpenAI_SDK.Images.ImageEditParamsNonStreaming = {
+ model,
+ prompt,
+ image: sourceFiles.length === 1 ? sourceFiles[0]! : sourceFiles, // bare File for one source, array otherwise
+ n: numberOfImages ?? 1,
+ stream: false,
+ ...((modelOptions ??
+ {}) as Partial),
+ }
+ if (size !== undefined) {
+ request.size = size as Exclude<
+ OpenAI_SDK.Images.ImageEditParamsNonStreaming['size'],
+ undefined
+ >
+ }
+ if (maskFile) {
+ request.mask = maskFile
+ }
+
+ try {
+ logger.request(
+ `activity=imageEdit provider=${this.name} model=${model} n=${request.n ?? 1} size=${request.size ?? 'default'} sources=${sourceFiles.length}${maskFile ? ' mask' : ''}`,
+ { provider: this.name, model },
+ )
+ const response = await this.client.images.edit(request)
+
+ const images: Array = (response.data ?? []).flatMap(
+ (item): Array => {
+ const revisedPromptField =
+ item.revised_prompt !== undefined
+ ? { revisedPrompt: item.revised_prompt }
+ : {}
+ if (item.b64_json) {
+ return [{ b64Json: item.b64_json, ...revisedPromptField }]
+ }
+ if (item.url) {
+ return [{ url: item.url, ...revisedPromptField }]
+ }
+ return [] // item has neither b64_json nor url: dropped from the result
+ },
+ )
+
+ return {
+ id: generateId(this.name),
+ model,
+ images,
+ ...(response.usage
+ ? {
+ usage: {
+ inputTokens: response.usage.input_tokens,
+ outputTokens: response.usage.output_tokens,
+ totalTokens: response.usage.total_tokens,
+ },
+ }
+ : {}),
+ }
+ } catch (error: unknown) {
+ logger.errors(`${this.name}.editImages fatal`, {
+ error: toRunErrorPayload(error, `${this.name}.editImages failed`),
+ source: `${this.name}.editImages`,
+ })
+ throw error
+ }
+ }
}
/**
diff --git a/packages/typescript/ai-openai/src/adapters/video.ts b/packages/typescript/ai-openai/src/adapters/video.ts
index 2bb9df046..8a1bc9c1b 100644
--- a/packages/typescript/ai-openai/src/adapters/video.ts
+++ b/packages/typescript/ai-openai/src/adapters/video.ts
@@ -3,6 +3,7 @@ import { BaseVideoAdapter } from '@tanstack/ai/adapters'
import { toRunErrorPayload } from '@tanstack/ai/adapter-internals'
import { arrayBufferToBase64 } from '@tanstack/ai-utils'
import { getOpenAIApiKeyFromEnv } from '../utils/client'
+import { imagePartToFile } from '../image/image-input-to-file'
import {
toApiSeconds,
validateVideoSeconds,
@@ -88,15 +89,38 @@ export class OpenAIVideoAdapter<
options: VideoGenerationOptions,
): Promise {
const { model, size, duration, modelOptions } = options
+ const { imageInputs, videoInputs, audioInputs } = options
validateVideoSize(model, size)
const seconds = duration ?? modelOptions?.seconds
validateVideoSeconds(model, seconds)
+ if (videoInputs?.length) {
+ throw new Error(
+ `${this.name}.createVideoJob does not support videoInputs (model: ${model}).`,
+ )
+ }
+ if (audioInputs?.length) {
+ throw new Error(
+ `${this.name}.createVideoJob does not support audioInputs (model: ${model}).`,
+ )
+ }
+ if (imageInputs && imageInputs.length > 1) {
+ throw new Error(
+ `${this.name}: Sora accepts at most one input_reference image; received ${imageInputs.length}.`,
+ )
+ }
+
const request: OpenAI_SDK.Videos.VideoCreateParams = {
model: model as VideoModel,
prompt: options.prompt,
}
+ if (imageInputs && imageInputs[0]) {
+ // Sora's `input_reference` is a single Uploadable; convert TanStack
+ // ImagePart (URL or base64) → File before handing it to the SDK.
+ const file = await imagePartToFile(imageInputs[0], 'input-reference')
+ ;(request as { input_reference?: unknown }).input_reference = file
+ }
// `VideoCreateParams.size` is `size?: VideoSize` (no `| undefined`), so we
// narrow before assignment instead of casting from a `T | undefined` source.
if (size) {
diff --git a/packages/typescript/ai-openai/src/image/image-input-to-file.ts b/packages/typescript/ai-openai/src/image/image-input-to-file.ts
new file mode 100644
index 000000000..2074496fd
--- /dev/null
+++ b/packages/typescript/ai-openai/src/image/image-input-to-file.ts
@@ -0,0 +1,70 @@
+import { base64ToArrayBuffer } from '@tanstack/ai-utils'
+import type { ImagePart, MediaInputMetadata } from '@tanstack/ai'
+
+const DEFAULT_MIME = 'image/png' // fallback when the part carries no mime type and the URL extension is unrecognised
+const MIME_TO_EXT: Record = {
+ 'image/png': 'png',
+ 'image/jpeg': 'jpg',
+ 'image/jpg': 'jpg', // alias spelling, mapped to the same extension as image/jpeg
+ 'image/webp': 'webp',
+ 'image/gif': 'gif',
+}
+
+/**
+ * Choose a file extension for a mime type: the `MIME_TO_EXT` entry when one
+ * exists, otherwise whatever follows the first `/`, otherwise `'png'`.
+ */
+function extForMime(mimeType: string): string {
+  const known = MIME_TO_EXT[mimeType]
+  if (known != null) return known
+  const subtype = mimeType.split('/')[1]
+  return subtype ?? 'png'
+}
+
+/**
+ * Guard for runtimes without a global `File` constructor.
+ *
+ * @throws Error when `File` is not defined in the current environment
+ */
+function ensureFileSupport(): void {
+  const hasFile = typeof File !== 'undefined'
+  if (hasFile) return
+  throw new Error(
+    '`File` is not available in this environment. ' +
+      'Image-conditioned generation requires Node 20+ or a browser context.',
+  )
+}
+
+/**
+ * Convert a TanStack `ImagePart` into an OpenAI-compatible `File`.
+ *
+ * - `source.type === 'data'`: decode base64 → ArrayBuffer → File.
+ * - any other source (URL): `fetch()` it (which also resolves `data:` URIs)
+ *   → Blob → File.
+ *
+ * Mime type resolution for fetched sources, first match wins:
+ * 1. `source.mimeType` supplied by the caller;
+ * 2. the fetched blob's type, but only when it is an `image/*` type. Generic
+ *    content types such as `application/octet-stream` say nothing about the
+ *    image format and would otherwise produce a `File` named
+ *    `<name>.octet-stream` with a non-image type;
+ * 3. the URL's file extension;
+ * 4. `image/png`.
+ *
+ * Base64 sources use `source.mimeType`, else `image/png`.
+ *
+ * @param part - image input to convert
+ * @param fallbackName - file name without extension; the extension is derived
+ *   from the resolved mime type
+ * @returns a `File` holding the image bytes, typed with the resolved mime type
+ * @throws Error when `File` is unavailable in this runtime, or when the URL
+ *   fetch responds with a non-OK status
+ */
+export async function imagePartToFile(
+  part: ImagePart<MediaInputMetadata>,
+  fallbackName: string,
+): Promise<File> {
+  ensureFileSupport()
+
+  if (part.source.type === 'data') {
+    const mimeType = part.source.mimeType || DEFAULT_MIME
+    const bytes = base64ToArrayBuffer(part.source.value)
+    return new File([bytes], `${fallbackName}.${extForMime(mimeType)}`, {
+      type: mimeType,
+    })
+  }
+
+  // URL source — also handles data: URIs uniformly via fetch().
+  const response = await fetch(part.source.value)
+  if (!response.ok) {
+    throw new Error(
+      `Failed to fetch image input (${response.status} ${response.statusText}): ${part.source.value}`,
+    )
+  }
+  const blob = await response.blob()
+  // Strip any `; charset=…` style parameters, then trust the server-reported
+  // type only when it actually names an image format.
+  const reportedType = (blob.type.split(';')[0] ?? '').trim().toLowerCase()
+  const blobMime = reportedType.startsWith('image/') ? reportedType : ''
+  const mimeType =
+    part.source.mimeType || blobMime || inferMimeFromUrl(part.source.value)
+  return new File([blob], `${fallbackName}.${extForMime(mimeType)}`, {
+    type: mimeType,
+  })
+}
+
+/**
+ * Guess an image mime type from a URL's file extension (png, jpg / jpeg,
+ * webp, gif — case-insensitive, followed by `?`, `#`, or end of string).
+ * Returns `DEFAULT_MIME` when no such extension is found.
+ */
+function inferMimeFromUrl(url: string): string {
+  const extension = /\.(png|jpe?g|webp|gif)(?:\?|#|$)/i
+    .exec(url)?.[1]
+    ?.toLowerCase()
+  if (!extension) return DEFAULT_MIME
+  return extension === 'jpg' || extension === 'jpeg'
+    ? 'image/jpeg'
+    : `image/${extension}`
+}
diff --git a/packages/typescript/ai-openai/tests/image-adapter.test.ts b/packages/typescript/ai-openai/tests/image-adapter.test.ts
index 34cb5fda7..6c1da8143 100644
--- a/packages/typescript/ai-openai/tests/image-adapter.test.ts
+++ b/packages/typescript/ai-openai/tests/image-adapter.test.ts
@@ -25,6 +25,9 @@ class TestOpenAIImageAdapter<
spyOnImagesGenerate() {
return vi.spyOn(this.client.images, 'generate')
}
+ spyOnImagesEdit() {
+ return vi.spyOn(this.client.images, 'edit')
+ }
}
describe('OpenAI Image Adapter', () => {
@@ -232,4 +235,162 @@ describe('OpenAI Image Adapter', () => {
expect(result2.id).toMatch(/^openai-/)
})
})
+
+ describe('imageInputs (image-conditioned generation)', () => {
+ const imagesEditResponse: OpenAI.Images.ImagesResponse = {
+ created: 0,
+ data: [{ b64_json: 'edited-base64' }],
+ }
+
+ it('routes to images.edit() for gpt-image-1 when imageInputs is present', async () => {
+ const adapter = new TestOpenAIImageAdapter(
+ { apiKey: 'test-api-key' },
+ 'gpt-image-1',
+ )
+ const editSpy = adapter
+ .spyOnImagesEdit()
+ .mockResolvedValueOnce(imagesEditResponse)
+ const generateSpy = adapter.spyOnImagesGenerate()
+
+ const result = await adapter.generateImages({
+ model: 'gpt-image-1',
+ prompt: 'Make it cinematic',
+ imageInputs: [
+ {
+ type: 'image',
+ source: {
+ type: 'data',
+ value: 'aGVsbG8=',
+ mimeType: 'image/png',
+ },
+ },
+ ],
+ logger: testLogger,
+ })
+
+ expect(generateSpy).not.toHaveBeenCalled()
+ expect(editSpy).toHaveBeenCalledTimes(1)
+ const editArgs = editSpy.mock.calls[0]![0]
+ expect(editArgs.model).toBe('gpt-image-1')
+ expect(editArgs.prompt).toBe('Make it cinematic')
+ expect(editArgs.image).toBeInstanceOf(File)
+ expect(result.images[0]!.b64Json).toBe('edited-base64')
+ })
+
+ it('rejects dall-e-3 with a clear error when imageInputs is present', async () => {
+ const adapter = new TestOpenAIImageAdapter(
+ { apiKey: 'test-api-key' },
+ 'dall-e-3',
+ )
+
+ await expect(
+ adapter.generateImages({
+ model: 'dall-e-3',
+ prompt: 'edit',
+ imageInputs: [
+ {
+ type: 'image',
+ source: { type: 'data', value: 'aGk=', mimeType: 'image/png' },
+ },
+ ],
+ logger: testLogger,
+ }),
+ ).rejects.toThrow(/does not support imageInputs/)
+ })
+
+ it('rejects dall-e-2 when more than one source image is provided', async () => {
+ const adapter = new TestOpenAIImageAdapter(
+ { apiKey: 'test-api-key' },
+ 'dall-e-2',
+ )
+
+ await expect(
+ adapter.generateImages({
+ model: 'dall-e-2',
+ prompt: 'edit',
+ imageInputs: [
+ {
+ type: 'image',
+ source: { type: 'data', value: 'aGk=', mimeType: 'image/png' },
+ },
+ {
+ type: 'image',
+ source: {
+ type: 'data',
+ value: 'YnllCg==',
+ mimeType: 'image/png',
+ },
+ },
+ ],
+ logger: testLogger,
+ }),
+ ).rejects.toThrow(/at most 1 source image/)
+ })
+
+ it('routes metadata.role==="mask" to the mask param', async () => {
+ const adapter = new TestOpenAIImageAdapter(
+ { apiKey: 'test-api-key' },
+ 'gpt-image-1',
+ )
+ const editSpy = adapter
+ .spyOnImagesEdit()
+ .mockResolvedValueOnce(imagesEditResponse)
+
+ await adapter.generateImages({
+ model: 'gpt-image-1',
+ prompt: 'replace masked region',
+ imageInputs: [
+ {
+ type: 'image',
+ source: { type: 'data', value: 'aGk=', mimeType: 'image/png' },
+ },
+ {
+ type: 'image',
+ source: { type: 'data', value: 'bWFzaw==', mimeType: 'image/png' },
+ metadata: { role: 'mask' },
+ },
+ ],
+ logger: testLogger,
+ })
+
+ const editArgs = editSpy.mock.calls[0]![0]
+ expect(editArgs.mask).toBeInstanceOf(File)
+ expect(editArgs.image).toBeInstanceOf(File)
+ })
+
+ it('rejects videoInputs or audioInputs', async () => {
+ const adapter = new TestOpenAIImageAdapter(
+ { apiKey: 'test-api-key' },
+ 'gpt-image-1',
+ )
+
+ await expect(
+ adapter.generateImages({
+ model: 'gpt-image-1',
+ prompt: 'x',
+ videoInputs: [
+ {
+ type: 'video',
+ source: { type: 'url', value: 'https://example.com/v.mp4' },
+ },
+ ],
+ logger: testLogger,
+ }),
+ ).rejects.toThrow(/videoInputs/)
+
+ await expect(
+ adapter.generateImages({
+ model: 'gpt-image-1',
+ prompt: 'x',
+ audioInputs: [
+ {
+ type: 'audio',
+ source: { type: 'url', value: 'https://example.com/a.mp3' },
+ },
+ ],
+ logger: testLogger,
+ }),
+ ).rejects.toThrow(/audioInputs/)
+ })
+ })
})
diff --git a/packages/typescript/ai-openrouter/src/adapters/image.ts b/packages/typescript/ai-openrouter/src/adapters/image.ts
index ac746ab3b..4d4e75772 100644
--- a/packages/typescript/ai-openrouter/src/adapters/image.ts
+++ b/packages/typescript/ai-openrouter/src/adapters/image.ts
@@ -64,6 +64,18 @@ export class OpenRouterImageAdapter<
async generateImages(
options: ImageGenerationOptions,
): Promise {
+ if (
+ options.imageInputs?.length ||
+ options.videoInputs?.length ||
+ options.audioInputs?.length
+ ) {
+ throw new Error(
+ `openrouter.generateImages does not yet support imageInputs / videoInputs / audioInputs. ` +
+ `Image-conditioned generation via OpenRouter requires injecting parts into the multimodal ` +
+ `chat-completions messages array; this is tracked at https://github.com/TanStack/ai/issues/618.`,
+ )
+ }
+
const { model, prompt, numberOfImages, size, modelOptions, logger } =
options
// Use provided aspect_ratio or derive from size
diff --git a/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md b/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md
index 0cd507a56..bf7ccaf0f 100644
--- a/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md
+++ b/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md
@@ -189,6 +189,95 @@ Result shape: `ImageGenerationResult` with `images` array where each entry
has `b64Json?`, `url?`, and `revisedPrompt?`. OpenAI image URLs expire
after 1 hour -- download or display immediately.
+#### Image-conditioned generation: `imageInputs` / `videoInputs` / `audioInputs`
+
+Both `generateImage()` and `generateVideo()` accept multimodal conditioning
+inputs that reuse the existing `ImagePart` / `VideoPart` / `AudioPart`
+shape used elsewhere in TanStack AI. Each input may carry an optional
+`metadata.role` hint that adapters use to route the part to the
+provider-specific field.
+
+```typescript
+import { generateImage, type ImagePart } from '@tanstack/ai'
+import { openaiImage } from '@tanstack/ai-openai'
+
+// Image-to-image (OpenAI gpt-image-1, dall-e-2)
+await generateImage({
+ adapter: openaiImage('gpt-image-1'),
+ prompt: 'Turn this into a cinematic product photo',
+ imageInputs: [
+ { type: 'image', source: { type: 'url', value: 'https://…/product.png' } },
+ ],
+})
+
+// Multi-reference (up to 16 for gpt-image-1; up to 14 for Gemini native)
+await generateImage({
+ adapter: openaiImage('gpt-image-1'),
+ prompt: 'Apply the second image as style to the first',
+ imageInputs: [
+ { type: 'image', source: { type: 'url', value: 'https://…/product.png' } },
+ { type: 'image', source: { type: 'url', value: 'https://…/style.png' } },
+ ],
+})
+
+// Inpaint via metadata.role === 'mask' (OpenAI gpt-image-1, dall-e-2; fal mask_url)
+await generateImage({
+ adapter: openaiImage('gpt-image-1'),
+ prompt: 'Replace the masked region with a tree',
+ imageInputs: [
+ { type: 'image', source: { type: 'url', value: photoUrl } },
+ {
+ type: 'image',
+ source: { type: 'url', value: maskUrl },
+ metadata: { role: 'mask' },
+ },
+ ],
+})
+
+// Image-to-video (OpenAI Sora: single input_reference; fal: image_url + optional end_image_url)
+import { generateVideo } from '@tanstack/ai'
+import { falVideo } from '@tanstack/ai-fal'
+
+await generateVideo({
+ adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'),
+ prompt: 'Slow cinematic push-in',
+ imageInputs: [
+ { type: 'image', source: { type: 'url', value: firstFrameUrl } },
+ {
+ type: 'image',
+ source: { type: 'url', value: lastFrameUrl },
+ metadata: { role: 'end_frame' },
+ },
+ ],
+})
+```
+
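+**Role hints** (`metadata.role`). Veo entries describe the planned mapping only — no native Veo adapter ships yet (see the support matrix below):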
+
+| Role | Maps to |
+| --------------- | ------------------------------------------------------------------------ |
+| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional otherwise |
+| `'character'` | Same as `'reference'`; Veo `referenceImages` slot |
+| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` |
+| `'control'` | fal `control_image_url` (ControlNet / depth / pose) |
+| `'start_frame'` | fal `start_image_url`; Veo `image` |
+| `'end_frame'` | fal `end_image_url`; Veo `lastFrame` |
+
+**Provider support matrix:**
+
+| Provider | `generateImage` `imageInputs` | `generateVideo` `imageInputs` |
+| ---------- | ------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------- |
+| OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. |
+| Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. |
+| fal | 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. |
+| Grok | Throws — adapter uses OpenAI-compat endpoint; native Imagine API rewrite pending. | n/a |
+| OpenRouter | Throws — multimodal injection pending. | n/a |
+| Anthropic | n/a (no image generation API). | n/a |
+
+`videoInputs` and `audioInputs` follow the same `metadata.role` convention
+for video-to-video and lipsync flows on fal; other providers throw when
+they're passed.
+
### 2. Audio Generation (Music, Sound Effects)
Distinct from TTS — `generateAudio()` produces non-speech audio content.
@@ -579,7 +668,45 @@ generateSpeech({
> Source: Gemini TTS adapter validation; CodeRabbit review of PR #463.
-### h. LOW: Writing a logging middleware to see media chunks flow through
+### h. HIGH: Passing `imageInputs` to a model that doesn't support image-conditioned generation
+
+Not every model accepts image-conditioned inputs. Adapters throw a clear
+runtime error when the caller passes `imageInputs` to a model that
+can't honor it (dall-e-3, Imagen, Grok, OpenRouter), so users learn at
+call time rather than getting silently wrong output.
+
+```typescript
+// WRONG — dall-e-3 has no edit/inputs API
+generateImage({
+ adapter: openaiImage('dall-e-3'),
+ prompt: 'Edit this',
+ imageInputs: [{ type: 'image', source: { type: 'url', value: url } }],
+}) // throws: model "dall-e-3" does not support imageInputs.
+
+// WRONG — Imagen is text-to-image only
+generateImage({
+ adapter: geminiImage('imagen-4.0-generate-001'),
+ prompt: 'Edit this',
+ imageInputs: [{ type: 'image', source: { type: 'url', value: url } }],
+}) // throws: Imagen does not support imageInputs.
+
+// CORRECT — use a model that supports edits/inputs
+generateImage({
+ adapter: openaiImage('gpt-image-1'), // edits up to 16 images
+ prompt: 'Edit this',
+ imageInputs: [{ type: 'image', source: { type: 'url', value: url } }],
+})
+
+generateImage({
+ adapter: geminiImage('gemini-3.1-flash-image-preview'), // native multimodal
+ prompt: 'Edit this',
+ imageInputs: [{ type: 'image', source: { type: 'url', value: url } }],
+})
+```
+
+> Source: docs/media/image-generation.md, docs/media/video-generation.md.
+
+### i. LOW: Writing a logging middleware to see media chunks flow through
Every media activity — `generateAudio`, `generateSpeech`,
`generateTranscription`, `generateImage`, `generateVideo` — accepts the
diff --git a/packages/typescript/ai/src/activities/generateImage/index.ts b/packages/typescript/ai/src/activities/generateImage/index.ts
index b8d173b09..285fb6a56 100644
--- a/packages/typescript/ai/src/activities/generateImage/index.ts
+++ b/packages/typescript/ai/src/activities/generateImage/index.ts
@@ -11,7 +11,14 @@ import { resolveDebugOption } from '../../logger/resolve'
import type { InternalLogger } from '../../logger/internal-logger'
import type { DebugOption } from '../../logger/types'
import type { ImageAdapter } from './adapter'
-import type { ImageGenerationResult, StreamChunk } from '../../types'
+import type {
+ AudioPart,
+ ImageGenerationResult,
+ ImagePart,
+ MediaInputMetadata,
+ StreamChunk,
+ VideoPart,
+} from '../../types'
// ===========================
// Activity Kind
@@ -78,6 +85,17 @@ export type ImageActivityOptions<
numberOfImages?: number
/** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */
size?: ImageSizeForModel
+ /**
+ * Image conditioning inputs for image-to-image, reference-guided, edit, or
+ * multi-reference generation. Each part may carry `metadata.role`
+ * (`'reference' | 'mask' | 'control' | 'character'`) to disambiguate intent.
+ * Adapters that don't support image-conditioned generation throw clearly.
+ */
+ imageInputs?: Array>
+ /** Video conditioning inputs. Provider support varies; unsupported adapters throw. */
+ videoInputs?: Array>
+ /** Audio conditioning inputs. Provider support varies; unsupported adapters throw. */
+ audioInputs?: Array>
/**
* Whether to stream the image generation result.
* When true, returns an AsyncIterable for streaming transport.
@@ -210,6 +228,9 @@ async function runGenerateImage<
prompt: rest.prompt,
numberOfImages: rest.numberOfImages,
size: rest.size,
+ imageInputCount: rest.imageInputs?.length,
+ videoInputCount: rest.videoInputs?.length,
+ audioInputCount: rest.audioInputs?.length,
modelOptions: rest.modelOptions,
timestamp: startTime,
})
diff --git a/packages/typescript/ai/src/activities/generateVideo/index.ts b/packages/typescript/ai/src/activities/generateVideo/index.ts
index cee2339f7..ee9b2f9b0 100644
--- a/packages/typescript/ai/src/activities/generateVideo/index.ts
+++ b/packages/typescript/ai/src/activities/generateVideo/index.ts
@@ -14,8 +14,12 @@ import type { InternalLogger } from '../../logger/internal-logger'
import type { DebugOption } from '../../logger/types'
import type { VideoAdapter } from './adapter'
import type {
+ AudioPart,
+ ImagePart,
+ MediaInputMetadata,
StreamChunk,
VideoJobResult,
+ VideoPart,
VideoStatusResult,
VideoUrlResult,
} from '../../types'
@@ -89,6 +93,16 @@ export type VideoCreateOptions<
size?: VideoSizeForAdapter
/** Video duration in seconds */
duration?: number
+ /**
+ * Image conditioning inputs (start frame, end frame, reference / character
+ * images). Use `metadata.role` (`'start_frame' | 'end_frame' | 'reference' |
+ * 'character'`) to disambiguate intent; positional fallback otherwise.
+ */
+ imageInputs?: Array>
+ /** Video conditioning inputs (video-to-video, source clip). */
+ videoInputs?: Array>
+ /** Audio conditioning inputs (lipsync source, voice reference). */
+ audioInputs?: Array>
/**
* Whether to stream the video generation lifecycle.
* When true, returns an AsyncIterable that handles the full
@@ -249,7 +263,16 @@ export function generateVideo<
async function runCreateVideoJob<
TAdapter extends VideoAdapter,
>(options: VideoCreateOptions): Promise {
- const { adapter, prompt, size, duration, modelOptions } = options
+ const {
+ adapter,
+ prompt,
+ size,
+ duration,
+ modelOptions,
+ imageInputs,
+ videoInputs,
+ audioInputs,
+ } = options
const model = adapter.model
const logger: InternalLogger = resolveDebugOption(options.debug)
const providerName =
@@ -269,6 +292,9 @@ async function runCreateVideoJob<
size,
duration,
modelOptions,
+ imageInputs,
+ videoInputs,
+ audioInputs,
logger,
})
logger.output(`activity=generateVideo jobId=${result.jobId}`, {
@@ -296,7 +322,16 @@ function sleep(ms: number): Promise {
async function* runStreamingVideoGeneration<
TAdapter extends VideoAdapter,
>(options: VideoCreateOptions): AsyncIterable {
- const { adapter, prompt, size, duration, modelOptions } = options
+ const {
+ adapter,
+ prompt,
+ size,
+ duration,
+ modelOptions,
+ imageInputs,
+ videoInputs,
+ audioInputs,
+ } = options
const model = adapter.model
const runId = options.runId ?? createId('run')
const pollingInterval = options.pollingInterval ?? 2000
@@ -332,6 +367,9 @@ async function* runStreamingVideoGeneration<
size,
duration,
modelOptions,
+ imageInputs,
+ videoInputs,
+ audioInputs,
logger,
})
diff --git a/packages/typescript/ai/src/types.ts b/packages/typescript/ai/src/types.ts
index a12964981..5528ca9f9 100644
--- a/packages/typescript/ai/src/types.ts
+++ b/packages/typescript/ai/src/types.ts
@@ -1409,6 +1409,31 @@ export interface SummarizationResult {
// Image Generation Types
// ============================================================================
+/**
+ * Optional role hint on a media input part (image / video / audio). Adapters
+ * read `metadata.role` to route the part to the provider-specific request
+ * field — e.g. `'mask'` → OpenAI `mask` / fal `mask_url`, `'end_frame'` → fal
+ * `end_image_url`, `'reference'` → fal `reference_image_urls`. When omitted
+ * the adapter falls back to positional routing.
+ */
+export type MediaInputRole =
+ | 'reference' // fal `reference_image_urls`
+ | 'mask' // OpenAI `mask`; fal `mask_url`
+ | 'control' // fal `control_image_url`
+ | 'start_frame' // fal `start_image_url` (video)
+ | 'end_frame' // fal `end_image_url` (video)
+ | 'character' // routed like 'reference'
+
+/**
+ * Metadata convention for image / video / audio inputs to media generation.
+ * Carried on `ImagePart.metadata` / `VideoPart.metadata` / `AudioPart.metadata`
+ * when used as conditioning inputs to `generateImage()` or `generateVideo()`.
+ */
+export interface MediaInputMetadata {
+ /** Optional role hint disambiguating the part's intent for the adapter */
+ role?: MediaInputRole
+}
+
/**
* Options for image generation.
* These are the common options supported across providers.
@@ -1425,6 +1450,25 @@ export interface ImageGenerationOptions<
numberOfImages?: number
/** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */
size?: TSize
+ /**
+ * Image conditioning inputs (reference / mask / control / character).
+ * Reuses the multimodal `ImagePart` shape. Adapters map these
+ * onto the provider-native request — e.g. OpenAI `images.edit()`, Gemini
+ * multimodal `contents`, fal `image_url` / `image_urls` / `mask_url`.
+ * Adapters that do not support image-conditioned generation throw a clear
+ * runtime error when this field is non-empty.
+ */
+ imageInputs?: Array>
+ /**
+ * Video conditioning inputs (video-to-video, edit, lipsync source).
+ * Not all providers support this; adapters throw when unsupported.
+ */
+ videoInputs?: Array>
+ /**
+ * Audio conditioning inputs (audio reference, voice cloning, lipsync).
+ * Not all providers support this; adapters throw when unsupported.
+ */
+ audioInputs?: Array>
/** Model-specific options for image generation */
modelOptions?: TProviderOptions
/**
@@ -1555,6 +1599,24 @@ export interface VideoGenerationOptions<
size?: TSize
/** Video duration in seconds */
duration?: number
+ /**
+ * Image conditioning inputs (start frame, end frame, character / reference
+ * images). Reuses the multimodal `ImagePart` shape; adapters route by
+ * `metadata.role` and array position (e.g. OpenAI Sora `input_reference`,
+ * fal `image_url` / `end_image_url`, Veo `image` / `lastFrame` /
+ * `referenceImages`). Adapters throw at runtime if unsupported.
+ */
+ imageInputs?: Array>
+ /**
+ * Video conditioning inputs (video-to-video edit, source clip).
+ * Not all providers support this; adapters throw when unsupported.
+ */
+ videoInputs?: Array>
+ /**
+ * Audio conditioning inputs (lipsync source, voice reference).
+ * Not all providers support this; adapters throw when unsupported.
+ */
+ audioInputs?: Array>
/** Model-specific options for video generation */
modelOptions?: TProviderOptions
/**
diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts
index 3b464be5b..89c632208 100644
--- a/testing/e2e/src/lib/feature-support.ts
+++ b/testing/e2e/src/lib/feature-support.ts
@@ -142,9 +142,19 @@ export const matrix: Record> = {
]),
// Gemini excluded: aimock doesn't mock Gemini's Imagen predict endpoint format
'image-gen': new Set(['openai', 'grok']),
+ // image-to-image (imageInputs on generateImage) routes adapters to wire
+ // endpoints aimock doesn't yet mock (OpenAI `/v1/images/edits`, Gemini
+ // multimodal `generateContent`, fal endpoint-specific input fields).
+ // Adapter-level mapping is covered by unit tests. Populate this set when
+ // aimock gains support for those endpoints.
+ 'image-to-image': new Set([]),
tts: new Set(['openai', 'grok']),
transcription: new Set(['openai', 'grok']),
'video-gen': new Set(['openai']),
+ // image-to-video (imageInputs on generateVideo) similarly depends on
+ // aimock mocking Sora's `input_reference` upload field. Populate when
+ // aimock support lands.
+ 'image-to-video': new Set([]),
}
export function isSupported(provider: Provider, feature: Feature): boolean {
diff --git a/testing/e2e/src/lib/types.ts b/testing/e2e/src/lib/types.ts
index be405c74d..dc839fcc4 100644
--- a/testing/e2e/src/lib/types.ts
+++ b/testing/e2e/src/lib/types.ts
@@ -27,9 +27,11 @@ export type Feature =
| 'summarize'
| 'summarize-stream'
| 'image-gen'
+ | 'image-to-image'
| 'tts'
| 'transcription'
| 'video-gen'
+ | 'image-to-video'
export const ALL_PROVIDERS: Provider[] = [
'openai',
@@ -59,7 +61,9 @@ export const ALL_FEATURES: Feature[] = [
'summarize',
'summarize-stream',
'image-gen',
+ 'image-to-image',
'tts',
'transcription',
'video-gen',
+ 'image-to-video',
]