From ed6d2a1a4df60b7f83791a2cc79c245833b43dca Mon Sep 17 00:00:00 2001 From: qer Date: Thu, 25 Jun 2026 18:17:18 +0800 Subject: [PATCH 1/2] feat(fetch-url): support fetching images as multimodal content - Extend UrlFetchKind with 'image' to distinguish image responses - Add image field to UrlFetchResult for base64-encoded image data - Update LocalFetchURLProvider to detect image/* content types and download binary data, converting to base64 - Update FetchURLTool to return ContentPart[] with image_url when fetching images, enabling multimodal models to 'see' the image - Update tool description to mention image support - Add test for image content kind detection Fixes: Fetch tool supports pulling images (P1 Backlog) --- .../src/tools/builtin/web/fetch-url.md | 2 +- .../src/tools/builtin/web/fetch-url.ts | 34 ++++++++++++++----- .../src/tools/providers/local-fetch-url.ts | 23 +++++++++++-- .../tools/providers/local-fetch-url.test.ts | 22 ++++++++++++ 4 files changed, 70 insertions(+), 11 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/web/fetch-url.md b/packages/agent-core/src/tools/builtin/web/fetch-url.md index f2356e690..5371d522d 100644 --- a/packages/agent-core/src/tools/builtin/web/fetch-url.md +++ b/packages/agent-core/src/tools/builtin/web/fetch-url.md @@ -1,3 +1,3 @@ -Fetch content from a URL. Returns the main text content extracted from the page. Use this when you need to read a specific web page. +Fetch content from a URL. Returns the main text content extracted from the page, or the image data if the URL points to an image file. Use this when you need to read a specific web page or image. Only public `http`/`https` URLs are supported. Requests to private, loopback, or link-local addresses are refused, and responses larger than 10 MiB are rejected. 
diff --git a/packages/agent-core/src/tools/builtin/web/fetch-url.ts b/packages/agent-core/src/tools/builtin/web/fetch-url.ts index 9ea5b126c..123849e58 100644 --- a/packages/agent-core/src/tools/builtin/web/fetch-url.ts +++ b/packages/agent-core/src/tools/builtin/web/fetch-url.ts @@ -10,7 +10,7 @@ import { z } from 'zod'; import type { BuiltinTool } from '../../../agent/tool'; import { ToolAccesses } from '../../../loop/tool-access'; -import type { ExecutableToolContext, ExecutableToolResult, ToolExecution } from '../../../loop/types'; +import type { ContentPart, ExecutableToolContext, ExecutableToolResult, ToolExecution } from '../../../loop/types'; import { toInputJsonSchema } from '../../support/input-schema'; import { literalRulePattern, matchesGlobRuleSubject } from '../../support/rule-match'; import { ToolResultBuilder } from '../../support/result-builder'; @@ -25,14 +25,18 @@ import DESCRIPTION from './fetch-url.md?raw'; * returned verbatim, in full. * - `extracted` — the body was an HTML page; only the main article text * was extracted and returned. + * - `image` — the body is an image file; the binary data is returned + * as base64-encoded content for multimodal model input. */ -export type UrlFetchKind = 'passthrough' | 'extracted'; +export type UrlFetchKind = 'passthrough' | 'extracted' | 'image'; export interface UrlFetchResult { - /** The text handed to the LLM. */ + /** The text handed to the LLM, or empty string for image content. */ content: string; - /** Whether `content` is a verbatim passthrough or extracted main text. */ + /** Whether content is a verbatim passthrough, extracted main text, or image data. */ kind: UrlFetchKind; + /** Image data as base64, when kind is 'image'. 
*/ + image?: { mimeType: string; base64: string }; } export interface UrlFetcher { @@ -84,12 +88,26 @@ export class FetchURLTool implements BuiltinTool { private async execution( args: FetchURLInput, - { - toolCallId, - }: ExecutableToolContext, + { toolCallId }: ExecutableToolContext, ): Promise<ExecutableToolResult> { try { - const { content, kind } = await this.fetcher.fetch(args.url, { toolCallId }); + const { content, kind, image } = await this.fetcher.fetch(args.url, { toolCallId }); + + if (image) { + const output: ContentPart[] = [ + { + type: 'text', + text: `Fetched image from ${args.url}. Mime type: ${image.mimeType}`, + }, + { type: 'text', text: `` }, + { + type: 'image_url', + imageUrl: { url: `data:${image.mimeType};base64,${image.base64}` }, + }, + { type: 'text', text: '' }, + ]; + return { output, isError: false }; + } if (!content) { return { diff --git a/packages/agent-core/src/tools/providers/local-fetch-url.ts b/packages/agent-core/src/tools/providers/local-fetch-url.ts index af10a8ca3..82d59d14f 100644 --- a/packages/agent-core/src/tools/providers/local-fetch-url.ts +++ b/packages/agent-core/src/tools/providers/local-fetch-url.ts @@ -7,7 +7,8 @@ * 3. Reject responses larger than `maxBytes` (content-length first, * then measured body length as a defensive second check). * 4. `text/plain` / `text/markdown` → passthrough verbatim. - * 5. Otherwise (assumed HTML) → run Readability over a linkedom + * 5. `image/*` → download binary, encode as base64, return as image kind. + * 6. Otherwise (assumed HTML) → run Readability over a linkedom * document. Return `# ${title}\n\n${text}` (title omitted when * absent). If extraction yields no meaningful text, fall back to * common content containers (`
` / `
` / ``) @@ -172,6 +173,25 @@ export class LocalFetchURLProvider implements UrlFetcher { } } + const contentType = (response.headers.get('content-type') ?? '').toLowerCase(); + + // Handle image content types + if (contentType.startsWith('image/')) { + const arrayBuffer = await response.arrayBuffer(); + const actualBytes = arrayBuffer.byteLength; + if (actualBytes > this.maxBytes) { + throw new Error( + `Response body too large: ${String(actualBytes)} bytes exceeds maxBytes (${String(this.maxBytes)}).`, + ); + } + const base64 = Buffer.from(arrayBuffer).toString('base64'); + return { + content: '', + kind: 'image', + image: { mimeType: contentType, base64 }, + }; + } + const body = await response.text(); // Servers may omit content-length — measure again defensively. @@ -182,7 +202,6 @@ export class LocalFetchURLProvider implements UrlFetcher { ); } - const contentType = (response.headers.get('content-type') ?? '').toLowerCase(); if (contentType.startsWith('text/plain') || contentType.startsWith('text/markdown')) { return { content: body, kind: 'passthrough' }; } diff --git a/packages/agent-core/test/tools/providers/local-fetch-url.test.ts b/packages/agent-core/test/tools/providers/local-fetch-url.test.ts index 2c0ce931f..5c031e390 100644 --- a/packages/agent-core/test/tools/providers/local-fetch-url.test.ts +++ b/packages/agent-core/test/tools/providers/local-fetch-url.test.ts @@ -17,7 +17,29 @@ function htmlResponse(body: string, contentType: string): Response { }); } +function imageResponse(data: Uint8Array, contentType: string): Response { + return new Response(data, { + status: 200, + headers: { 'content-type': contentType }, + }); +} + describe('LocalFetchURLProvider content kind', () => { + it('reports image content as image kind with base64 data', async () => { + const imageData = new Uint8Array([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]); + const fetchImpl = vi + .fn() + .mockResolvedValue(imageResponse(imageData, 'image/png')); + const provider = new 
LocalFetchURLProvider({ fetchImpl }); + + const result = await provider.fetch('https://example.com/image.png'); + + expect(result.kind).toBe('image'); + expect(result.image).toBeDefined(); + expect(result.image?.mimeType).toBe('image/png'); + expect(result.image?.base64).toBeTruthy(); + }); + it('reports text/plain bodies as a verbatim passthrough', async () => { const fetchImpl = vi .fn() From a97fad41301ad6cc9fcd0d476d9a0cc711dcdaf5 Mon Sep 17 00:00:00 2001 From: qer Date: Thu, 25 Jun 2026 18:33:47 +0800 Subject: [PATCH 2/2] feat(fetch-url): align with internal API shape - Add PageMetadata to UrlFetchResult (url, mime, title) matching internal FetchResponse.page - Remove XML wrapping around image_url content, return clean image_url part directly - Add imageUrl.id from original URL for traceability - Update LocalFetchURLProvider to include page metadata for all content kinds - Update tests to match new return shape --- .../src/tools/builtin/web/fetch-url.ts | 19 +++++++---- .../src/tools/providers/local-fetch-url.ts | 5 +-- .../agent-core/test/tools/fetch-url.test.ts | 33 +++++++++++++++++-- .../tools/providers/local-fetch-url.test.ts | 5 +-- .../providers/moonshot-fetch-url.test.ts | 6 ++-- 5 files changed, 53 insertions(+), 15 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/web/fetch-url.ts b/packages/agent-core/src/tools/builtin/web/fetch-url.ts index 123849e58..bb6576a02 100644 --- a/packages/agent-core/src/tools/builtin/web/fetch-url.ts +++ b/packages/agent-core/src/tools/builtin/web/fetch-url.ts @@ -30,6 +30,15 @@ import DESCRIPTION from './fetch-url.md?raw'; */ export type UrlFetchKind = 'passthrough' | 'extracted' | 'image'; +export interface PageMetadata { + /** The URL that was fetched. */ + url: string; + /** The title of the page, if available. */ + title?: string; + /** The MIME type of the response. */ + mime?: string; +} + export interface UrlFetchResult { /** The text handed to the LLM, or empty string for image content. 
*/ content: string; @@ -37,6 +46,8 @@ export interface UrlFetchResult { kind: UrlFetchKind; /** Image data as base64, when kind is 'image'. */ image?: { mimeType: string; base64: string }; + /** Page metadata for the fetched resource, aligning with internal API shape. */ + page?: PageMetadata; } export interface UrlFetcher { @@ -95,16 +106,10 @@ export class FetchURLTool implements BuiltinTool { if (image) { const output: ContentPart[] = [ - { - type: 'text', - text: `Fetched image from ${args.url}. Mime type: ${image.mimeType}`, - }, - { type: 'text', text: `` }, { type: 'image_url', - imageUrl: { url: `data:${image.mimeType};base64,${image.base64}` }, + imageUrl: { url: `data:${image.mimeType};base64,${image.base64}`, id: args.url }, }, - { type: 'text', text: '' }, ]; return { output, isError: false }; } diff --git a/packages/agent-core/src/tools/providers/local-fetch-url.ts b/packages/agent-core/src/tools/providers/local-fetch-url.ts index 82d59d14f..0c9e3d470 100644 --- a/packages/agent-core/src/tools/providers/local-fetch-url.ts +++ b/packages/agent-core/src/tools/providers/local-fetch-url.ts @@ -189,6 +189,7 @@ export class LocalFetchURLProvider implements UrlFetcher { content: '', kind: 'image', image: { mimeType: contentType, base64 }, + page: { url, mime: contentType }, }; } @@ -203,10 +204,10 @@ export class LocalFetchURLProvider implements UrlFetcher { } if (contentType.startsWith('text/plain') || contentType.startsWith('text/markdown')) { - return { content: body, kind: 'passthrough' }; + return { content: body, kind: 'passthrough', page: { url, mime: contentType } }; } - return { content: this.extractMainContent(body), kind: 'extracted' }; + return { content: this.extractMainContent(body), kind: 'extracted', page: { url, mime: contentType } }; } private extractMainContent(html: string): string { diff --git a/packages/agent-core/test/tools/fetch-url.test.ts b/packages/agent-core/test/tools/fetch-url.test.ts index 0e5c55ee6..5ec08f93f 100644 --- 
a/packages/agent-core/test/tools/fetch-url.test.ts +++ b/packages/agent-core/test/tools/fetch-url.test.ts @@ -12,6 +12,7 @@ import { HttpFetchError, type UrlFetcher, } from '../../src/tools/builtin/web/fetch-url'; +import type { ImageURLPart } from '@moonshot-ai/kosong'; import { MoonshotFetchURLProvider } from '../../src/tools/providers/moonshot-fetch-url'; import { toolContentString } from './fixtures/fake-kaos'; import { executeTool } from './fixtures/execute-tool'; @@ -20,9 +21,11 @@ const signal = new AbortController().signal; function fakeFetcher( content = '', - kind: 'passthrough' | 'extracted' = 'extracted', + kind: 'passthrough' | 'extracted' | 'image' = 'extracted', + image?: { mimeType: string; base64: string }, + page?: { url: string; mime?: string; title?: string }, ): UrlFetcher { - return { fetch: vi.fn().mockResolvedValue({ content, kind }) }; + return { fetch: vi.fn().mockResolvedValue({ content, kind, image, page }) }; } describe('FetchURLTool', () => { @@ -119,6 +122,32 @@ describe('FetchURLTool', () => { expect((result as { message?: string }).message).toContain('Output is truncated'); }); + it('returns image_url content part for image kind results', async () => { + const fetcher: UrlFetcher = { + fetch: vi.fn().mockResolvedValue({ + content: '', + kind: 'image', + image: { mimeType: 'image/png', base64: 'base64data' }, + page: { url: 'https://example.com/image.png', mime: 'image/png' }, + }), + }; + const tool = new FetchURLTool(fetcher); + + const result = await executeTool(tool, { + turnId: 't1', + toolCallId: 'c_img', + args: { url: 'https://example.com/image.png' }, + signal, + }); + + expect(result.isError).toBe(false); + const output = (result as { output?: ContentPart[] }).output; + expect(output).toHaveLength(1); + expect(output?.[0]?.type).toBe('image_url'); + expect((output?.[0] as ImageURLPart).imageUrl.url).toBe('data:image/png;base64,base64data'); + expect((output?.[0] as 
ImageURLPart).imageUrl.id).toBe('https://example.com/image.png'); + }); + it('returns error when fetcher throws', async () => { const fetcher: UrlFetcher = { fetch: vi.fn().mockRejectedValue(new Error('timeout')), diff --git a/packages/agent-core/test/tools/providers/local-fetch-url.test.ts b/packages/agent-core/test/tools/providers/local-fetch-url.test.ts index 5c031e390..7f9b404e5 100644 --- a/packages/agent-core/test/tools/providers/local-fetch-url.test.ts +++ b/packages/agent-core/test/tools/providers/local-fetch-url.test.ts @@ -38,6 +38,7 @@ describe('LocalFetchURLProvider content kind', () => { expect(result.image).toBeDefined(); expect(result.image?.mimeType).toBe('image/png'); expect(result.image?.base64).toBeTruthy(); + expect(result.page).toEqual({ url: 'https://example.com/image.png', mime: 'image/png' }); }); it('reports text/plain bodies as a verbatim passthrough', async () => { @@ -48,7 +49,7 @@ describe('LocalFetchURLProvider content kind', () => { const result = await provider.fetch('https://example.com/file.txt'); - expect(result).toEqual({ content: 'plain body', kind: 'passthrough' }); + expect(result).toEqual({ content: 'plain body', kind: 'passthrough', page: { url: 'https://example.com/file.txt', mime: 'text/plain; charset=utf-8' } }); }); it('reports text/markdown bodies as a verbatim passthrough', async () => { @@ -59,7 +60,7 @@ describe('LocalFetchURLProvider content kind', () => { const result = await provider.fetch('https://example.com/readme.md'); - expect(result).toEqual({ content: '# Title\n\nbody', kind: 'passthrough' }); + expect(result).toEqual({ content: '# Title\n\nbody', kind: 'passthrough', page: { url: 'https://example.com/readme.md', mime: 'text/markdown' } }); }); it('reports HTML bodies as extracted main content', async () => { diff --git a/packages/agent-core/test/tools/providers/moonshot-fetch-url.test.ts b/packages/agent-core/test/tools/providers/moonshot-fetch-url.test.ts index 59c5f224e..5edbccbe5 100644 --- 
a/packages/agent-core/test/tools/providers/moonshot-fetch-url.test.ts +++ b/packages/agent-core/test/tools/providers/moonshot-fetch-url.test.ts @@ -5,9 +5,11 @@ import { MoonshotFetchURLProvider } from '../../../src/tools/providers/moonshot- function fakeFetcher( content = '', - kind: 'passthrough' | 'extracted' = 'extracted', + kind: 'passthrough' | 'extracted' | 'image' = 'extracted', + image?: { mimeType: string; base64: string }, + page?: { url: string; mime?: string; title?: string }, ): UrlFetcher { - return { fetch: vi.fn().mockResolvedValue({ content, kind }) }; + return { fetch: vi.fn().mockResolvedValue({ content, kind, image, page }) }; } describe('MoonshotFetchURLProvider auth fallback', () => {