From 4f8df81bf2ed54a71ca27cdd1a7d4c635b4c9972 Mon Sep 17 00:00:00 2001 From: Adrian Cole Date: Mon, 3 Nov 2025 15:24:33 +0800 Subject: [PATCH 1/3] llama-stack: switches to open model configuration with openai remote Signed-off-by: Adrian Cole --- inference-platforms/agent.py | 41 +++++++++++++++---- inference-platforms/llama-stack/README.md | 26 +++++++----- .../llama-stack/docker-compose.yml | 12 ++++-- inference-platforms/llama-stack/env.local | 13 +++--- 4 files changed, 61 insertions(+), 31 deletions(-) diff --git a/inference-platforms/agent.py b/inference-platforms/agent.py index 276c3d5..f5032a1 100644 --- a/inference-platforms/agent.py +++ b/inference-platforms/agent.py @@ -16,23 +16,25 @@ # This must precede any other imports you want to instrument! auto_instrumentation.initialize() +import argparse import asyncio import os from datetime import datetime, timedelta from agents import ( Agent, + HostedMCPTool, OpenAIProvider, RunConfig, Runner, Tool, ) from agents.mcp import MCPServerStreamableHttp, MCPUtil +from openai.types.responses.tool_param import Mcp -async def run_agent(tools: list[Tool]): - model_name = os.getenv("AGENT_MODEL", "gpt-5-nano") - model = OpenAIProvider(use_responses=False).get_model(model_name) +async def run_agent(tools: list[Tool], model_name: str, use_responses: bool): + model = OpenAIProvider(use_responses=use_responses).get_model(model_name) agent = Agent( name="flight-search-agent", model=model, @@ -49,18 +51,39 @@ async def run_agent(tools: list[Tool]): async def main(): + parser = argparse.ArgumentParser(description="MCP-enabled flight search agent") + parser.add_argument("--use-responses-api", action="store_true", help="Use Responses API instead of Agents") + args = parser.parse_args() + + model_name = os.getenv("AGENT_MODEL", "gpt-5-nano") + mcp_url = os.getenv("MCP_URL", "https://mcp.kiwi.com") + mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h) + + if args.use_responses_api: + # 
Server-side MCP via Responses API + tools = [ + HostedMCPTool( + tool_config=Mcp( + type="mcp", + server_url=mcp_url, + server_label="kiwi-flights", + headers=mcp_headers, + require_approval="never", + ) + ) + ] + await run_agent(tools, model_name, use_responses=True) + return + + # Client-side MCP orchestration async with MCPServerStreamableHttp( - { - "url": os.getenv("MCP_URL", "https://mcp.kiwi.com"), - "headers": dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h), - "timeout": 30.0, - }, + {"url": mcp_url, "headers": mcp_headers, "timeout": 30.0}, client_session_timeout_seconds=60.0, ) as server: tools = await server.list_tools() util = MCPUtil() tools = [util.to_function_tool(tool, server, False) for tool in tools] - await run_agent(tools) + await run_agent(tools, model_name, use_responses=False) if __name__ == "__main__": diff --git a/inference-platforms/llama-stack/README.md b/inference-platforms/llama-stack/README.md index 6e8e202..9cbd902 100644 --- a/inference-platforms/llama-stack/README.md +++ b/inference-platforms/llama-stack/README.md @@ -1,10 +1,7 @@ # Llama Stack -This shows how to use [Llama Stack][docs] to proxy Ollama, accessible via an -OpenAI compatible API. - -This uses the [`otel` telemetry sink][otel-sink] to export OpenTelemetry traces -and metrics from signals recorded with Llama Stack's observability SDK. +This shows how to use [Llama Stack][docs] to proxy Ollama via an OpenAI +compatible API. 
## Prerequisites @@ -13,7 +10,7 @@ Start Ollama and your OpenTelemetry Collector via this repository's [README](../ ## Run Llama Stack ```bash -docker compose up --pull always --force-recreate --remove-orphans +docker compose up --force-recreate --remove-orphans ``` Clean up when finished, like this: @@ -36,16 +33,25 @@ Or, for the OpenAI Responses API uv run --exact -q --env-file env.local ../chat.py --use-responses-api ``` +### MCP Agent + +```bash +uv run --exact -q --env-file env.local ../agent.py --use-responses-api +``` + ## Notes -Here are some constraints about the LlamaStack implementation: -* Only supports llama models (so not Qwen) -* Bridges its tracing and metrics APIs to `otel_trace` and `otel_metric` sinks. +* Llama Stack's Responses API connects to MCP servers server-side (unlike aigw + which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`. + +* Uses the `starter` distribution with its built-in `remote::openai` provider, + pointing to Ollama via `OPENAI_BASE_URL` environment variable. +* Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`) * Until [this issue][docker] resolves, running docker on Apple Silicon requires emulation. 
--- +[docker]: https://github.com/llamastack/llama-stack/issues/406 [docs]: https://llama-stack.readthedocs.io/en/latest/index.html [otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration [uv]: https://docs.astral.sh/uv/getting-started/installation/ -[docker]: https://github.com/llamastack/llama-stack/issues/406 diff --git a/inference-platforms/llama-stack/docker-compose.yml b/inference-platforms/llama-stack/docker-compose.yml index e0d3b7d..4e78278 100644 --- a/inference-platforms/llama-stack/docker-compose.yml +++ b/inference-platforms/llama-stack/docker-compose.yml @@ -7,7 +7,7 @@ services: env_file: - env.local entrypoint: sh - command: -c 'env | grep _MODEL | cut -d= -f2 | xargs -I{} ollama pull {}' + command: -c 'env | grep _MODEL | cut -d= -f2 | sed "s/^[^/]*\///" | xargs -I{} ollama pull {}' extra_hosts: # send localhost traffic to the docker host, e.g. your laptop - "localhost:host-gateway" @@ -15,9 +15,9 @@ services: depends_on: ollama-pull: condition: service_completed_successfully - image: llamastack/distribution-starter:0.2.20 + image: llamastack/distribution-starter:0.4.1 + platform: linux/amd64 # ARM64 not published: https://github.com/llamastack/llama-stack/issues/406 container_name: llama-stack - platform: linux/amd64 # Force amd64 with emulation tty: true env_file: - env.local @@ -26,7 +26,11 @@ services: # Ensure the container which specially treats localhost routes back to the # host machine, e.g. your laptop. environment: - - OLLAMA_URL=http://host.docker.internal:11434 + - OPENAI_BASE_URL=http://host.docker.internal:11434/v1 - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318 + # Ensure we only see traces sampled upstream. This reduces noise without + # disabling SQL tracing entirely. 
+ - OTEL_TRACES_SAMPLER=parentbased_traceidratio + - OTEL_TRACES_SAMPLER_ARG=0.0 extra_hosts: - "host.docker.internal:host-gateway" diff --git a/inference-platforms/llama-stack/env.local b/inference-platforms/llama-stack/env.local index 0af6145..69ff51c 100644 --- a/inference-platforms/llama-stack/env.local +++ b/inference-platforms/llama-stack/env.local @@ -1,14 +1,11 @@ -# Override default ENV variables for llama-stack -OPENAI_BASE_URL=http://localhost:8321/v1/openai/v1 +# OpenAI-compatible endpoint configuration +OPENAI_BASE_URL=http://localhost:8321/v1 OPENAI_API_KEY=unused -CHAT_MODEL=llama3.2:1b - -# Variables used by llama-stack -OLLAMA_URL=http://localhost:11434 -INFERENCE_MODEL=llama3.2:1b +# Models require `provider_id/` prefix, in this case `openai` +CHAT_MODEL=openai/qwen3:0.6b +AGENT_MODEL=openai/qwen3:1.7b # OpenTelemetry configuration -TELEMETRY_SINKS=otel_trace,otel_metric OTEL_SERVICE_NAME=llama-stack OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf From 74ea0ac7e8bfc600e988e0483b79c3e673781afd Mon Sep 17 00:00:00 2001 From: Adrian Cole Date: Sat, 14 Feb 2026 11:36:50 +0800 Subject: [PATCH 2/3] llama-stack: updates to 0.5.0 and removes amd64 emulation Signed-off-by: Adrian Cole --- inference-platforms/chat.py | 6 ++---- inference-platforms/llama-stack/README.md | 4 ---- inference-platforms/llama-stack/docker-compose.yml | 3 +-- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/inference-platforms/chat.py b/inference-platforms/chat.py index bcabd27..9d290e3 100644 --- a/inference-platforms/chat.py +++ b/inference-platforms/chat.py @@ -39,10 +39,8 @@ def main(): # vllm-specific switch to disable thinking, ignored by other inference platforms. 
# See https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes - if "qwen3" in model.lower(): - extra_body = {"chat_template_kwargs": {"enable_thinking": False}} - else: - extra_body = {} + extra_body = {"chat_template_kwargs": {"enable_thinking": False}} if model.startswith("Qwen/Qwen3") else None + if args.use_responses_api: response = client.responses.create( model=model, input=messages[0]["content"], temperature=0, extra_body=extra_body diff --git a/inference-platforms/llama-stack/README.md b/inference-platforms/llama-stack/README.md index 9cbd902..ca89737 100644 --- a/inference-platforms/llama-stack/README.md +++ b/inference-platforms/llama-stack/README.md @@ -43,15 +43,11 @@ uv run --exact -q --env-file env.local ../agent.py --use-responses-api * Llama Stack's Responses API connects to MCP servers server-side (unlike aigw which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`. - * Uses the `starter` distribution with its built-in `remote::openai` provider, pointing to Ollama via `OPENAI_BASE_URL` environment variable. * Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`) -* Until [this issue][docker] resolves, running docker on Apple Silicon - requires emulation. 
--- -[docker]: https://github.com/llamastack/llama-stack/issues/406 [docs]: https://llama-stack.readthedocs.io/en/latest/index.html [otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration [uv]: https://docs.astral.sh/uv/getting-started/installation/ diff --git a/inference-platforms/llama-stack/docker-compose.yml b/inference-platforms/llama-stack/docker-compose.yml index 4e78278..f220507 100644 --- a/inference-platforms/llama-stack/docker-compose.yml +++ b/inference-platforms/llama-stack/docker-compose.yml @@ -15,8 +15,7 @@ services: depends_on: ollama-pull: condition: service_completed_successfully - image: llamastack/distribution-starter:0.4.1 - platform: linux/amd64 # ARM64 not published: https://github.com/llamastack/llama-stack/issues/406 + image: llamastack/distribution-starter:0.5.0 container_name: llama-stack tty: true env_file: From 81153b06f8ece4e7010faabef2d79b0d6066544d Mon Sep 17 00:00:00 2001 From: Anuraag Agrawal Date: Mon, 16 Feb 2026 11:07:41 +0900 Subject: [PATCH 3/3] Update links --- inference-platforms/README.md | 2 +- inference-platforms/archgw/README.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inference-platforms/README.md b/inference-platforms/README.md index ec365dc..af38c60 100644 --- a/inference-platforms/README.md +++ b/inference-platforms/README.md @@ -106,7 +106,7 @@ To start and use Ollama, do the following: --- [aigw]: https://aigateway.envoyproxy.io/docs/cli/aigwrun -[archgw]: https://docs.archgw.com/guides/observability/tracing.html +[archgw]: https://docs.planoai.dev/guides/observability/tracing.html [litellm]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration [llama-stack]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#telemetry [AgC]: https://github.com/masaic-ai-platform/AgC/blob/main/platform/README.md#setting-up-the-opentelemetry-collector diff --git 
a/inference-platforms/archgw/README.md b/inference-platforms/archgw/README.md index e4525bc..0ef4ce4 100644 --- a/inference-platforms/archgw/README.md +++ b/inference-platforms/archgw/README.md @@ -76,9 +76,9 @@ Just run it again until we find a way to make the results idempotent. --- [docs]: https://github.com/katanemo/archgw?tab=readme-ov-file#use-arch-gateway-as-llm-router -[config]: https://docs.archgw.com/guides/observability/tracing.html +[config]: https://docs.planoai.dev/guides/observability/tracing.html [envoy-otel]: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/trace/v3/opentelemetry.proto#extension-envoy-tracers-opentelemetry -[archgw-wasm]: https://github.com/katanemo/archgw/blob/main/arch/README.md +[archgw-wasm]: https://github.com/katanemo/plano/blob/main/README.md [uv]: https://docs.astral.sh/uv/getting-started/installation/ -[openai-responses]: https://github.com/katanemo/archgw/issues/476 +[openai-responses]: https://github.com/katanemo/plano/issues/476 [otel-tui]: https://github.com/ymtdzzz/otel-tui