Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion inference-platforms/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ To start and use Ollama, do the following:

---
[aigw]: https://aigateway.envoyproxy.io/docs/cli/aigwrun
[archgw]: https://docs.planoai.dev/guides/observability/tracing.html
[litellm]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
[llama-stack]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#telemetry
[AgC]: https://github.com/masaic-ai-platform/AgC/blob/main/platform/README.md#setting-up-the-opentelemetry-collector
Expand Down
41 changes: 32 additions & 9 deletions inference-platforms/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,25 @@
# This must precede any other imports you want to instrument!
auto_instrumentation.initialize()

import argparse
import asyncio
import os
from datetime import datetime, timedelta

from agents import (
Agent,
HostedMCPTool,
OpenAIProvider,
RunConfig,
Runner,
Tool,
)
from agents.mcp import MCPServerStreamableHttp, MCPUtil
from openai.types.responses.tool_param import Mcp


async def run_agent(tools: list[Tool]):
model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
model = OpenAIProvider(use_responses=False).get_model(model_name)
async def run_agent(tools: list[Tool], model_name: str, use_responses: bool):
model = OpenAIProvider(use_responses=use_responses).get_model(model_name)
agent = Agent(
name="flight-search-agent",
model=model,
Expand All @@ -49,18 +51,39 @@ async def run_agent(tools: list[Tool]):


async def main():
    """Entry point: run the flight-search agent against the Kiwi MCP server.

    Two modes, selected by the --use-responses-api flag:
      * Responses API mode: MCP tools are hosted server-side; we only pass
        the MCP server configuration via HostedMCPTool.
      * Agents mode (default): we connect to the MCP server client-side,
        list its tools, and wrap each one as a function tool.

    Environment variables:
      AGENT_MODEL  model name (default "gpt-5-nano")
      MCP_URL      MCP server URL (default "https://mcp.kiwi.com")
      MCP_HEADERS  comma-separated "key=value" pairs sent as HTTP headers
    """
    parser = argparse.ArgumentParser(description="MCP-enabled flight search agent")
    parser.add_argument("--use-responses-api", action="store_true", help="Use Responses API instead of Agents")
    args = parser.parse_args()

    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
    mcp_url = os.getenv("MCP_URL", "https://mcp.kiwi.com")
    # "k1=v1,k2=v2" -> {"k1": "v1", "k2": "v2"}; empty/blank entries ignored.
    # split("=", 1) keeps '=' characters inside header values intact.
    mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h)

    if args.use_responses_api:
        # Server-side MCP via Responses API: the inference platform connects
        # to the MCP server itself, so no local MCP session is needed.
        tools = [
            HostedMCPTool(
                tool_config=Mcp(
                    type="mcp",
                    server_url=mcp_url,
                    server_label="kiwi-flights",
                    headers=mcp_headers,
                    require_approval="never",
                )
            )
        ]
        await run_agent(tools, model_name, use_responses=True)
        return

    # Client-side MCP orchestration: open a streamable-HTTP session, discover
    # the server's tools, and expose each as a function tool to the agent.
    async with MCPServerStreamableHttp(
        {"url": mcp_url, "headers": mcp_headers, "timeout": 30.0},
        client_session_timeout_seconds=60.0,
    ) as server:
        tools = await server.list_tools()
        util = MCPUtil()
        # False: do not convert JSON schemas to strict mode.
        tools = [util.to_function_tool(tool, server, False) for tool in tools]
        await run_agent(tools, model_name, use_responses=False)


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions inference-platforms/archgw/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ Just run it again until we find a way to make the results idempotent.

---
[docs]: https://github.com/katanemo/archgw?tab=readme-ov-file#use-arch-gateway-as-llm-router
[config]: https://docs.planoai.dev/guides/observability/tracing.html
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found these new links. Not sure if we're supposed to rename archgw -> plano everywhere but definitely don't want to

[envoy-otel]: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/trace/v3/opentelemetry.proto#extension-envoy-tracers-opentelemetry
[archgw-wasm]: https://github.com/katanemo/plano/blob/main/README.md
[uv]: https://docs.astral.sh/uv/getting-started/installation/
[openai-responses]: https://github.com/katanemo/plano/issues/476
[otel-tui]: https://github.com/ymtdzzz/otel-tui
6 changes: 2 additions & 4 deletions inference-platforms/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,8 @@ def main():

# vllm-specific switch to disable thinking, ignored by other inference platforms.
# See https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes
if "qwen3" in model.lower():
extra_body = {"chat_template_kwargs": {"enable_thinking": False}}
else:
extra_body = {}
extra_body = {"chat_template_kwargs": {"enable_thinking": False}} if model.startswith("Qwen/Qwen3") else None

if args.use_responses_api:
response = client.responses.create(
model=model, input=messages[0]["content"], temperature=0, extra_body=extra_body
Expand Down
26 changes: 14 additions & 12 deletions inference-platforms/llama-stack/README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
# Llama Stack

This shows how to use [Llama Stack][docs] to proxy Ollama via an OpenAI
compatible API.

## Prerequisites

Expand All @@ -13,7 +10,7 @@ Start Ollama and your OpenTelemetry Collector via this repository's [README](../
## Run Llama Stack

```bash
docker compose up --force-recreate --remove-orphans
```

Clean up when finished, like this:
Expand All @@ -36,16 +33,21 @@ Or, for the OpenAI Responses API
uv run --exact -q --env-file env.local ../chat.py --use-responses-api
```

### MCP Agent

```bash
uv run --exact -q --env-file env.local ../agent.py --use-responses-api
```

## Notes

Here are some constraints about the LlamaStack implementation:
* Llama Stack's Responses API connects to MCP servers server-side (unlike aigw
  which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`.
* Uses the `starter` distribution with its built-in `remote::openai` provider,
  pointing to Ollama via `OPENAI_BASE_URL` environment variable.
* Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`)

---
[docs]: https://llama-stack.readthedocs.io/en/latest/index.html
[otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
[uv]: https://docs.astral.sh/uv/getting-started/installation/
[docker]: https://github.com/llamastack/llama-stack/issues/406
11 changes: 7 additions & 4 deletions inference-platforms/llama-stack/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,16 @@ services:
env_file:
- env.local
entrypoint: sh
command: -c 'env | grep _MODEL | cut -d= -f2 | sed "s/^[^/]*\///" | xargs -I{} ollama pull {}'
extra_hosts: # send localhost traffic to the docker host, e.g. your laptop
- "localhost:host-gateway"

llama-stack:
depends_on:
ollama-pull:
condition: service_completed_successfully
image: llamastack/distribution-starter:0.5.0
container_name: llama-stack
platform: linux/amd64 # Force amd64 with emulation
tty: true
env_file:
- env.local
Expand All @@ -26,7 +25,11 @@ services:
# Ensure the container which specially treats localhost routes back to the
# host machine, e.g. your laptop.
environment:
- OPENAI_BASE_URL=http://host.docker.internal:11434/v1
- OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318
# Ensure we only see traces sampled upstream. This reduces noise without
# disabling SQL tracing entirely.
- OTEL_TRACES_SAMPLER=parentbased_traceidratio
- OTEL_TRACES_SAMPLER_ARG=0.0
extra_hosts:
- "host.docker.internal:host-gateway"
13 changes: 5 additions & 8 deletions inference-platforms/llama-stack/env.local
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
# Override default ENV variables for llama-stack
# OpenAI-compatible endpoint configuration
OPENAI_BASE_URL=http://localhost:8321/v1
OPENAI_API_KEY=unused

# Models require `provider_id/` prefix, in this case `openai`
CHAT_MODEL=openai/qwen3:0.6b
AGENT_MODEL=openai/qwen3:1.7b

# OpenTelemetry configuration
TELEMETRY_SINKS=otel_trace,otel_metric
OTEL_SERVICE_NAME=llama-stack
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
Expand Down