From 15aca9166fd2b0d8ba7970067e6df9cda654f591 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 7 May 2026 06:04:01 -0400 Subject: [PATCH 1/2] Add CITATION.cff --- CITATION.cff | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..324d340 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,20 @@ +cff-version: 1.2.0 +message: "If you use benchflow in your research, please cite it as below." +title: "benchflow: Multi-turn agent benchmarking with ACP" +abstract: "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider." +type: software +authors: + - name: "BenchFlow team" + website: "https://github.com/benchflow-ai/benchflow" +repository-code: "https://github.com/benchflow-ai/benchflow" +url: "https://github.com/benchflow-ai/benchflow" +license: Apache-2.0 +version: 0.3.2 +keywords: + - benchmark + - llm-agents + - acp + - agent-evaluation + - multi-turn + - terminal-bench + - skillsbench From 575cd50b20e389664bb0168d225f2d1b72760da2 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 7 May 2026 06:08:04 -0400 Subject: [PATCH 2/2] Update title and abstract to reference use cases --- CITATION.cff | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 324d340..8cd5728 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,7 +1,7 @@ cff-version: 1.2.0 message: "If you use benchflow in your research, please cite it as below." -title: "benchflow: Multi-turn agent benchmarking with ACP" -abstract: "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider." +title: "BenchFlow: framework for RL environments for LLM agents" +abstract: "BenchFlow is a framework for building RL environments to evaluate and train LLM agents. Built on the Agent Client Protocol (ACP), it provides Scene-based multi-turn, multi-agent, and multi-model evaluation in shared sandboxes — without Docker Compose or sidecar containers. Supported use cases include interactive user simulation, code-review loops, bring-your-own-skill (BYOS) skill generation, multi-turn iterative refinement, cross-model review (cheap coder + strong reviewer), and stateful service tasks against live mock APIs (Gmail, Calendar, Docs, Drive, Slack). See docs/use-cases.md." type: software authors: - name: "BenchFlow team"