-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathinit.py
More file actions
106 lines (93 loc) · 4.69 KB
/
init.py
File metadata and controls
106 lines (93 loc) · 4.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""Initialize the Braintrust project with the workshop dataset.

Run this once before the workshop to set up the Evals-101-Workshop project
and populate its dataset with the 12 test cases used in the eval.

Usage:
    uv run python init.py
"""
import braintrust
import os
from dotenv import load_dotenv
load_dotenv()
# Braintrust project and dataset that the workshop rows are loaded into.
PROJECT_NAME = "Evals-101-Workshop"
DATASET_NAME = "support-agent-dataset"


def _case(question, answer, *, category, tool):
    """Build one dataset row in the input / expected / metadata shape.

    Every workshop case expects exactly one tool call, so the expected tool
    path is always the single-element list containing ``tool``.
    """
    return {
        "input": question,
        "expected": answer,
        "metadata": {
            "category": category,
            "expected_tool": tool,
            "expected_tool_path": [tool],
        },
    }


# The workshop test cases, grouped by the scenario each one exercises.
DATASET = [
    # --- Happy path: order lookup ---
    _case(
        "What's the status of order ORD-1001?",
        "Order ORD-1001 has been delivered. It contains Pro Plan (Annual) and the total was $299.99.",
        category="order_lookup",
        tool="lookup_order",
    ),
    _case(
        "Can you tell me about order ORD-1004?",
        "Order ORD-1004 has been delivered. Items: Pro Plan (Monthly) and Storage Upgrade. Total: $74.98.",
        category="order_lookup",
        tool="lookup_order",
    ),
    # --- Happy path: refund (eligible) ---
    _case(
        "I'd like a refund for order ORD-1001, the product didn't meet my expectations.",
        "Refund of $299.99 for order ORD-1001 has been processed.",
        category="refund",
        tool="process_refund",
    ),
    # --- Happy path: FAQ ---
    _case(
        "How do I reset my password?",
        "Go to Settings > Security > Reset Password. You'll receive an email with a reset link.",
        category="faq",
        tool="search_faq",
    ),
    _case(
        "What payment methods do you accept?",
        "We accept Visa, Mastercard, American Express, and PayPal.",
        category="faq",
        tool="search_faq",
    ),
    # --- Failure mode 1: Hallucinated order data (nonexistent order) ---
    _case(
        "What's the status of order ORD-9999?",
        "Order ORD-9999 was not found. Please double-check your order ID or contact support.",
        category="order_not_found",
        tool="lookup_order",
    ),
    _case(
        "Tell me about order ORD-0000",
        "That order could not be found in our system.",
        category="order_not_found",
        tool="lookup_order",
    ),
    # --- Failure mode 2: Wrong tool selection ---
    _case(
        "Do you offer a free trial?",
        "Yes! All plans include a 14-day free trial. No credit card required.",
        category="faq",
        tool="search_faq",
    ),
    _case(
        "How do I cancel my subscription?",
        "Go to Settings > Billing > Cancel Subscription. Your access continues until the end of your billing period.",
        category="faq",
        tool="search_faq",
    ),
    # --- Failure mode 3: Refund policy violation (ineligible orders) ---
    _case(
        "I want a refund for order ORD-1002",
        "Order ORD-1002 is currently shipped and is not eligible for a refund. Only delivered orders can be refunded.",
        category="refund_ineligible",
        tool="process_refund",
    ),
    _case(
        "Please refund order ORD-1003, I changed my mind.",
        "Order ORD-1003 is still processing and cannot be refunded yet. Only delivered orders are eligible.",
        category="refund_ineligible",
        tool="process_refund",
    ),
    # --- Failure mode 4: FAQ mismatch ---
    _case(
        "Can I integrate Acme with Slack?",
        "I don't have information about Slack integration. Please contact support@acme.com for help.",
        category="faq_no_match",
        tool="search_faq",
    ),
]
def main():
    """Upload every row of DATASET to the Braintrust workshop dataset.

    Calls ``braintrust.init_dataset`` for dataset DATASET_NAME in project
    PROJECT_NAME, inserts each row's input, expected output and metadata,
    flushes the pending writes, and prints a one-line summary.

    Each row is inserted under a deterministic id (the SHA-256 hex digest of
    its input text). Braintrust treats an insert with an existing id as an
    upsert, so running this script again overwrites the same records instead
    of appending a second copy of every test case.

    Raises:
        KeyError: if BRAINTRUST_API_KEY is not set in the environment.
    """
    # Local import: hashlib is only needed here, to derive stable row ids.
    import hashlib

    dataset = braintrust.init_dataset(
        project=PROJECT_NAME,
        name=DATASET_NAME,
        api_key=os.environ["BRAINTRUST_API_KEY"],
    )

    for row in DATASET:
        # The id depends only on the input text, so it survives reordering of
        # DATASET. Two rows with identical input would share one record.
        row_id = hashlib.sha256(row["input"].encode("utf-8")).hexdigest()
        dataset.insert(
            id=row_id,
            input=row["input"],
            expected=row["expected"],
            metadata=row["metadata"],
        )

    # Inserts are batched by the SDK; flush once after the loop so everything
    # is uploaded before the summary is printed and the process exits.
    dataset.flush()
    print(f"Loaded {len(DATASET)} rows into dataset '{DATASET_NAME}' in project '{PROJECT_NAME}'")


if __name__ == "__main__":
    main()