1010import io .opentelemetry .api .trace .Tracer ;
1111import io .opentelemetry .context .Context ;
1212import java .util .List ;
13+ import java .util .Map ;
1314import lombok .extern .slf4j .Slf4j ;
15+ import org .junit .jupiter .api .BeforeAll ;
1416import org .junit .jupiter .api .BeforeEach ;
1517import org .junit .jupiter .api .Test ;
1618
1719@ Slf4j
1820public class ScorerBrainstoreImplTest {
19- // NOTE: the remote scorers under test are standard boilerplate autofilled by the braintrust UI
20- // TODO: test is VCR'd so it's fine, but would be nice to have logic to (re)create the score
21- // objects if they are absent
22-
2321 // returns 1.0 for an exact match, 0.0 otherwise
24- private static final String SCORER_SLUG = "typescriptexactmatch-9e44" ;
22+ private static TestHarness . CodeScorerInfo CODE_SCORER_INFO ;
2523
26- // LLM judge scorer that returns {"name":" close- enough-judge","metadata":{"choice":"0.9",...}}
27- private static final String LLM_JUDGE_SLUG = "close-enough-judge-d31b" ;
24+ // LLM judge scorer that returns 1.0 if output is close enough to expected
25+ private static String LLM_JUDGE_SLUG ;
2826
2927 private TestHarness testHarness ;
3028
29+ @ BeforeAll
30+ static void beforeAll () {
31+ var harness = TestHarness .setup ();
32+ CODE_SCORER_INFO = harness .ensureRemoteCodeScorer ("typescript-exact-match" , SCORER_CODE );
33+ LLM_JUDGE_SLUG =
34+ harness .ensureRemoteLLMJudgeScorer (
35+ "close-enough-judge" ,
36+ """
37+ are expected and output a close enough match?
38+ expected: {{expected}}
39+ output: {{output}}
40+ """ ,
41+ Map .of ("NO" , 0.0 , "YES" , 1.0 ));
42+ }
43+
3144 @ BeforeEach
3245 void beforeEach () {
3346 testHarness = TestHarness .setup ();
@@ -39,7 +52,7 @@ void testScorerReturnsOneForExactMatch() {
3952 Scorer .fetchFromBraintrust (
4053 testHarness .braintrust ().openApiClient (),
4154 testHarness .braintrust ().config ().defaultProjectName ().orElseThrow (),
42- SCORER_SLUG ,
55+ CODE_SCORER_INFO . slug () ,
4356 null );
4457 assertNotNull (scorer );
4558 assertNotNull (scorer .getName ());
@@ -59,7 +72,7 @@ void testScorerReturnsZeroForMismatch() {
5972 Scorer .fetchFromBraintrust (
6073 testHarness .braintrust ().openApiClient (),
6174 testHarness .braintrust ().config ().defaultProjectName ().orElseThrow (),
62- SCORER_SLUG ,
75+ CODE_SCORER_INFO . slug () ,
6376 null );
6477 assertNotNull (scorer );
6578 assertNotNull (scorer .getName ());
@@ -75,14 +88,14 @@ void testScorerReturnsZeroForMismatch() {
7588
7689 @ Test
7790 void testScorerOldVersion () {
78- // Version 485dbf64e486ab3a of the exact match scorer always returns 0, even for exact
79- // matches
80- String oldVersion = "485dbf64e486ab3a" ;
91+ // The first version of the exact match scorer (index 0) always returns 0.0, even for
92+ // exact matches. Fetch it by its version ID to verify old-version behavior.
93+ String oldVersion = CODE_SCORER_INFO . versionIds (). get ( 0 ) ;
8194 Scorer <String , String > scorer =
8295 Scorer .fetchFromBraintrust (
8396 testHarness .braintrust ().openApiClient (),
8497 testHarness .braintrust ().config ().defaultProjectName ().orElseThrow (),
85- SCORER_SLUG ,
98+ CODE_SCORER_INFO . slug () ,
8699 oldVersion );
87100 assertNotNull (scorer );
88101 assertNotNull (scorer .getName ());
@@ -219,4 +232,65 @@ void testDistributedTracingWithRemoteScorer() throws InterruptedException {
219232 "Expected to find a span with parent spanId '%s' in trace '%s'. Found %d spans total."
220233 .formatted (spanId , traceId , response .data ().size ()));
221234 }
235+
// TypeScript sources for the "typescript-exact-match" remote code scorer; beforeAll() passes this
// list to ensureRemoteCodeScorer.
//   - entry 0: deliberately buggy handler that always reports score 0.0
//   - entry 1: handler that reports 1.0 when output === expected, otherwise 0.0
// Both handlers return null when expected === null.
// NOTE(review): presumably each entry is registered as one scorer version in list order
// (testScorerOldVersion reads versionIds().get(0) expecting the always-0.0 handler) -- confirm
// against TestHarness.
236+ private static final List <String > SCORER_CODE =
237+ List .of (
238+ // language=typescript
239+ """
240+ import type { Trace } from 'braintrust';
241+ // an older buggy version that always returns 0.0
242+ async function handler({
243+ input,
244+ output,
245+ expected,
246+ metadata,
247+ trace,
248+ }: {
249+ input: any;
250+ output: any;
251+ expected: any;
252+ metadata: Record<string, any>;
253+ trace: Trace;
254+ }): Promise<
255+ | number
256+ | { score: number; name?: string; metadata?: Record<string, unknown> }
257+ | null
258+ > {
259+ if (expected === null) return null;
260+
261+ return {
262+ name: "typescript exact match",
263+ score: 0.0
264+ };
265+ }
266+ """ ,
267+ // language=typescript
268+ """
269+ import type { Trace } from 'braintrust';
270+ // returns 1.0 for exact match, 0.0 otherwise
271+ async function handler({
272+ input,
273+ output,
274+ expected,
275+ metadata,
276+ trace,
277+ }: {
278+ input: any;
279+ output: any;
280+ expected: any;
281+ metadata: Record<string, any>;
282+ trace: Trace;
283+ }): Promise<
284+ | number
285+ | { score: number; name?: string; metadata?: Record<string, unknown> }
286+ | null
287+ > {
288+ if (expected === null) return null;
289+
290+ return {
291+ name: "typescript exact match",
292+ score: output === expected ? 1.0 : 0.0
293+ };
294+ }
295+ """ );
222296}
0 commit comments