Skip to content

Commit 2975298

Browse files
committed
Merge branch 'main' into derekx/new-checkpointing
2 parents 99bcb4d + cfa015d commit 2975298

17 files changed

Lines changed: 360 additions & 197 deletions

eval_protocol/dataset_logger/__init__.py

Lines changed: 24 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -3,16 +3,31 @@
33
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
44
from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter
55

6+
67
# Allow disabling sqlite logger to avoid environment-specific constraints in simple CLI runs.
7-
if os.getenv("DISABLE_EP_SQLITE_LOG", "0").strip() != "1":
8-
default_logger = SqliteDatasetLoggerAdapter()
9-
else:
8+
def _get_default_logger():
9+
if os.getenv("DISABLE_EP_SQLITE_LOG", "0").strip() != "1":
10+
return SqliteDatasetLoggerAdapter()
11+
else:
12+
13+
class _NoOpLogger(DatasetLogger):
14+
def log(self, row):
15+
return None
16+
17+
def read(self, rollout_id=None):
18+
return []
19+
20+
return _NoOpLogger()
21+
22+
23+
# Lazy property that creates the logger only when accessed
24+
class _LazyLogger(DatasetLogger):
25+
26+
def log(self, row):
27+
return _get_default_logger().log(row)
1028

11-
class _NoOpLogger(DatasetLogger):
12-
def log(self, row):
13-
return None
29+
def read(self, rollout_id=None):
30+
return _get_default_logger().read(rollout_id)
1431

15-
def read(self, rollout_id=None):
16-
return []
1732

18-
default_logger = _NoOpLogger()
33+
default_logger: DatasetLogger = _LazyLogger()

eval_protocol/pytest/default_agent_rollout_processor.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ async def call_agent(self) -> str:
5959
self.append_message_and_log(message)
6060
if message.tool_calls:
6161
# Create tasks for all tool calls to run them in parallel
62-
tool_tasks = []
62+
tool_tasks: List[asyncio.Task[tuple[str, List[TextContent]]]] = []
6363
for tool_call in message.tool_calls:
6464
tool_call_id = tool_call.id
6565
tool_name = tool_call.function.name
@@ -71,7 +71,7 @@ async def call_agent(self) -> str:
7171
tool_tasks.append(task)
7272

7373
# Execute all tool calls in parallel
74-
tool_results: List[tuple[str, List[TextContent]]] = await asyncio.gather(*tool_tasks)
74+
tool_results = await asyncio.gather(*tool_tasks)
7575

7676
# Add all tool results to messages (they will be in the same order as tool_calls)
7777
for tool_call, (tool_call_id, content) in zip(message.tool_calls, tool_results):

tests/pytest/test_pytest_ensure_logging.py

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List
1+
import os
22
from unittest.mock import Mock, patch
33

44
import eval_protocol.dataset_logger as dataset_logger
@@ -13,28 +13,21 @@ async def test_ensure_logging(monkeypatch):
1313
"""
1414
Ensure that default SQLITE logger gets called by mocking the storage and checking that the storage is called.
1515
"""
16-
from eval_protocol.pytest.evaluation_test import evaluation_test
17-
1816
# Mock the SqliteEvaluationRowStore to track calls
19-
mock_store = Mock(spec=SqliteEvaluationRowStore)
17+
mock_store = Mock()
2018
mock_store.upsert_row = Mock()
2119
mock_store.read_rows = Mock(return_value=[])
2220
mock_store.db_path = "/tmp/test.db"
2321

24-
# Create a custom logger that uses our mocked store
25-
class MockSqliteLogger(DatasetLogger):
26-
def __init__(self, store: SqliteEvaluationRowStore):
27-
self._store = store
28-
29-
def log(self, row: EvaluationRow) -> None:
30-
data = row.model_dump(exclude_none=True, mode="json")
31-
self._store.upsert_row(data=data)
32-
33-
def read(self, rollout_id=None) -> List[EvaluationRow]:
34-
results = self._store.read_rows(rollout_id=rollout_id)
35-
return [EvaluationRow(**data) for data in results]
36-
37-
mock_logger = MockSqliteLogger(mock_store)
22+
# Mock the SqliteEvaluationRowStore constructor so that when SqliteDatasetLoggerAdapter
23+
# creates its store, it gets our mock instead
24+
with patch(
25+
"eval_protocol.dataset_logger.sqlite_dataset_logger_adapter.SqliteEvaluationRowStore", return_value=mock_store
26+
):
27+
from eval_protocol.models import EvaluationRow
28+
from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
29+
from eval_protocol.pytest.evaluation_test import evaluation_test
30+
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
3831

3932
@evaluation_test(
4033
input_dataset=[
@@ -46,7 +39,7 @@ def read(self, rollout_id=None) -> List[EvaluationRow]:
4639
mode="pointwise",
4740
combine_datasets=False,
4841
num_runs=2,
49-
logger=mock_logger, # Use our mocked logger
42+
# Don't pass logger parameter - let it use the default_logger (which we've replaced)
5043
)
5144
def eval_fn(row: EvaluationRow) -> EvaluationRow:
5245
return row

vite-app/dist/assets/index-CpScNe1P.css

Lines changed: 0 additions & 1 deletion
This file was deleted.

vite-app/dist/assets/index-D1ErODUS.js

Lines changed: 93 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/assets/index-D1ErODUS.js.map

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/assets/index-D5KxcfFQ.css

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/assets/index-dHlKwEPE.js

Lines changed: 0 additions & 93 deletions
This file was deleted.

vite-app/dist/assets/index-dHlKwEPE.js.map

Lines changed: 0 additions & 1 deletion
This file was deleted.

vite-app/dist/index.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
66
<title>EP | Log Viewer</title>
77
<link rel="icon" href="/assets/favicon-BkAAWQga.png" />
8-
<script type="module" crossorigin src="/assets/index-dHlKwEPE.js"></script>
9-
<link rel="stylesheet" crossorigin href="/assets/index-CpScNe1P.css">
8+
<script type="module" crossorigin src="/assets/index-D1ErODUS.js"></script>
9+
<link rel="stylesheet" crossorigin href="/assets/index-D5KxcfFQ.css">
1010
</head>
1111
<body>
1212
<div id="root"></div>

0 commit comments

Comments
 (0)