diff --git a/gittensor/validator/pat_handler.py b/gittensor/validator/pat_handler.py index 21230a45..49e416dc 100644 --- a/gittensor/validator/pat_handler.py +++ b/gittensor/validator/pat_handler.py @@ -7,6 +7,7 @@ """ import asyncio +import json from typing import TYPE_CHECKING, Optional, Tuple import bittensor as bt @@ -70,22 +71,28 @@ def _reject(reason: str) -> PatBroadcastSynapse: if error: return _reject(error) - # 3. Enforce GitHub identity pinning — same hotkey cannot switch GitHub accounts - identity_error = _github_identity_pin_error(uid, hotkey, github_id) - if identity_error: - return _reject(identity_error) - - # 4. Test query against a known repo to catch org-restricted PATs - test_error = await asyncio.to_thread(_test_pat_against_repo, synapse.github_access_token) - if test_error: - return _reject(f'PAT test query failed: {test_error}') - - identity_error = _github_identity_pin_error(uid, hotkey, github_id) - if identity_error: - return _reject(identity_error) - - # 5. Store PAT (github_id guaranteed non-None after validate_github_credentials success) - pat_storage.save_pat(uid=uid, hotkey=hotkey, pat=synapse.github_access_token, github_id=github_id or '0') + # Steps 3-5 read/write the PAT store. If it is momentarily unreadable, fail + # closed: reject so the miner retries, rather than letting save_pat overwrite + # (and wipe) the store or surfacing a raw axon error. + try: + # 3. Enforce GitHub identity pinning — same hotkey cannot switch GitHub accounts + identity_error = _github_identity_pin_error(uid, hotkey, github_id) + if identity_error: + return _reject(identity_error) + + # 4. Test query against a known repo to catch org-restricted PATs + test_error = await asyncio.to_thread(_test_pat_against_repo, synapse.github_access_token) + if test_error: + return _reject(f'PAT test query failed: {test_error}') + + identity_error = _github_identity_pin_error(uid, hotkey, github_id) + if identity_error: + return _reject(identity_error) + + # 5. Store PAT (github_id guaranteed non-None after validate_github_credentials success) + pat_storage.save_pat(uid=uid, hotkey=hotkey, pat=synapse.github_access_token, github_id=github_id or '0') + except (json.JSONDecodeError, OSError) as e: + return _reject(f'Validator PAT store temporarily unavailable; please retry. ({e})') # Clear PAT from response so it isn't echoed back synapse.github_access_token = '' diff --git a/gittensor/validator/pat_storage.py b/gittensor/validator/pat_storage.py index 37b20d39..aab94a37 100644 --- a/gittensor/validator/pat_storage.py +++ b/gittensor/validator/pat_storage.py @@ -15,6 +15,8 @@ from pathlib import Path from typing import Optional +import bittensor as bt + PATS_FILE = Path(__file__).resolve().parents[2] / 'data' / 'miner_pats.json' _lock = threading.Lock() @@ -28,15 +30,33 @@ def ensure_pats_file() -> None: def load_all_pats() -> list[dict]: - """Read all stored PAT entries. Returns empty list if file is missing or corrupt.""" + """Snapshot all stored PAT entries for a scoring round. + + Read-only and deliberately tolerant: an unreadable store here must not crash + the round (an unhandled error would stop the validator) nor wipe anything. It + logs loudly and returns [] so the round recovers on the next successful read. + The *write* path (save_pat) is the one that fails closed. + """ with _lock: - return _read_file() + try: + return _read_file() + except (json.JSONDecodeError, OSError) as e: + bt.logging.error( + f'miner_pats.json unreadable this round; scoring with no stored PATs until it recovers: {e}' + ) + return [] def save_pat(uid: int, hotkey: str, pat: str, github_id: str) -> None: - """Upsert a PAT entry by UID. Creates the file if needed.""" + """Upsert a PAT entry by UID, failing closed on an unreadable store. + + If the existing store cannot be read (corrupt file or a transient I/O error), + this raises *without writing*, so a single failed read can never erase every + other miner's stored PAT (the read-then-overwrite wipe). The broadcast handler + catches this and rejects the broadcast, so the miner retries and nothing is lost. + """ with _lock: - entries = _read_file() + entries = _read_file() # fail closed: a failed read raises here, before any write entry = { 'uid': uid, @@ -57,7 +77,11 @@ def save_pat(uid: int, hotkey: str, pat: str, github_id: str) -> None: def get_pat_by_uid(uid: int) -> Optional[dict]: - """Look up a single PAT entry by UID. Returns None if not found.""" + """Look up a single PAT entry by UID. Returns None if not found. + + Propagates (does not swallow) a read error, so callers never mistake an + unreadable store for 'no PAT stored for this miner'. + """ with _lock: for entry in _read_file(): if entry.get('uid') == uid: @@ -66,13 +90,16 @@ def get_pat_by_uid(uid: int) -> Optional[dict]: def _read_file() -> list[dict]: - """Read and parse the JSON file. Must be called while holding _lock.""" + """Read and parse the JSON store. Must be called while holding _lock. + + Returns [] only when the file genuinely does not exist. Raises + (json.JSONDecodeError / OSError) on a corrupt or unreadable file so the write + path never mistakes a failed read for an empty store and overwrites it. The + read paths that must tolerate a transient failure (load_all_pats) catch it. + """ if not PATS_FILE.exists(): return [] - try: - return json.loads(PATS_FILE.read_text()) - except (json.JSONDecodeError, OSError): - return [] + return json.loads(PATS_FILE.read_text()) def _write_file(entries: list[dict]) -> None: diff --git a/tests/validator/test_pat_handler.py b/tests/validator/test_pat_handler.py index 0d61fffb..fb37acd1 100644 --- a/tests/validator/test_pat_handler.py +++ b/tests/validator/test_pat_handler.py @@ -191,6 +191,24 @@ def test_unregistered_hotkey_rejected(self, mock_validator): assert result.accepted is False assert 'not registered' in (result.rejection_reason or '') + @patch('gittensor.validator.pat_handler._test_pat_against_repo', return_value=None) + @patch('gittensor.validator.pat_handler.validate_github_credentials', return_value=('github_42', None)) + def test_unreadable_store_rejected_not_wiped( + self, mock_validate, mock_test_query, mock_validator, use_tmp_pats_file + ): + """A momentarily unreadable store fails closed: the broadcast is rejected (so + the miner retries) instead of overwriting the store or raising a raw error.""" + pat_storage.save_pat(0, 'hotkey_0', 'ghp_existing', 'github_7') + use_tmp_pats_file.write_text('not json{{{') + + synapse = _make_broadcast_synapse('hotkey_1', pat='ghp_valid') + result = _run(handle_pat_broadcast(mock_validator, synapse)) + + assert result.accepted is False + assert 'temporarily unavailable' in (result.rejection_reason or '') + # The store was not overwritten down to the single incoming entry. + assert use_tmp_pats_file.read_text() == 'not json{{{' + @patch('gittensor.validator.pat_handler.validate_github_credentials', return_value=(None, 'PAT invalid')) def test_invalid_pat_rejected(self, mock_validate, mock_validator): synapse = _make_broadcast_synapse('hotkey_1', pat='ghp_bad') diff --git a/tests/validator/test_pat_storage.py b/tests/validator/test_pat_storage.py index a5ed494f..13a786bd 100644 --- a/tests/validator/test_pat_storage.py +++ b/tests/validator/test_pat_storage.py @@ -4,6 +4,7 @@ import json import threading +from pathlib import Path import pytest @@ -90,6 +91,50 @@ def test_load_handles_corrupt_file(self, use_tmp_pats_file): assert entries == [] +class TestSavePatFailsClosed: + """The read-then-overwrite wipe (issue #1481, Proof 2): a single failed read of + miner_pats.json must never erase every other miner's stored PAT on the next save.""" + + def test_transient_read_error_does_not_wipe(self, use_tmp_pats_file, monkeypatch): + """A momentary I/O error on read leaves the on-disk store fully intact.""" + pat_storage.save_pat(10, 'h10', 'p10', 'u10') + pat_storage.save_pat(20, 'h20', 'p20', 'u20') + pat_storage.save_pat(30, 'h30', 'p30', 'u30') + good_bytes = use_tmp_pats_file.read_text() + + real_read_text = Path.read_text + failing = {'on': False} + + def flaky_read_text(self, *args, **kwargs): + if failing['on'] and self == use_tmp_pats_file: + raise OSError('transient I/O error') + return real_read_text(self, *args, **kwargs) + + monkeypatch.setattr(Path, 'read_text', flaky_read_text) + + # The save must fail closed (raise) instead of overwriting the store. + failing['on'] = True + with pytest.raises(OSError): + pat_storage.save_pat(99, 'h99', 'p99', 'u99') + failing['on'] = False + + # Read recovers: every original entry survives; UID 99 was never written. + assert use_tmp_pats_file.read_text() == good_bytes + assert {e['uid'] for e in pat_storage.load_all_pats()} == {10, 20, 30} + + def test_corrupt_store_save_does_not_shrink(self, use_tmp_pats_file): + """A corrupt store is left as-is, not silently overwritten down to one entry.""" + pat_storage.save_pat(10, 'h10', 'p10', 'u10') + pat_storage.save_pat(20, 'h20', 'p20', 'u20') + + use_tmp_pats_file.write_text('not json{{{') + + with pytest.raises(json.JSONDecodeError): + pat_storage.save_pat(99, 'h99', 'p99', 'u99') + + assert use_tmp_pats_file.read_text() == 'not json{{{' + + class TestGetPatByUid: def test_get_existing(self): pat_storage.save_pat(1, 'hotkey_1', 'ghp_abc', 'user_1')