diff --git a/mhctools/__init__.py b/mhctools/__init__.py index 9589544..df5ab32 100644 --- a/mhctools/__init__.py +++ b/mhctools/__init__.py @@ -63,7 +63,7 @@ def __getattr__(name): raise AttributeError( "module %r has no attribute %r" % (__name__, name)) -__version__ = "3.13.2" +__version__ = "3.13.3" __all__ = [ "Prediction", diff --git a/mhctools/pepsickle.py b/mhctools/pepsickle.py index b7e846e..79086ae 100644 --- a/mhctools/pepsickle.py +++ b/mhctools/pepsickle.py @@ -10,11 +10,44 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import logging +import subprocess +import sys + from .proteasome_predictor import ProteasomePredictor # Module-level cache for loaded pepsickle models. Keyed by human_only. _model_cache = {} +logger = logging.getLogger(__name__) + +PEPSICKLE_SUBPROCESS_TIMEOUT_SECONDS = 300 + +_PEPSICKLE_SUBPROCESS_SCRIPT = r""" +import json +import sys + +from pepsickle.model_functions import ( + initialize_epitope_model, + predict_protein_cleavage_locations, +) + +request = json.loads(sys.stdin.read()) +model = initialize_epitope_model(human_only=request["human_only"]) +results = {} +for sequence in request["sequences"]: + preds_raw = predict_protein_cleavage_locations( + sequence, + model, + mod_type="epitope", + proteasome_type="C", + threshold=request["threshold"], + ) + results[sequence] = [entry[2] for entry in preds_raw] +json.dump({"results": results}, sys.stdout) +""" + class Pepsickle(ProteasomePredictor): """ @@ -39,6 +72,15 @@ class Pepsickle(ProteasomePredictor): human_only : bool If True, use human-only trained model instead of all-mammal. + + isolate_subprocess : bool + If True, run pepsickle inference in a short-lived Python subprocess. + This avoids macOS duplicate OpenMP runtime crashes when the parent + process has already imported packages such as pandas, numpy, or + pyarrow. + + subprocess_timeout : int + Timeout in seconds for isolated pepsickle inference. """ def __init__( @@ -46,7 +88,9 @@ def __init__( default_peptide_lengths=None, scoring=None, threshold=0.5, - human_only=False): + human_only=False, + isolate_subprocess=False, + subprocess_timeout=PEPSICKLE_SUBPROCESS_TIMEOUT_SECONDS): ProteasomePredictor.__init__( self, default_peptide_lengths=default_peptide_lengths, @@ -54,12 +98,15 @@ def __init__( ) self.threshold = threshold self.human_only = human_only + self.isolate_subprocess = isolate_subprocess + self.subprocess_timeout = subprocess_timeout self._model = None def __str__(self): - return "%s(scoring=%s)" % ( + return "%s(scoring=%s, isolate_subprocess=%s)" % ( self.__class__.__name__, - getattr(self.scoring, "__name__", repr(self.scoring))) + getattr(self.scoring, "__name__", repr(self.scoring)), + self.isolate_subprocess) def _predictor_name(self): return "pepsickle" @@ -75,6 +122,20 @@ def _load_model(self): return self._model def cleavage_probs(self, sequence): + return self.cleavage_probs_many([sequence])[sequence] + + def cleavage_probs_many(self, sequences): + unique_sequences = list(dict.fromkeys(sequences)) + if not unique_sequences: + return {} + if self.isolate_subprocess: + return self._cleavage_probs_many_subprocess(unique_sequences) + return { + sequence: self._cleavage_probs_in_process(sequence) + for sequence in unique_sequences + } + + def _cleavage_probs_in_process(self, sequence): from pepsickle.model_functions import predict_protein_cleavage_locations model = self._load_model() preds_raw = predict_protein_cleavage_locations( @@ -85,3 +146,78 @@ def cleavage_probs(self, sequence): threshold=self.threshold, ) return [entry[2] for entry in preds_raw] + + def _cleavage_probs_many_subprocess(self, sequences): + payload = json.dumps({ + "human_only": bool(self.human_only), + "threshold": float(self.threshold), + "sequences": sequences, + }) + try: + result = subprocess.run( + [sys.executable, "-c", _PEPSICKLE_SUBPROCESS_SCRIPT], + input=payload, + text=True, + capture_output=True, + timeout=self.subprocess_timeout, + ) + except subprocess.TimeoutExpired as e: + msg = ( + "pepsickle subprocess timed out after %d seconds " + "while scoring %d sequences" + % (self.subprocess_timeout, len(sequences))) + logger.warning(msg) + raise RuntimeError(msg) from e + except OSError as e: + msg = "Could not start pepsickle subprocess: %s" % e + logger.warning(msg) + raise RuntimeError(msg) from e + + stderr_text = result.stderr.strip() + if stderr_text: + logger.warning("pepsickle subprocess stderr:\n%s", stderr_text) + if result.returncode != 0: + logger.warning( + "pepsickle subprocess exited with code %d", + result.returncode) + raise RuntimeError( + "pepsickle subprocess exited with code %d.\nstdout: %s\n" + "stderr: %s" + % (result.returncode, result.stdout.strip(), stderr_text)) + + try: + parsed = json.loads(result.stdout) + except ValueError as e: + logger.warning("Could not parse pepsickle subprocess JSON output") + raise RuntimeError( + "Could not parse pepsickle subprocess JSON output: %s" + % result.stdout.strip()) from e + + results = parsed.get("results") + if not isinstance(results, dict): + logger.warning( + "pepsickle subprocess output is missing a results object") + raise RuntimeError( + "pepsickle subprocess output is missing a results object") + missing = [sequence for sequence in sequences if sequence not in results] + if missing: + logger.warning( + "pepsickle subprocess omitted %d sequences", + len(missing)) + raise RuntimeError( + "pepsickle subprocess omitted %d sequences" % len(missing)) + + output = {} + for sequence in sequences: + probs = results[sequence] + if len(probs) != len(sequence): + logger.warning( + "pepsickle subprocess returned %d scores for a " + "%d-residue sequence", + len(probs), + len(sequence)) + raise ValueError( + "Expected %d pepsickle scores for sequence, got %d" + % (len(sequence), len(probs))) + output[sequence] = [float(p) for p in probs] + return output diff --git a/mhctools/processing_predictor.py b/mhctools/processing_predictor.py index 52371c2..7ffb0b7 100644 --- a/mhctools/processing_predictor.py +++ b/mhctools/processing_predictor.py @@ -199,6 +199,18 @@ def cleavage_probs(self, sequence): raise NotImplementedError( "%s must implement cleavage_probs" % self.__class__.__name__) + def cleavage_probs_many(self, sequences): + """ + Return per-position cleavage probabilities for multiple sequences. + + Subclasses can override this to batch expensive setup. The default + implementation preserves the existing one-sequence-at-a-time behavior. + """ + return { + sequence: self.cleavage_probs(sequence) + for sequence in dict.fromkeys(sequences) + } + # ------------------------------------------------------------------ # Component helpers # ------------------------------------------------------------------ @@ -273,13 +285,18 @@ def predict(self, peptides, n_flanks=None, c_flanks=None): ------- list of PeptideResult """ - results = [] + prediction_inputs = [] for i, peptide in enumerate(peptides): n_flank = n_flanks[i] if n_flanks else "" c_flank = c_flanks[i] if c_flanks else "" - full_seq = n_flank + peptide + c_flank - probs = self.cleavage_probs(full_seq) + prediction_inputs.append((peptide, n_flank, c_flank, full_seq)) + full_sequences = [item[3] for item in prediction_inputs] + probs_by_sequence = self.cleavage_probs_many(full_sequences) + + results = [] + for peptide, n_flank, c_flank, full_seq in prediction_inputs: + probs = probs_by_sequence[full_seq] offset = len(n_flank) score = self._peptide_score(probs, offset, len(peptide)) @@ -336,10 +353,11 @@ def predict_proteins(self, sequence_dict, peptide_lengths=None, sequence_dict = {seq: seq for seq in sequence_dict} peptide_lengths = self._resolve_peptide_lengths(peptide_lengths) + probs_by_sequence = self.cleavage_probs_many(sequence_dict.values()) results = defaultdict(list) for name, sequence in sequence_dict.items(): - probs = self.cleavage_probs(sequence) + probs = probs_by_sequence[sequence] for plen in peptide_lengths: for i in range(len(sequence) - plen + 1): peptide = sequence[i:i + plen] @@ -388,8 +406,9 @@ def predict_cleavage_sites(self, sequence_dict): """ if isinstance(sequence_dict, str): sequence_dict = {"seq": sequence_dict} + probs_by_sequence = self.cleavage_probs_many(sequence_dict.values()) return { - name: self.cleavage_probs(seq) + name: probs_by_sequence[seq] for name, seq in sequence_dict.items() } diff --git a/tests/test_pepsickle.py b/tests/test_pepsickle.py index e19d457..08f094f 100644 --- a/tests/test_pepsickle.py +++ b/tests/test_pepsickle.py @@ -10,6 +10,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import subprocess + import pytest from mhctools.pepsickle import Pepsickle @@ -39,6 +42,7 @@ def test_init_defaults(): p = Pepsickle() assert p.threshold == 0.5 assert p.human_only is False + assert p.isolate_subprocess is False assert p.default_peptide_lengths == [9] assert p.scoring is score_cterm_anti_max_internal assert not hasattr(p, "alleles") @@ -71,6 +75,57 @@ def test_cleavage_probs(predictor): assert all(0.0 <= p <= 1.0 for p in probs) +def test_isolated_cleavage_probs_matches_in_process(): + in_process = Pepsickle().cleavage_probs(PROTEIN) + isolated = Pepsickle(isolate_subprocess=True).cleavage_probs(PROTEIN) + assert isolated == pytest.approx(in_process) + + +def test_isolated_cleavage_probs_batches_unique_sequences(monkeypatch): + calls = [] + + def fake_run(args, input, text, capture_output, timeout): + calls.append({ + "args": args, + "request": json.loads(input), + "text": text, + "capture_output": capture_output, + "timeout": timeout, + }) + results = { + sequence: [0.25] * len(sequence) + for sequence in calls[-1]["request"]["sequences"] + } + return subprocess.CompletedProcess( + args=args, + returncode=0, + stdout=json.dumps({"results": results}), + stderr="", + ) + + monkeypatch.setattr("mhctools.pepsickle.subprocess.run", fake_run) + + predictor = Pepsickle( + threshold=0.25, + human_only=True, + isolate_subprocess=True, + subprocess_timeout=7, + ) + result = predictor.cleavage_probs_many([PROTEIN, PROTEIN, "AC"]) + + assert result[PROTEIN] == [0.25] * len(PROTEIN) + assert result["AC"] == [0.25, 0.25] + assert len(calls) == 1 + assert calls[0]["request"] == { + "human_only": True, + "threshold": 0.25, + "sequences": [PROTEIN, "AC"], + } + assert calls[0]["text"] is True + assert calls[0]["capture_output"] is True + assert calls[0]["timeout"] == 7 + + # -- predict -- def test_predict_returns_peptide_preds(predictor): @@ -191,4 +246,3 @@ def test_scoring_methods_produce_different_scores(): tuple(round(s, 6) for s in v) for v in scores_by_fn.values()) assert len(unique) > 1 - diff --git a/tests/test_processing_predictor.py b/tests/test_processing_predictor.py index c4c25cd..c80b7af 100644 --- a/tests/test_processing_predictor.py +++ b/tests/test_processing_predictor.py @@ -66,6 +66,24 @@ def cleavage_probs(self, sequence): return [0.5] * len(sequence) +class BatchStubPredictor(ProcessingPredictor): + """ProcessingPredictor that only supports batched cleavage calls.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.batches = [] + + def cleavage_probs(self, sequence): + raise AssertionError("predict APIs should use cleavage_probs_many") + + def cleavage_probs_many(self, sequences): + self.batches.append(list(sequences)) + return { + sequence: [0.5] * len(sequence) + for sequence in dict.fromkeys(sequences) + } + + # A 12-residue "protein" with known cleavage probs PROTEIN = "ABCDEFGHIJKL" # 0 1 2 3 4 5 6 7 8 9 10 11 @@ -608,6 +626,19 @@ def test_score_uses_scoring_function(self): # max_internal = 0.4, score = 0.5 * (1 - 0.4) = 0.3 assert pred.score == pytest.approx(0.3) + def test_uses_cleavage_probs_many(self): + stub = BatchStubPredictor(scoring=score_cterm) + results = stub.predict(["ABCD", "EF"], c_flanks=["X", "YZ"]) + assert len(results) == 2 + assert stub.batches == [["ABCDX", "EFYZ"]] + + def test_generator_peptides(self): + stub = BatchStubPredictor(scoring=score_cterm) + peptides = (peptide for peptide in ["ABCD", "EF"]) + results = stub.predict(peptides, c_flanks=["X", "YZ"]) + assert [pp.preds[0].peptide for pp in results] == ["ABCD", "EF"] + assert stub.batches == [["ABCDX", "EFYZ"]] + # ====================================================================== # predict_dataframe() @@ -765,6 +796,13 @@ def test_flank_length_does_not_change_scores(self): for a, b in zip(scores_a, scores_b): assert a == pytest.approx(b) + def test_uses_cleavage_probs_many(self): + stub = BatchStubPredictor(default_peptide_lengths=[3]) + result = stub.predict_proteins({"a": PROTEIN, "b": PROTEIN2}) + assert len(result["a"]) == len(PROTEIN) - 2 + assert len(result["b"]) == len(PROTEIN2) - 2 + assert stub.batches == [[PROTEIN, PROTEIN2]] + # ====================================================================== # predict_proteins_dataframe() @@ -818,6 +856,13 @@ def test_multiple_sequences(self): assert result["a"] == PROBS assert result["b"] == PROBS2 + def test_uses_cleavage_probs_many(self): + stub = BatchStubPredictor() + result = stub.predict_cleavage_sites({"a": PROTEIN, "b": PROTEIN2}) + assert result["a"] == [0.5] * len(PROTEIN) + assert result["b"] == [0.5] * len(PROTEIN2) + assert stub.batches == [[PROTEIN, PROTEIN2]] + # ====================================================================== # _resolve_peptide_lengths()