Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mhctools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def __getattr__(name):
raise AttributeError(
"module %r has no attribute %r" % (__name__, name))

__version__ = "3.13.2"
__version__ = "3.13.3"

__all__ = [
"Prediction",
Expand Down
142 changes: 139 additions & 3 deletions mhctools/pepsickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import subprocess
import sys

from .proteasome_predictor import ProteasomePredictor

# Module-level cache for loaded pepsickle models. Keyed by human_only.
_model_cache = {}

logger = logging.getLogger(__name__)

PEPSICKLE_SUBPROCESS_TIMEOUT_SECONDS = 300

_PEPSICKLE_SUBPROCESS_SCRIPT = r"""
import json
import sys

from pepsickle.model_functions import (
initialize_epitope_model,
predict_protein_cleavage_locations,
)

request = json.loads(sys.stdin.read())
model = initialize_epitope_model(human_only=request["human_only"])
results = {}
for sequence in request["sequences"]:
preds_raw = predict_protein_cleavage_locations(
sequence,
model,
mod_type="epitope",
proteasome_type="C",
threshold=request["threshold"],
)
results[sequence] = [entry[2] for entry in preds_raw]
json.dump({"results": results}, sys.stdout)
"""


class Pepsickle(ProteasomePredictor):
"""
Expand All @@ -39,27 +72,41 @@ class Pepsickle(ProteasomePredictor):

human_only : bool
If True, use human-only trained model instead of all-mammal.

isolate_subprocess : bool
If True, run pepsickle inference in a short-lived Python subprocess.
This avoids macOS duplicate OpenMP runtime crashes when the parent
process has already imported packages such as pandas, numpy, or
pyarrow.

subprocess_timeout : int
Timeout in seconds for isolated pepsickle inference.
"""

def __init__(
self,
default_peptide_lengths=None,
scoring=None,
threshold=0.5,
human_only=False):
human_only=False,
isolate_subprocess=False,
subprocess_timeout=PEPSICKLE_SUBPROCESS_TIMEOUT_SECONDS):
ProteasomePredictor.__init__(
self,
default_peptide_lengths=default_peptide_lengths,
scoring=scoring,
)
self.threshold = threshold
self.human_only = human_only
self.isolate_subprocess = isolate_subprocess
self.subprocess_timeout = subprocess_timeout
self._model = None

def __str__(self):
return "%s(scoring=%s)" % (
return "%s(scoring=%s, isolate_subprocess=%s)" % (
self.__class__.__name__,
getattr(self.scoring, "__name__", repr(self.scoring)))
getattr(self.scoring, "__name__", repr(self.scoring)),
self.isolate_subprocess)

def _predictor_name(self):
return "pepsickle"
Expand All @@ -75,6 +122,20 @@ def _load_model(self):
return self._model

def cleavage_probs(self, sequence):
return self.cleavage_probs_many([sequence])[sequence]

def cleavage_probs_many(self, sequences):
unique_sequences = list(dict.fromkeys(sequences))
if not unique_sequences:
return {}
if self.isolate_subprocess:
return self._cleavage_probs_many_subprocess(unique_sequences)
return {
sequence: self._cleavage_probs_in_process(sequence)
for sequence in unique_sequences
}

def _cleavage_probs_in_process(self, sequence):
from pepsickle.model_functions import predict_protein_cleavage_locations
model = self._load_model()
preds_raw = predict_protein_cleavage_locations(
Expand All @@ -85,3 +146,78 @@ def cleavage_probs(self, sequence):
threshold=self.threshold,
)
return [entry[2] for entry in preds_raw]

def _cleavage_probs_many_subprocess(self, sequences):
payload = json.dumps({
"human_only": bool(self.human_only),
"threshold": float(self.threshold),
"sequences": sequences,
})
try:
result = subprocess.run(
[sys.executable, "-c", _PEPSICKLE_SUBPROCESS_SCRIPT],
input=payload,
text=True,
capture_output=True,
timeout=self.subprocess_timeout,
)
except subprocess.TimeoutExpired as e:
msg = (
"pepsickle subprocess timed out after %d seconds "
"while scoring %d sequences"
% (self.subprocess_timeout, len(sequences)))
logger.warning(msg)
raise RuntimeError(msg) from e
except OSError as e:
msg = "Could not start pepsickle subprocess: %s" % e
logger.warning(msg)
raise RuntimeError(msg) from e

stderr_text = result.stderr.strip()
if stderr_text:
logger.warning("pepsickle subprocess stderr:\n%s", stderr_text)
if result.returncode != 0:
logger.warning(
"pepsickle subprocess exited with code %d",
result.returncode)
raise RuntimeError(
"pepsickle subprocess exited with code %d.\nstdout: %s\n"
"stderr: %s"
% (result.returncode, result.stdout.strip(), stderr_text))

try:
parsed = json.loads(result.stdout)
except ValueError as e:
logger.warning("Could not parse pepsickle subprocess JSON output")
raise RuntimeError(
"Could not parse pepsickle subprocess JSON output: %s"
% result.stdout.strip()) from e

results = parsed.get("results")
if not isinstance(results, dict):
logger.warning(
"pepsickle subprocess output is missing a results object")
raise RuntimeError(
"pepsickle subprocess output is missing a results object")
missing = [sequence for sequence in sequences if sequence not in results]
if missing:
logger.warning(
"pepsickle subprocess omitted %d sequences",
len(missing))
raise RuntimeError(
"pepsickle subprocess omitted %d sequences" % len(missing))

output = {}
for sequence in sequences:
probs = results[sequence]
if len(probs) != len(sequence):
logger.warning(
"pepsickle subprocess returned %d scores for a "
"%d-residue sequence",
len(probs),
len(sequence))
raise ValueError(
"Expected %d pepsickle scores for sequence, got %d"
% (len(sequence), len(probs)))
output[sequence] = [float(p) for p in probs]
return output
29 changes: 24 additions & 5 deletions mhctools/processing_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,18 @@ def cleavage_probs(self, sequence):
raise NotImplementedError(
"%s must implement cleavage_probs" % self.__class__.__name__)

def cleavage_probs_many(self, sequences):
"""
Return per-position cleavage probabilities for multiple sequences.

Subclasses can override this to batch expensive setup. The default
implementation preserves the existing one-sequence-at-a-time behavior.
"""
return {
sequence: self.cleavage_probs(sequence)
for sequence in dict.fromkeys(sequences)
}

# ------------------------------------------------------------------
# Component helpers
# ------------------------------------------------------------------
Expand Down Expand Up @@ -273,13 +285,18 @@ def predict(self, peptides, n_flanks=None, c_flanks=None):
-------
list of PeptideResult
"""
results = []
prediction_inputs = []
for i, peptide in enumerate(peptides):
n_flank = n_flanks[i] if n_flanks else ""
c_flank = c_flanks[i] if c_flanks else ""

full_seq = n_flank + peptide + c_flank
probs = self.cleavage_probs(full_seq)
prediction_inputs.append((peptide, n_flank, c_flank, full_seq))
full_sequences = [item[3] for item in prediction_inputs]
probs_by_sequence = self.cleavage_probs_many(full_sequences)

results = []
for peptide, n_flank, c_flank, full_seq in prediction_inputs:
probs = probs_by_sequence[full_seq]

offset = len(n_flank)
score = self._peptide_score(probs, offset, len(peptide))
Expand Down Expand Up @@ -336,10 +353,11 @@ def predict_proteins(self, sequence_dict, peptide_lengths=None,
sequence_dict = {seq: seq for seq in sequence_dict}

peptide_lengths = self._resolve_peptide_lengths(peptide_lengths)
probs_by_sequence = self.cleavage_probs_many(sequence_dict.values())

results = defaultdict(list)
for name, sequence in sequence_dict.items():
probs = self.cleavage_probs(sequence)
probs = probs_by_sequence[sequence]
for plen in peptide_lengths:
for i in range(len(sequence) - plen + 1):
peptide = sequence[i:i + plen]
Expand Down Expand Up @@ -388,8 +406,9 @@ def predict_cleavage_sites(self, sequence_dict):
"""
if isinstance(sequence_dict, str):
sequence_dict = {"seq": sequence_dict}
probs_by_sequence = self.cleavage_probs_many(sequence_dict.values())
return {
name: self.cleavage_probs(seq)
name: probs_by_sequence[seq]
for name, seq in sequence_dict.items()
}

Expand Down
56 changes: 55 additions & 1 deletion tests/test_pepsickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import subprocess

import pytest

from mhctools.pepsickle import Pepsickle
Expand Down Expand Up @@ -39,6 +42,7 @@ def test_init_defaults():
p = Pepsickle()
assert p.threshold == 0.5
assert p.human_only is False
assert p.isolate_subprocess is False
assert p.default_peptide_lengths == [9]
assert p.scoring is score_cterm_anti_max_internal
assert not hasattr(p, "alleles")
Expand Down Expand Up @@ -71,6 +75,57 @@ def test_cleavage_probs(predictor):
assert all(0.0 <= p <= 1.0 for p in probs)


def test_isolated_cleavage_probs_matches_in_process():
in_process = Pepsickle().cleavage_probs(PROTEIN)
isolated = Pepsickle(isolate_subprocess=True).cleavage_probs(PROTEIN)
assert isolated == pytest.approx(in_process)


def test_isolated_cleavage_probs_batches_unique_sequences(monkeypatch):
calls = []

def fake_run(args, input, text, capture_output, timeout):
calls.append({
"args": args,
"request": json.loads(input),
"text": text,
"capture_output": capture_output,
"timeout": timeout,
})
results = {
sequence: [0.25] * len(sequence)
for sequence in calls[-1]["request"]["sequences"]
}
return subprocess.CompletedProcess(
args=args,
returncode=0,
stdout=json.dumps({"results": results}),
stderr="",
)

monkeypatch.setattr("mhctools.pepsickle.subprocess.run", fake_run)

predictor = Pepsickle(
threshold=0.25,
human_only=True,
isolate_subprocess=True,
subprocess_timeout=7,
)
result = predictor.cleavage_probs_many([PROTEIN, PROTEIN, "AC"])

assert result[PROTEIN] == [0.25] * len(PROTEIN)
assert result["AC"] == [0.25, 0.25]
assert len(calls) == 1
assert calls[0]["request"] == {
"human_only": True,
"threshold": 0.25,
"sequences": [PROTEIN, "AC"],
}
assert calls[0]["text"] is True
assert calls[0]["capture_output"] is True
assert calls[0]["timeout"] == 7


# -- predict --

def test_predict_returns_peptide_preds(predictor):
Expand Down Expand Up @@ -191,4 +246,3 @@ def test_scoring_methods_produce_different_scores():
tuple(round(s, 6) for s in v) for v in scores_by_fn.values())
assert len(unique) > 1


Loading
Loading