Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions kinoml/core/kinase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
Defines the Kinase class

[WIP]
"""


class Kinase:
    """Plain record holding the identifiers, pocket annotation and features of one kinase structure."""

    def __init__(
        self,
        pdb,
        chain,
        kinase_id,
        name,
        struct_id,
        ligand,
        pocket_seq,
        numbering,
        key_res,
        dihedrals,
        distances,
        mean_dist,
    ):
        """Store every argument, unchanged, as an attribute of the same name.

        Parameters
        ----------
        pdb: str
            The PDB code of the structure.
        chain: str
            The chain index of the structure.
        kinase_id: int
            The standard ID of a kinase enforced by the KLIFS database.
        name: str
            The standard name of the kinase used by the KLIFS database.
        struct_id: int
            The ID associated with a specific chain in the pdb structure of a kinase.
        ligand: str
            The ligand name as it appears in the pdb file.
        pocket_seq: str
            The 85 discontinuous residues (from multi-sequence alignment) that define the
            binding pocket of a kinase.
        numbering: list of int
            The residue indices of the 85 pocket residues specific to the structure.
        key_res: list of int
            A list of residue indices that are relevant to the collective variables.
        dihedrals: list of floats
            A list (one frame) or lists (multiple frames) of dihedrals relevant to kinase
            conformation.
        distances: list of floats
            A list (one frame) or lists (multiple frames) of intramolecular distances
            relevant to kinase conformation.
        mean_dist: float
            A float (one frame) or a list of floats (multiple frames), which is the mean
            pairwise distance between ligand heavy atoms and the CAs of the 85 pocket
            residues.

        .. todo ::

            This is WAY too many positional arguments. Can we use kwargs instead, or somehow
            simplify the positional arguments into logical groups?
            Many of these will be optional if we want to represent aspects of a structure,
            so there's no need to make them all required.
            Also, we will likely not want to mix features (distances, dihedrals) with
            structural information directly.

        """
        # No validation or copying is performed; the groups below are only visual.

        # structure and kinase identifiers
        self.pdb, self.chain = pdb, chain
        self.kinase_id, self.name = kinase_id, name
        self.struct_id, self.ligand = struct_id, ligand

        # binding-pocket annotation
        self.pocket_seq, self.numbering, self.key_res = pocket_seq, numbering, key_res

        # conformational features
        self.dihedrals, self.distances, self.mean_dist = dihedrals, distances, mean_dist
233 changes: 233 additions & 0 deletions kinoml/features/dunbrack_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
"""
Tools to assign a structure or a trajectory of structures into
conformational clusters based on Modi and Dunbrack, 2019 (https://pubmed.ncbi.nlm.nih.gov/30867294/)
"""
from pathlib import Path
import tempfile
from typing import Union

from appdirs import user_cache_dir
import pandas as pd


def assign(dihedrals, distances):
    """Assign every frame to a conformational cluster (Modi and Dunbrack, 2019).

    Each frame is first classified by its two distances; only frames that fall
    through both distance tests are compared against the six dihedral centroids.

    Parameters
    ----------
    dihedrals: sequence of sequences of float
        One entry per frame holding seven dihedral angles in degrees, compared
        against the centroids in the order
        x_phi, x_psi, d_phi, d_psi, f_phi, f_psi, f_chi1.
        Only read for frames that reach the centroid comparison.
    distances: sequence of sequences of float
        One entry per frame holding two distances in angstroms.

    Returns
    -------
    assignment: list of int
        One cluster label per entry of ``distances``:
        7 (BABtrans) when both distances are <= 11.0,
        6 (BBAminus) when the first is > 11.0 and the second is < 14.0,
        otherwise the index (0 - 5) of the closest DFGin centroid.
        If no centroid compares as closer than the initial bound, which happens
        when the dihedrals are NaN, the label falls back to 0.
    """
    from math import cos, pi

    # Centroid dihedrals in degrees for clusters 0 - 5 (row index = cluster),
    # columns ordered as (x_phi, x_psi, d_phi, d_psi, f_phi, f_psi, f_chi1).
    centroids = (
        (-129.0, 179.0, 61.0, 81.0, -97.0, 20.0, -71.0),
        (-119.0, 168.0, 59.0, 34.0, -89.0, -8.0, 56.0),
        (-112.0, -8.0, -141.0, 148.0, -128.0, 23.0, -64.0),
        (-135.0, 175.0, 60.0, 65.0, -79.0, 145.0, -73.0),
        (-125.0, 172.0, 60.0, 33.0, -85.0, 145.0, 49.0),
        (-106.0, 157.0, 69.0, 21.0, -62.0, 134.0, -145.0),
    )

    assignment = []
    for i, frame_distances in enumerate(distances):
        # level 1: define the DFG position from the two distances (angstroms)
        if frame_distances[0] <= 11.0 and frame_distances[1] <= 11.0:
            # can only be BABtrans
            assignment.append(7)
            continue
        if frame_distances[0] > 11.0 and frame_distances[1] < 14.0:
            # can only be BBAminus
            assignment.append(6)
            continue

        # level 2: DFGin, pick the closest of clusters 0 - 5
        frame_dihedrals = dihedrals[i]
        mindist = 10000.0
        # Reset for every frame and used under a single name, so a frame whose
        # distances are all NaN can neither raise UnboundLocalError nor inherit
        # the previous frame's cluster.
        cluster_assign = 0
        for cluster, centroid in enumerate(centroids):
            # mean over the seven dihedrals of 2 * (1 - cos(delta)), which is
            # insensitive to the +/-180 degree wrap-around
            total = 0.0
            for k, center in enumerate(centroid):
                total += 2.0 * (1.0 - cos((frame_dihedrals[k] - center) * pi / 180.0))
            total_dist = float(total) / 7
            if total_dist < mindist:
                mindist = total_dist
                cluster_assign = cluster
        assignment.append(cluster_assign)
    return assignment


class PDBDunbrack:
    """Locally cached table of KLIFS structures annotated with their Dunbrack cluster.

    The table lives in the CSV file ``_PDB_DUNBRACK_LIBRARY`` and is brought up to
    date (see ``update``) every time the class is instantiated.
    """

    _PDB_DUNBRACK_LIBRARY = Path(f"{user_cache_dir()}/pdb_dunbrack_library.csv")

    def __init__(self):
        # NOTE: instantiation triggers a full update, including remote KLIFS queries.
        self.pdb_dunbrack_library = self.update()

    def __repr__(self):
        return f"<PDBDunbrack library located at {self._PDB_DUNBRACK_LIBRARY} contains {self.pdb_dunbrack_library.shape[0]} structures.>"

    def update(self, reinitialize: bool = False) -> pd.DataFrame:
        """
        Update DataFrame holding information about kinases from the KLIFS database and the
        corresponding Dunbrack cluster.

        Only structures whose ``structure_ID`` is not yet in the cached CSV are processed.
        The CSV is rewritten after every 10th newly processed structure and once more at
        the end, so an interrupted run can be resumed.

        Parameters
        ----------
        reinitialize: bool
            If the DataFrame should be built from scratch.

        Returns
        -------
        pdb_dunbrack_library: pd.DataFrame
            DataFrame holding information about kinases from KLIFS and the corresponding
            Dunbrack cluster. The cluster is None for structures whose assignment failed.
        """
        from .klifs import query_klifs_database
        import klifs_utils
        import MDAnalysis as mda
        from .protein_struct_features import key_klifs_residues, compute_simple_protein_features
        from tqdm import tqdm

        # get available kinase information from KLIFS
        klifs_kinase_ids = klifs_utils.remote.kinases.kinase_names().kinase_ID.to_list()
        klifs_kinase_df = klifs_utils.remote.structures.structures_from_kinase_ids(
            klifs_kinase_ids
        )

        # the cache directory is not guaranteed to exist before the first write
        self._PDB_DUNBRACK_LIBRARY.parent.mkdir(parents=True, exist_ok=True)

        # initialize library
        if not self._PDB_DUNBRACK_LIBRARY.is_file() or reinitialize is True:
            columns = list(klifs_kinase_df.columns) + ["dunbrack_cluster"]
            pd.DataFrame(columns=columns).to_csv(self._PDB_DUNBRACK_LIBRARY, index=False)

        pdb_dunbrack_library = pd.read_csv(self._PDB_DUNBRACK_LIBRARY)

        # built once so the membership test below is O(1) per structure
        known_structure_ids = set(pdb_dunbrack_library["structure_ID"])

        counter = 0
        for _, row in tqdm(klifs_kinase_df.iterrows(), total=klifs_kinase_df.shape[0]):
            structure_id = row["structure_ID"]
            if structure_id in known_structure_ids:
                continue
            counter += 1
            try:  # assign dunbrack cluster
                with tempfile.NamedTemporaryFile(suffix=".pdb", mode="w+t") as temp_file:
                    pdb_text = klifs_utils.remote.coordinates.complex._complex_pdb_text(
                        structure_id
                    )
                    temp_file.write(pdb_text)
                    # the file is re-opened by name below, so buffered text must hit the disk
                    temp_file.flush()
                    u = mda.Universe(temp_file.name)
                    klifs = query_klifs_database(row["pdb"], row["chain"])
                    key_res = key_klifs_residues(klifs["numbering"])
                    dihedrals, distances = compute_simple_protein_features(u, key_res)
                assignment = assign(dihedrals, distances)[0]
            except Exception:
                # best effort: any processing error marks the structure as unassigned;
                # KeyboardInterrupt/SystemExit still propagate so a run can be paused
                assignment = None
            row["dunbrack_cluster"] = assignment
            # DataFrame.append was removed in pandas 2.0; concat the row as a one-row frame
            pdb_dunbrack_library = pd.concat(
                [pdb_dunbrack_library, row.to_frame().T.infer_objects()],
                ignore_index=True,
            )
            known_structure_ids.add(structure_id)
            if counter % 10 == 0:  # save every 10th structure, so one can pause in between
                pdb_dunbrack_library.to_csv(self._PDB_DUNBRACK_LIBRARY, index=False)

        pdb_dunbrack_library.to_csv(self._PDB_DUNBRACK_LIBRARY, index=False)
        return pdb_dunbrack_library

    def structures_by_cluster(self, cluster_id: Union[int, None]) -> pd.DataFrame:
        """
        Get KLIFS kinase information of structures matching the given Dunbrack cluster ID.

        Parameters
        ----------
        cluster_id: int or None
            Dunbrack cluster ID of interest. None for structures that were not assigned to
            any cluster.

        Returns
        -------
        structures: pd.DataFrame
            KLIFS kinase information about matching structures.
        """
        clusters = self.pdb_dunbrack_library["dunbrack_cluster"]
        # unassigned structures are stored as missing values, which never compare equal
        mask = clusters.isnull() if cluster_id is None else clusters == cluster_id
        return self.pdb_dunbrack_library[mask]
74 changes: 74 additions & 0 deletions kinoml/features/kinase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
kinase.py
Defines the Kinase class

"""


class Kinase:
    """Simple data holder describing one kinase structure together with its computed features."""

    def __init__(
        self,
        pdb,
        chain,
        kinase_id,
        name,
        struct_id,
        ligand,
        pocket_seq,
        numbering,
        key_res,
        dihedrals,
        distances,
        mean_dist,
    ):
        """Keep each argument as-is on the instance, under the attribute of the same name.

        Parameters
        ----------
        pdb: str
            The PDB code of the structure.
        chain: str
            The chain index of the structure.
        kinase_id: int
            The standard ID of a kinase enforced by the KLIFS database.
        name: str
            The standard name of the kinase used by the KLIFS database.
        struct_id: int
            The ID associated with a specific chain in the pdb structure of a kinase.
        ligand: str
            The ligand name as it appears in the pdb file.
        pocket_seq: str
            The 85 discontinuous residues (from multi-sequence alignment) that define the
            binding pocket of a kinase.
        numbering: list of int
            The residue indices of the 85 pocket residues specific to the structure.
        key_res: list of int
            A list of residue indices that are relevant to the collective variables.
        dihedrals: list of floats
            A list (one frame) or lists (multiple frames) of dihedrals relevant to kinase
            conformation.
        distances: list of floats
            A list (one frame) or lists (multiple frames) of intramolecular distances
            relevant to kinase conformation.
        mean_dist: float
            A float (one frame) or a list of floats (multiple frames), which is the mean
            pairwise distance between ligand heavy atoms and the CAs of the 85 pocket
            residues.

        .. todo ::

            This is WAY too many positional arguments. Can we use kwargs instead, or somehow
            simplify the positional arguments into logical groups?
            Many of these will be optional if we want to represent aspects of a structure,
            so there's no need to make them all required.
            Also, we will likely not want to mix features (distances, dihedrals) with
            structural information directly.

        """
        # Values are stored by reference; nothing is validated or converted.

        # where the structure comes from
        self.pdb, self.chain = pdb, chain
        self.kinase_id, self.name = kinase_id, name
        self.struct_id, self.ligand = struct_id, ligand

        # pocket sequence and residue bookkeeping
        self.pocket_seq, self.numbering, self.key_res = pocket_seq, numbering, key_res

        # features derived from the structure
        self.dihedrals, self.distances, self.mean_dist = dihedrals, distances, mean_dist
Loading