From 34e409439dbe9f785884d65a88ed32f5d0f4d617 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 9 Apr 2025 14:08:08 -0500 Subject: [PATCH 01/20] saving changes --- frustratometer/classes/AWSEM.py | 300 +++++++++++++++----- frustratometer/classes/__init__.py | 2 +- frustratometer/optimization/optimization.py | 124 ++------ 3 files changed, 246 insertions(+), 180 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 770f4ad..72f6f9a 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -5,9 +5,9 @@ from .Gamma import Gamma from pydantic import BaseModel, Field, ConfigDict from pydantic.types import Path -from typing import List,Optional,Union +from typing import List,Optional,Union,Generator -__all__ = ['AWSEM'] +__all__ = ['AWSEM','AWSEMEnsemble'] class AWSEMParameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) @@ -75,73 +75,88 @@ def __init__(self, AWSEM object """ - #Set attributes - p = AWSEMParameters(**parameters) - if p.min_sequence_separation_contact is None: - p.min_sequence_separation_contact = 1 - if p.min_sequence_separation_rho is None: - p.min_sequence_separation_rho = 1 - if p.min_sequence_separation_electrostatics is None: - p.min_sequence_separation_electrostatics = 1 - - for field, value in p: + #Set parameters attributes + self.p = AWSEMParameters(**parameters) + if self.p.min_sequence_separation_contact is None: + self.p.min_sequence_separation_contact = 1 + if self.p.min_sequence_separation_rho is None: + self.p.min_sequence_separation_rho = 1 + if self.p.min_sequence_separation_electrostatics is None: + self.p.min_sequence_separation_electrostatics = 1 + for field, value in self.p: setattr(self, field, value) - - #Gamma parameters - if isinstance(p.gamma, Gamma): - gamma = p.gamma - elif isinstance(p.gamma, Path): - gamma = Gamma(p.gamma) + if isinstance(self.p.gamma, Gamma): + gamma = self.p.gamma + elif isinstance(self.p.gamma, Path): + gamma = Gamma(self.p.gamma) else: raise ValueError("Gamma parameter must be a path or a Gamma object.") - self.gamma=gamma self.burial_gamma = gamma['Burial'].T self.direct_gamma = gamma['Direct'][0] self.protein_gamma = gamma['Protein'][0] self.water_gamma = gamma['Water'][0] - self.burial_in_context=p.burial_in_context + self.burial_in_context=self.p.burial_in_context - #Structure details - self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict + # ?????? + self._decoy_fluctuation = {} # don't know what this does + self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ + + # sequence details if sequence is None: self.sequence=pdb_structure.sequence else: self.sequence=sequence + self.aa_freq = frustration.compute_aa_freq(self.sequence) + self.contact_freq = frustration.compute_contact_freq(self.sequence) + + # structure details + self.expose_indicator_functions = expose_indicator_functions + self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict + self.pdb_structure = pdb_structure + + + @property + def pdb_structure(self): + return self._pdb_structure + @pdb_structure.setter + def pdb_structure(self,pdb_structure): + # check structure + selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') + resid = selection_CB.getResindices() + N=len(resid) + if N != len(self.sequence): + raise ValueError("The pdb is incomplete. Try setting 'repair_pdb=True' when constructing the Structure object.") + self.resid = resid + self.N = N + # set structure-dependent proterties + self.pdb_structure = pdb_structure self.structure=pdb_structure.structure self.chain=pdb_structure.chain self.pdb_file=pdb_structure.pdb_file self.init_index_shift=pdb_structure.init_index_shift self.distance_matrix=pdb_structure.distance_matrix self.full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix - selection_CB = self.structure.select('name CB or (resname GLY IGL and name CA)') - - resid = selection_CB.getResindices() - self.resid=resid - self.N=len(self.resid) - assert self.N == len(self.sequence), "The pdb is incomplete. Try setting 'repair_pdb=True' when constructing the Structure object." + # reset indicator functions, energies, and potts model + self.compute_indicators_energies_potts_model() + def compute_indicators_energies_potts_model(): if self.burial_in_context==True: selected_matrix=self.full_pdb_distance_matrix else: selected_matrix=self.distance_matrix sequence_mask_rho = frustration.compute_mask(selected_matrix, maximum_contact_distance=None, - minimum_sequence_separation = p.min_sequence_separation_rho) + minimum_sequence_separation = self.p.min_sequence_separation_rho) sequence_mask_contact = frustration.compute_mask(self.distance_matrix, - maximum_contact_distance=p.distance_cutoff_contact, - minimum_sequence_separation = p.min_sequence_separation_contact) - - self._decoy_fluctuation = {} - self.minimally_frustrated_threshold=.78 - + maximum_contact_distance=self.p.distance_cutoff_contact, + minimum_sequence_separation = self.p.min_sequence_separation_contact) # Calculate rho rho = 0.25 - rho *= (1 + np.tanh(p.eta * (selected_matrix- p.r_min))) - rho *= (1 + np.tanh(p.eta * (p.r_max - selected_matrix))) + rho *= (1 + np.tanh(self.p.eta * (selected_matrix- self.p.r_min))) + rho *= (1 + np.tanh(self.p.eta * (self.p.r_max - selected_matrix))) rho *= sequence_mask_rho self.rho=rho - #Calculate sigma water rho_r = (rho).sum(axis=1) if self.full_pdb_distance_matrix.shape!=self.distance_matrix.shape: @@ -153,23 +168,21 @@ def __init__(self, rho_b = np.expand_dims(rho_r, 1) rho1 = np.expand_dims(rho_r, 0) rho2 = np.expand_dims(rho_r, 1) - sigma_water = 0.25 * (1 - np.tanh(p.eta_sigma * (rho1 - p.rho_0))) * (1 - np.tanh(p.eta_sigma * (rho2 - p.rho_0))) + sigma_water = 0.25 * (1 - np.tanh(self.p.eta_sigma * (rho1 - self.p.rho_0))) * (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0))) sigma_protein = 1 - sigma_water - #Calculate theta and indicators - theta = 0.25 * (1 + np.tanh(p.eta * (self.distance_matrix - p.r_min))) * (1 + np.tanh(p.eta * (p.r_max - self.distance_matrix))) - thetaII = 0.25 * (1 + np.tanh(p.eta * (self.distance_matrix - p.r_minII))) * (1 + np.tanh(p.eta * (p.r_maxII - self.distance_matrix))) - burial_indicator = np.tanh(p.burial_kappa * (rho_b - p.burial_ro_min)) + np.tanh(p.burial_kappa * (p.burial_ro_max - rho_b)) + theta = 0.25 * (1 + np.tanh(self.p.eta * (self.distance_matrix - self.p.r_min))) * (1 + np.tanh(self.p.eta * (self.p.r_max - self.distance_matrix))) + thetaII = 0.25 * (1 + np.tanh(self.p.eta * (self.distance_matrix - self.p.r_minII))) * (1 + np.tanh(self.p.eta * (self.p.r_maxII - self.distance_matrix))) + burial_indicator = np.tanh(self.p.burial_kappa * (rho_b - self.p.burial_ro_min)) + np.tanh(self.p.burial_kappa * (self.p.burial_ro_max - rho_b)) direct_indicator = theta[:, :, np.newaxis, np.newaxis] water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] - - if expose_indicator_functions: + # store indicators and gammas for our particular sequence as attributes + if self.expose_indicator_functions: self.indicators=[] self.indicators.append(burial_indicator[:,0]) self.indicators.append(burial_indicator[:,1]) self.indicators.append(burial_indicator[:,2]) - self.indicators.append(direct_indicator[:,:,0,0]*sequence_mask_contact) self.indicators.append(protein_indicator[:,:,0,0]*sequence_mask_contact) self.indicators.append(water_indicator[:,:,0,0]*sequence_mask_contact) @@ -194,60 +207,52 @@ def __init__(self, self.water_indicator = water_indicator self.protein_indicator = protein_indicator - + J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) - - #Burial energy - burial_energy = 0.5 * p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] - self.burial_energy = burial_energy - - #Contact energy + + # compute burial and contact energies + self.burial_energy = 0.5 * p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] direct = direct_indicator * self.direct_gamma[J_index[2], J_index[3]] water_mediated = water_indicator * self.water_gamma[J_index[2], J_index[3]] protein_mediated = protein_indicator * self.protein_gamma[J_index[2], J_index[3]] - contact_energy = p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - - # Compute electrostatics - if p.k_electrostatics!=0: - self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, p.min_sequence_separation_contact) + self.contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] + # Compute electrostatics and add to contact energy + if self.p.k_electrostatics!=0: + self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) self.distance_cutoff=None - - - electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=p.min_sequence_separation_electrostatics) + electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] - - electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / p.electrostatics_screening_length) * electrostatics_mask - electrostatics_energy = -p.k_electrostatics * (charges2[np.newaxis,np.newaxis,:,:]*electrostatics_indicator[:,:,np.newaxis,np.newaxis]) - + electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * electrostatics_mask + electrostatics_energy = -self.p.k_electrostatics * (charges2[np.newaxis,np.newaxis,:,:]*electrostatics_indicator[:,:,np.newaxis,np.newaxis]) contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) - if expose_indicator_functions: + if self.expose_indicator_functions: self.indicators.append(electrostatics_indicator) - temp_gamma=0.5 * p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] + temp_gamma=0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] temp_gamma[0,:]=0 temp_gamma[:,0]=0 self.gamma_array.append(temp_gamma) else: - self.sequence_cutoff=p.min_sequence_separation_contact - self.distance_cutoff=p.distance_cutoff_contact + self.sequence_cutoff=self.p.min_sequence_separation_contact + self.distance_cutoff=self.p.distance_cutoff_contact self.mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.distance_cutoff, minimum_sequence_separation = self.sequence_cutoff) - self.contact_energy = contact_energy - # Compute fast properties - self.aa_freq = frustration.compute_aa_freq(self.sequence) - self.contact_freq = frustration.compute_contact_freq(self.sequence) + # Compute potts model self.potts_model = {} self.potts_model['h'] = burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] self.potts_model['J'] = contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] - # Set the gap energy to zero self.potts_model['h'][:, 0] = 0 self.potts_model['J'][:, :, 0, :] = 0 self.potts_model['J'][:, :, :, 0] = 0 - self._native_energy=None + self._native_energy=None # don't know what this does + + def change_conformation(alternative_pdb_structure): + # this function is an alias for the pdb_structure setter + self.pdb_structure = alternative_pdb_structure def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] @@ -366,4 +371,147 @@ def compute_configurational_energies(self): def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): mean_decoy_energy, std_decoy_energy = self.compute_configurational_decoy_statistics(n_decoys=n_decoys,aa_freq=aa_freq) - return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) \ No newline at end of file + return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) + + +class AWSEMEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer + # also, note that the functions compute_configurational_decoy_statistics, + # compute_configurational_energies, and configuration_frustration are + # present in the AWSEM class but removed here, since we don't expect to + # compute frustration on an entire ensemble + #Mapping to DCA + q = 20 + aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] #A gap has no energy + aa_map_awsem_x, aa_map_awsem_y = np.meshgrid(aa_map_awsem_list, aa_map_awsem_list, indexing='ij') + + def __init__(self, + pdb_structures: Generator[object,None,None], + **parameters)->object: + """ + Generate AWSEMEnsemble object + + Parameters + ---------- + pdb_structures : Generator[object,None,None] + yields Structure objects representing decoy structures + + Returns + ------- + AWSEMEnsemble object + """ + + #Set attributes + p = AWSEMParameters(**parameters) + if p.min_sequence_separation_contact is None: + p.min_sequence_separation_contact = 1 + if p.min_sequence_separation_rho is None: + p.min_sequence_separation_rho = 1 + if p.min_sequence_separation_electrostatics is None: + p.min_sequence_separation_electrostatics = 1 + + for field, value in p: + setattr(self, field, value) + + #Gamma parameters + if isinstance(p.gamma, Gamma): + gamma = p.gamma + elif isinstance(p.gamma, Path): + gamma = Gamma(p.gamma) + else: + raise ValueError("Gamma parameter must be a path or a Gamma object.") + + self.gamma=gamma + self.burial_gamma = gamma['Burial'].T + self.direct_gamma = gamma['Direct'][0] + self.protein_gamma = gamma['Protein'][0] + self.water_gamma = gamma['Water'][0] + self.burial_in_context=p.burial_in_context # need to be careful here--the same choice will have to apply to all structures, + # the way this code is currently written + self.indicators = [] # we're always going to expose indicator functions for this class + #self._decoy_fluctuation = {} # not sure what this does + if p.k_electrostatics!=0: + self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, p.min_sequence_separation_contact) + self.distance_cutoff=None + else: + self.sequence_cutoff=p.min_sequence_separation_contact + self.distance_cutoff=p.distance_cutoff_contact + + Ns = [] # number of residues in each structure + for pdb_structure in pdb_structures: + self.indicators.append(AWSEM(pdb_structure,expose_indicator_functions=True).indicators) + """ + #Structure details + # we can exclude most of the details present in the AWSEM class + structure=pdb_structure.structure + init_index_shift=pdb_structure.init_index_shift + distance_matrix=pdb_structure.distance_matrix + full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix + selection_CB = structure.select('name CB or (resname GLY IGL and name CA)') + + resid = selection_CB.getResindices() + N = len(resid) + Ns.append(N) # we'll check this later do make sure every structure has the same number of residues + + if self.burial_in_context==True: + selected_matrix=full_pdb_distance_matrix # use a matrix that includes extra residues to compute local density and contacts + # like the case where we're trying to design a protein that binds + # to another protein, and those residues affect the local environment even if they're + # not part of the sequence space that we're sampling + else: + selected_matrix=distance_matrix + sequence_mask_rho = frustration.compute_mask(selected_matrix, + maximum_contact_distance=None, + minimum_sequence_separation = p.min_sequence_separation_rho) + sequence_mask_contact = frustration.compute_mask(distance_matrix, + maximum_contact_distance=p.distance_cutoff_contact, + minimum_sequence_separation = p.min_sequence_separation_contact) + + # Calculate rho + rho = 0.25 + rho *= (1 + np.tanh(p.eta * (selected_matrix- p.r_min))) + rho *= (1 + np.tanh(p.eta * (p.r_max - selected_matrix))) + rho *= sequence_mask_rho + + #Calculate sigma water + rho_r = (rho).sum(axis=1) + if full_pdb_distance_matrix.shape!=distance_matrix.shape: + if self.burial_in_context==True: + init_index_shift=pdb_structure.init_index_shift + fin_index_shift=pdb_structure.fin_index_shift + rho_r=rho_r[init_index_shift:fin_index_shift] + rho_b = np.expand_dims(rho_r, 1) + rho1 = np.expand_dims(rho_r, 0) + rho2 = np.expand_dims(rho_r, 1) + sigma_water = 0.25 * (1 - np.tanh(p.eta_sigma * (rho1 - p.rho_0))) * (1 - np.tanh(p.eta_sigma * (rho2 - p.rho_0))) + sigma_protein = 1 - sigma_water + + #Calculate theta and indicators + theta = 0.25 * (1 + np.tanh(p.eta * (distance_matrix - p.r_min))) * (1 + np.tanh(p.eta * (p.r_max - distance_matrix))) + thetaII = 0.25 * (1 + np.tanh(p.eta * (distance_matrix - p.r_minII))) * (1 + np.tanh(p.eta * (p.r_maxII - distance_matrix))) + burial_indicator = np.tanh(p.burial_kappa * (rho_b - p.burial_ro_min)) + np.tanh(p.burial_kappa * (p.burial_ro_max - rho_b)) + direct_indicator = theta[:, :, np.newaxis, np.newaxis] + water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] + protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] + + self.indicators.append([]) + self.indicators[-1].append(burial_indicator[:,0]) + self.indicators[-1].append(burial_indicator[:,1]) + self.indicators[-1].append(burial_indicator[:,2]) + self.indicators[-1].append(direct_indicator[:,:,0,0]*sequence_mask_contact) + self.indicators[-1].append(protein_indicator[:,:,0,0]*sequence_mask_contact) + self.indicators[-1].append(water_indicator[:,:,0,0]*sequence_mask_contact) + + # Compute electrostatics + if p.k_electrostatics!=0: + electrostatics_mask = frustration.compute_mask(distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=p.min_sequence_separation_electrostatics) + # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] + charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) + charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] + electrostatics_indicator = 1 / (distance_matrix + 1E-6) * np.exp(-distance_matrix / p.electrostatics_screening_length) * electrostatics_mask + self.indicators[-1].append(electrostatics_indicator) + """ + + self._native_energy=None # not sure what this does + + #assert len(list(set(Ns))) == 1, f"Not all structures had the same number of residues! Numbers of residues found were {set(Ns)}" + #self.N = Ns[0] # doesn't matter which one we choose, they're all the same if we passed the assert diff --git a/frustratometer/classes/__init__.py b/frustratometer/classes/__init__.py index 6621727..70dec2d 100644 --- a/frustratometer/classes/__init__.py +++ b/frustratometer/classes/__init__.py @@ -7,7 +7,7 @@ """ from .DCA import DCA -from .AWSEM import AWSEM +from .AWSEM import AWSEM, AWSEMEnsemble from .Structure import Structure from .Map import Map from .Gamma import Gamma diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 0bdc5e5..3ade9f9 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -7,7 +7,7 @@ from frustratometer.classes import Frustratometer from frustratometer.classes import Structure -from frustratometer.classes import AWSEM +from frustratometer.classes import AWSEM, AWSEMEnsemble from frustratometer.optimization.EnergyTerm import EnergyTerm from frustratometer.optimization.inner_product import compute_all_region_means from frustratometer.optimization.inner_product import build_mean_inner_product_matrix @@ -1109,111 +1109,29 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): if __name__ == '__main__': - native_pdb = "tests/data/1r69.pdb" - - structure_bound = Structure.full_pdb(native_pdb, chain=None) - structure_free = Structure.full_pdb(native_pdb, "A") - - model_bound = AWSEM(structure_bound, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) - model_free = AWSEM(structure_free, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) - reduced_alphabet = 'ADEFHIKLMNQRSTVWY' - - print(model_bound.sequence) - print(model_free.sequence) - - # binding_region=np.array([1, 2, 3, 4, 26, 27, 28, 29, 30, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 68, 69, 70, 90, 91, 92, 93, 94, 95, 96, 97, 109, 110, 111, 112, 113, 114, 115, 116, 117, 127, 128, 129, 130, 131, 132, 133, 134, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 190, 191, 192, 193, 194, 195, 196, 197])-1 - energy_bound = AwsemEnergySelected(model_bound, alphabet=reduced_alphabet, selection=np.array(range(len(model_free.sequence)))) - energy_free = AwsemEnergySelected(model_free, alphabet=reduced_alphabet) - energy_average = AwsemEnergyAverage(model_free, alphabet=reduced_alphabet) - energy_std = AwsemEnergyStd(model_free, alphabet=reduced_alphabet) - energy_std_100 = AwsemEnergyStd(model_free, alphabet=reduced_alphabet, n_decoys=100) - energy_std_1000 = AwsemEnergyStd(model_free, alphabet=reduced_alphabet, n_decoys=1000) - energy_std_10000 = AwsemEnergyStd(model_free, alphabet=reduced_alphabet, n_decoys=10000) - energy_variance = AwsemEnergyVariance(model_free, alphabet=reduced_alphabet) + pdb_list = ["tests/data/1r69.pdb"] + pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) + ensemble = AWSEMEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) + ########################################################### + # temporary changes for testing it while it's not yet fully implemented + assert len(ensemble.indicators)==1, ensemble.indicators + #assert len(ensemble.potts_model['h'])==1, ensemble.potts_model['h'] + #assert len(ensemble.potts_model['J'])==1, ensemble.potts_model['J'] + ensemble.indicators = ensemble.indicators[0] + #ensemble.potts_model['h'] = ensemble.potts_model['h'][0] + #ensemble.potts_model['J'] = ensemble.potts_model['J'][0] + ensemble.mask = np.ones((63,63)) + ######################################################### + + reduced_alphabet = 'ADEFGHIKLMNQRSTVWY' + + awsem_energy = AwsemEnergy(ensemble, alphabet=reduced_alphabet) heterogeneity = Heterogeneity(exact=False, use_numba=True) - similarity = Similarity(model_free.sequence, use_numba=True) - - # energy_mix = energy_free - 20 * heterogeneity - energy_mix = (energy_free - energy_average) / energy_std - # energy_mix = energy_bound - energy_free - - energy_mixes = {"EnergyFree": energy_free, - "EnergyBound": energy_bound, - "Heterogeneity": heterogeneity, - "EnergyAverage": energy_average, - "EnergyStd_ndecoys100": energy_std_100, - "EnergyStd_ndecoys1000": energy_std_1000, - "EnergyStd_ndecoys10000": energy_std_10000, - "EnergyStd": energy_std, - "Zscore_ndecoys10000":(energy_free - energy_average) / energy_std_10000, - "Zscore":(energy_free - energy_average) / energy_std, - "EnergyVariance": energy_variance, - "Binding": (energy_bound - energy_free), - "Similarity": similarity, - "Ivan":energy_bound - 40 * heterogeneity, - "Takada": (energy_bound - energy_average) / energy_std, - "Ivan_binding":(energy_bound - energy_free) - 40 * heterogeneity, - "Takada_binding":(energy_free - energy_average) / energy_std + (energy_bound - energy_free), - "Ivan_Takada_binding": (energy_free - energy_average) / energy_std + (energy_bound - energy_free) - 40 * heterogeneity, - "Corrected_Takada": (energy_bound - energy_average) / (energy_std+5), - "Corrected_Takada_binding":(energy_free - energy_average) / (energy_std+5) + (energy_bound - energy_free), - "Ivan_Corrected_Takada_binding": (energy_free - energy_average) / (energy_std+5) + (energy_bound - energy_free) - 40 * heterogeneity, - "Ivan_bindidng similarity": (energy_bound - energy_free) - 40 * heterogeneity - 100*similarity, - "Corrected_Takada_binding_similarity":(energy_free - energy_average) / (energy_std+5) + (energy_bound - energy_free) - 100*similarity, - "Ivan_bindidng_similarityv2": (energy_bound - energy_free) - 40 * heterogeneity} - - for energy_name,energy_term in energy_mixes.items(): - print (f"Energy term: {energy_name}") - energy_term.benchmark(seq_indices=np.random.randint(0, len(reduced_alphabet), size=(100,len(structure_free.sequence)))) - if "ndecoys" not in energy_name: - energy_term.test(seq_index=np.random.randint(0, len(reduced_alphabet), size=len(structure_free.sequence))) - - monte_carlo = MonteCarlo(sequence = structure_free.sequence, energy=energy_term, alphabet=reduced_alphabet) - monte_carlo.benchmark_montecarlo_steps(n_repeats=3,n_steps=10000) - monte_carlo.benchmark_parallel_montecarlo_steps(n_repeats=3, n_steps=10000, n_replicas=8) - - - - # Profiling of the parallel tempering - import cProfile - import pstats - import io - monte_carlo = MonteCarlo(sequence=model_free.sequence, energy=energy_mix, alphabet=reduced_alphabet) - # evaluation_energies={"EnergyFree": energy_free, "Heterogeneity": heterogeneity, - # "EnergyAverage": energy_average, "EnergyStd": energy_std, - # "Similarity": similarity, "Zscore":(energy_free - energy_average) / energy_std}) - - monte_carlo.benchmark_montecarlo_steps(n_repeats=3, n_steps=100) - for n_replicas in [1, 2, 4, 8, 16]: - print(f"Running parallel tempering with {n_replicas} replicas") - monte_carlo.benchmark_parallel_montecarlo_steps(n_repeats=3, n_steps=100, n_replicas=n_replicas) - - for n_replicas in [1, 2, 4, 8, 16]: - print(f"Running parallel tempering with {n_replicas} replicas") - monte_carlo.benchmark_parallel_montecarlo_steps(n_repeats=3, n_steps=1000, n_replicas=n_replicas) - - monte_carlo.find_optimal_replicas(max_replicas=32, n_repeats=5, n_steps=1000) - monte_carlo.find_optimal_replicas(max_replicas=8, n_repeats=5, n_steps=10000) - monte_carlo.find_optimal_replicas(max_replicas=8, n_repeats=5, n_steps=100000) - - # # Run the profiler - # profiler = cProfile.Profile() - # profiler.enable() - - # monte_carlo.parallel_tempering(temperatures=np.logspace(3,-4,8), n_steps=1E4, n_steps_per_cycle=1E2) - # profiler.disable() - - # # Print the stats - # s = io.StringIO() - # ps = pstats.Stats(profiler, stream=s).sort_stats('cumulative') - # ps.print_stats() - # ps.dump_stats('parallel_temperingv2.prof') - - - - + energy_mix = awsem_energy - 10*heterogeneity + monte_carlo = MonteCarlo(sequence = "SISSRVKSKRIQLGLNQAELAQKVGTTQQSIEQLENGKTKRPRFLPELASALGVSVDWLLNGT", energy=energy_mix, alphabet=reduced_alphabet) + monte_carlo.annealing(n_steps=1000) From fa23e340479b01d49e51c36e44d1237eb1f13314 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 8 Jul 2025 17:16:00 -0500 Subject: [PATCH 02/20] fixed typo --- frustratometer/classes/AWSEM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 72f6f9a..15b103d 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -130,7 +130,7 @@ def pdb_structure(self,pdb_structure): self.resid = resid self.N = N # set structure-dependent proterties - self.pdb_structure = pdb_structure + self._pdb_structure = pdb_structure self.structure=pdb_structure.structure self.chain=pdb_structure.chain self.pdb_file=pdb_structure.pdb_file From fce6e82744a21d05a8afc6b9d3ba5ab75a93facb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 8 Jul 2025 17:34:39 -0500 Subject: [PATCH 03/20] added missing 'self' references --- frustratometer/classes/AWSEM.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 15b103d..b84513d 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -140,7 +140,7 @@ def pdb_structure(self,pdb_structure): # reset indicator functions, energies, and potts model self.compute_indicators_energies_potts_model() - def compute_indicators_energies_potts_model(): + def compute_indicators_energies_potts_model(self): if self.burial_in_context==True: selected_matrix=self.full_pdb_distance_matrix else: @@ -190,7 +190,7 @@ def compute_indicators_energies_potts_model(): self.gamma_array=[] temp_burial_gamma=self.burial_gamma[self.aa_map_awsem_list] temp_burial_gamma[0]=0 - temp_burial_gamma *= -0.5 * p.k_contact + temp_burial_gamma *= -0.5 * self.p.k_contact self.gamma_array.append(temp_burial_gamma[:,0]) self.gamma_array.append(temp_burial_gamma[:,1]) self.gamma_array.append(temp_burial_gamma[:,2]) @@ -212,14 +212,14 @@ def compute_indicators_energies_potts_model(): h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) # compute burial and contact energies - self.burial_energy = 0.5 * p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] + self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] direct = direct_indicator * self.direct_gamma[J_index[2], J_index[3]] water_mediated = water_indicator * self.water_gamma[J_index[2], J_index[3]] protein_mediated = protein_indicator * self.protein_gamma[J_index[2], J_index[3]] - self.contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] + contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] # Compute electrostatics and add to contact energy if self.p.k_electrostatics!=0: - self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) + self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) self.distance_cutoff=None electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] @@ -242,8 +242,8 @@ def compute_indicators_energies_potts_model(): # Compute potts model self.potts_model = {} - self.potts_model['h'] = burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] - self.potts_model['J'] = contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] + self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] + self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] # Set the gap energy to zero self.potts_model['h'][:, 0] = 0 self.potts_model['J'][:, :, 0, :] = 0 From 74839593d908ddd36ceb92a13d51adcba961f6f8 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 8 Jul 2025 20:54:21 -0500 Subject: [PATCH 04/20] refactored; basic setup of decoy ensemble is working now --- frustratometer/classes/AWSEM.py | 108 +++++--------------- frustratometer/optimization/optimization.py | 7 -- 2 files changed, 27 insertions(+), 88 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index b84513d..7c94570 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -115,7 +115,8 @@ def __init__(self, self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict self.pdb_structure = pdb_structure - + + # allows us to update coordinates only by passing in a pdb file @property def pdb_structure(self): return self._pdb_structure @@ -397,7 +398,8 @@ def __init__(self, Returns ------- - AWSEMEnsemble object + AWSEMEnsemble object, which holds a list of indicator functions for each decoy structure, + as calculated by the AWSEM class. """ #Set attributes @@ -427,7 +429,6 @@ def __init__(self, self.water_gamma = gamma['Water'][0] self.burial_in_context=p.burial_in_context # need to be careful here--the same choice will have to apply to all structures, # the way this code is currently written - self.indicators = [] # we're always going to expose indicator functions for this class #self._decoy_fluctuation = {} # not sure what this does if p.k_electrostatics!=0: self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, p.min_sequence_separation_contact) @@ -436,82 +437,27 @@ def __init__(self, self.sequence_cutoff=p.min_sequence_separation_contact self.distance_cutoff=p.distance_cutoff_contact - Ns = [] # number of residues in each structure + burial_low_density_indicators = [] + burial_medium_density_indicators = [] + burial_high_density_indicators = [] + contact_direct_indicators = [] + contact_protein_indicators = [] + contact_water_indicators = [] + electrostatics_indicators = [] for pdb_structure in pdb_structures: - self.indicators.append(AWSEM(pdb_structure,expose_indicator_functions=True).indicators) - """ - #Structure details - # we can exclude most of the details present in the AWSEM class - structure=pdb_structure.structure - init_index_shift=pdb_structure.init_index_shift - distance_matrix=pdb_structure.distance_matrix - full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix - selection_CB = structure.select('name CB or (resname GLY IGL and name CA)') - - resid = selection_CB.getResindices() - N = len(resid) - Ns.append(N) # we'll check this later do make sure every structure has the same number of residues - - if self.burial_in_context==True: - selected_matrix=full_pdb_distance_matrix # use a matrix that includes extra residues to compute local density and contacts - # like the case where we're trying to design a protein that binds - # to another protein, and those residues affect the local environment even if they're - # not part of the sequence space that we're sampling - else: - selected_matrix=distance_matrix - sequence_mask_rho = frustration.compute_mask(selected_matrix, - maximum_contact_distance=None, - minimum_sequence_separation = p.min_sequence_separation_rho) - sequence_mask_contact = frustration.compute_mask(distance_matrix, - maximum_contact_distance=p.distance_cutoff_contact, - minimum_sequence_separation = p.min_sequence_separation_contact) - - # Calculate rho - rho = 0.25 - rho *= (1 + np.tanh(p.eta * (selected_matrix- p.r_min))) - rho *= (1 + np.tanh(p.eta * (p.r_max - selected_matrix))) - rho *= sequence_mask_rho - - #Calculate sigma water - rho_r = (rho).sum(axis=1) - if full_pdb_distance_matrix.shape!=distance_matrix.shape: - if self.burial_in_context==True: - init_index_shift=pdb_structure.init_index_shift - fin_index_shift=pdb_structure.fin_index_shift - rho_r=rho_r[init_index_shift:fin_index_shift] - rho_b = np.expand_dims(rho_r, 1) - rho1 = np.expand_dims(rho_r, 0) - rho2 = np.expand_dims(rho_r, 1) - sigma_water = 0.25 * (1 - np.tanh(p.eta_sigma * (rho1 - p.rho_0))) * (1 - np.tanh(p.eta_sigma * (rho2 - p.rho_0))) - sigma_protein = 1 - sigma_water - - #Calculate theta and indicators - theta = 0.25 * (1 + np.tanh(p.eta * (distance_matrix - p.r_min))) * (1 + np.tanh(p.eta * (p.r_max - distance_matrix))) - thetaII = 0.25 * (1 + np.tanh(p.eta * (distance_matrix - p.r_minII))) * (1 + np.tanh(p.eta * (p.r_maxII - distance_matrix))) - burial_indicator = np.tanh(p.burial_kappa * (rho_b - p.burial_ro_min)) + np.tanh(p.burial_kappa * (p.burial_ro_max - rho_b)) - direct_indicator = theta[:, :, np.newaxis, np.newaxis] - water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] - protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] - - self.indicators.append([]) - self.indicators[-1].append(burial_indicator[:,0]) - self.indicators[-1].append(burial_indicator[:,1]) - self.indicators[-1].append(burial_indicator[:,2]) - self.indicators[-1].append(direct_indicator[:,:,0,0]*sequence_mask_contact) - self.indicators[-1].append(protein_indicator[:,:,0,0]*sequence_mask_contact) - self.indicators[-1].append(water_indicator[:,:,0,0]*sequence_mask_contact) - - # Compute electrostatics - if p.k_electrostatics!=0: - electrostatics_mask = frustration.compute_mask(distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=p.min_sequence_separation_electrostatics) - # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] - charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) - charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] - electrostatics_indicator = 1 / (distance_matrix + 1E-6) * np.exp(-distance_matrix / p.electrostatics_screening_length) * electrostatics_mask - self.indicators[-1].append(electrostatics_indicator) - """ - - self._native_energy=None # not sure what this does - - #assert len(list(set(Ns))) == 1, f"Not all structures had the same number of residues! Numbers of residues found were {set(Ns)}" - #self.N = Ns[0] # doesn't matter which one we choose, they're all the same if we passed the assert + all_indicators = AWSEM(pdb_structure,expose_indicator_functions=True).indicators + burial_low_density_indicators.append(all_indicators[0]) + burial_medium_density_indicators.append(all_indicators[1]) + burial_high_density_indicators.append(all_indicators[2]) + contact_direct_indicators.append(all_indicators[3]) + contact_protein_indicators.append(all_indicators[4]) + contact_water_indicators.append(all_indicators[5]) + electrostatics_indicators.append(all_indicators[6]) + # indicator function order: burial low density, burial medium density, burial high density, + # direct, protein, water, electrostatics + self.indicators = [np.array(burial_low_density_indicators), + np.array(burial_medium_density_indicators), + np.array(contact_direct_indicators), + np.array(contact_protein_indicators), + np.array(contact_water_indicators), + np.array(electrostatics_indicators),] diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 3ade9f9..2ecf44c 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -1113,13 +1113,6 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) ensemble = AWSEMEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) ########################################################### - # temporary changes for testing it while it's not yet fully implemented - assert len(ensemble.indicators)==1, ensemble.indicators - #assert len(ensemble.potts_model['h'])==1, ensemble.potts_model['h'] - #assert len(ensemble.potts_model['J'])==1, ensemble.potts_model['J'] - ensemble.indicators = ensemble.indicators[0] - #ensemble.potts_model['h'] = ensemble.potts_model['h'][0] - #ensemble.potts_model['J'] = ensemble.potts_model['J'][0] ensemble.mask = np.ones((63,63)) ######################################################### From c9715d710bbb38f4fafa9d19e18426b085ebf907 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 8 Jul 2025 20:55:26 -0500 Subject: [PATCH 05/20] changed 'AWSEMEnsemble' class name to 'DecoyEnsemble' --- frustratometer/classes/AWSEM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 7c94570..2c1c1db 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -375,7 +375,7 @@ def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) -class AWSEMEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer +class DecoyEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer # also, note that the functions compute_configurational_decoy_statistics, # compute_configurational_energies, and configuration_frustration are # present in the AWSEM class but removed here, since we don't expect to From 641dfb997aac9d0ca4a7e333a667a02031920bb7 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 9 Jul 2025 10:33:53 -0500 Subject: [PATCH 06/20] stripped DecoyEnsemble down to the bare minimum --- frustratometer/classes/AWSEM.py | 65 ++++++++---------------------- frustratometer/classes/__init__.py | 2 +- 2 files changed, 17 insertions(+), 50 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 2c1c1db..468dc96 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -7,7 +7,7 @@ from pydantic.types import Path from typing import List,Optional,Union,Generator -__all__ = ['AWSEM','AWSEMEnsemble'] +__all__ = ['AWSEM','DecoyEnsemble'] class AWSEMParameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) @@ -379,64 +379,28 @@ class DecoyEnsemble(): # don't think it's necessary for this one to inherit from # also, note that the functions compute_configurational_decoy_statistics, # compute_configurational_energies, and configuration_frustration are # present in the AWSEM class but removed here, since we don't expect to - # compute frustration on an entire ensemble - #Mapping to DCA - q = 20 - aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] #A gap has no energy - aa_map_awsem_x, aa_map_awsem_y = np.meshgrid(aa_map_awsem_list, aa_map_awsem_list, indexing='ij') - + # compute frustration on an entire ensemble def __init__(self, pdb_structures: Generator[object,None,None], **parameters)->object: """ - Generate AWSEMEnsemble object + Generate DecoyEnsemble object Parameters ---------- pdb_structures : Generator[object,None,None] yields Structure objects representing decoy structures + other parameters: + masks and cutoffs affecting the AWSEM class's indicator function calculations; + they must be the same for all structures; burial_in_context also available, but use at your own risk Returns ------- - AWSEMEnsemble object, which holds a list of indicator functions for each decoy structure, - as calculated by the AWSEM class. + DecoyEnsemble object, which holds a list of 7 numpy arrays, ordered by indicator function type: + [low density (rho), medium density (rho), high density (rho), direct, protein, water, electrostatics]. + The numpy arrays' first axes vary the structure, while the second axis and third axis, if it exists, + hold the (appropriately masked) indicator functions for each residue or pair of residues """ - - #Set attributes - p = AWSEMParameters(**parameters) - if p.min_sequence_separation_contact is None: - p.min_sequence_separation_contact = 1 - if p.min_sequence_separation_rho is None: - p.min_sequence_separation_rho = 1 - if p.min_sequence_separation_electrostatics is None: - p.min_sequence_separation_electrostatics = 1 - - for field, value in p: - setattr(self, field, value) - - #Gamma parameters - if isinstance(p.gamma, Gamma): - gamma = p.gamma - elif isinstance(p.gamma, Path): - gamma = Gamma(p.gamma) - else: - raise ValueError("Gamma parameter must be a path or a Gamma object.") - - self.gamma=gamma - self.burial_gamma = gamma['Burial'].T - self.direct_gamma = gamma['Direct'][0] - self.protein_gamma = gamma['Protein'][0] - self.water_gamma = gamma['Water'][0] - self.burial_in_context=p.burial_in_context # need to be careful here--the same choice will have to apply to all structures, - # the way this code is currently written - #self._decoy_fluctuation = {} # not sure what this does - if p.k_electrostatics!=0: - self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, p.min_sequence_separation_contact) - self.distance_cutoff=None - else: - self.sequence_cutoff=p.min_sequence_separation_contact - self.distance_cutoff=p.distance_cutoff_contact - burial_low_density_indicators = [] burial_medium_density_indicators = [] burial_high_density_indicators = [] @@ -445,7 +409,10 @@ def __init__(self, contact_water_indicators = [] electrostatics_indicators = [] for pdb_structure in pdb_structures: - all_indicators = AWSEM(pdb_structure,expose_indicator_functions=True).indicators + # the AWSEM class takes care of the indicator calculation (including masking) for us + # AWSEM normally accepts an amino acid sequence argument, but we don't need that here + # However, we do need to pass through parameters used to generate the indicator functions + all_indicators = AWSEM(pdb_structure, expose_indicator_functions=True, **parameters).indicators burial_low_density_indicators.append(all_indicators[0]) burial_medium_density_indicators.append(all_indicators[1]) burial_high_density_indicators.append(all_indicators[2]) @@ -453,8 +420,8 @@ def __init__(self, contact_protein_indicators.append(all_indicators[4]) contact_water_indicators.append(all_indicators[5]) electrostatics_indicators.append(all_indicators[6]) - # indicator function order: burial low density, burial medium density, burial high density, - # direct, protein, water, electrostatics + # the idea here is that we have different conformers of a chain of a particular length; + # if not, we'll get a ValueError when trying to initialize numpy arrays from a ragged set of lists self.indicators = [np.array(burial_low_density_indicators), np.array(burial_medium_density_indicators), np.array(contact_direct_indicators), diff --git a/frustratometer/classes/__init__.py b/frustratometer/classes/__init__.py index 70dec2d..4ff9b4b 100644 --- a/frustratometer/classes/__init__.py +++ b/frustratometer/classes/__init__.py @@ -7,7 +7,7 @@ """ from .DCA import DCA -from .AWSEM import AWSEM, AWSEMEnsemble +from .AWSEM import AWSEM, DecoyEnsemble from .Structure import Structure from .Map import Map from .Gamma import Gamma From 107d780cd9ac792c5cba2eca7b421cba064044d7 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 9 Jul 2025 20:33:16 -0500 Subject: [PATCH 07/20] updated call to Structure in test_optimization --- tests/test_optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_optimization.py b/tests/test_optimization.py index 7b939a1..2f1414b 100644 --- a/tests/test_optimization.py +++ b/tests/test_optimization.py @@ -347,7 +347,7 @@ def test_diff_mean_inner_product_1_by_1(n_elements = 10): def model(request): native_pdb = "tests/data/1bfz.pdb" distance_cutoff_contact, min_sequence_separation_contact, k_electrostatics = request.param - structure = Structure.full_pdb(native_pdb, "A") + structure = Structure(native_pdb, "A") model = AWSEM(structure, distance_cutoff_contact=distance_cutoff_contact, min_sequence_separation_contact=min_sequence_separation_contact, expose_indicator_functions=True, k_electrostatics=k_electrostatics) return model From 2bb3b832b908e12017dada49f3afcb3517660530 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 9 Jul 2025 22:20:08 -0500 Subject: [PATCH 08/20] refactored, passing most tests but failing some due to inaccurate electrostatics energy calculation --- frustratometer/classes/AWSEM.py | 371 +++++++++++++------- frustratometer/optimization/optimization.py | 173 ++++++++- 2 files changed, 405 insertions(+), 139 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 468dc96..38288db 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -47,15 +47,15 @@ class AWSEMParameters(BaseModel): k_electrostatics: float = Field(17.3636, description="Coefficient for electrostatic interactions. (kJ/mol)") electrostatics_screening_length: float = Field(10, description="Screening length for electrostatic interactions. (Angstrom)") -class AWSEM(Frustratometer): +class AWSEMBase(Frustratometer): + #Mapping to DCA q = 20 aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] #A gap has no energy aa_map_awsem_x, aa_map_awsem_y = np.meshgrid(aa_map_awsem_list, aa_map_awsem_list, indexing='ij') def __init__(self, - pdb_structure: object, - sequence: str =None, + sequence: str, expose_indicator_functions: bool=False, **parameters)->object: """ @@ -63,10 +63,8 @@ def __init__(self, Parameters ---------- - pdb_structure : object - Structure object generated by Structure class - sequence : str - The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. + sequence: str + The amino acid sequence expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. @@ -75,16 +73,26 @@ def __init__(self, AWSEM object """ - #Set parameters attributes - self.p = AWSEMParameters(**parameters) - if self.p.min_sequence_separation_contact is None: - self.p.min_sequence_separation_contact = 1 - if self.p.min_sequence_separation_rho is None: - self.p.min_sequence_separation_rho = 1 - if self.p.min_sequence_separation_electrostatics is None: - self.p.min_sequence_separation_electrostatics = 1 - for field, value in self.p: + # set sequence based on argument + self.sequence = sequence + + # set indicator function exposure based on argument + # i guess not exposing indicator functions saves memory? + self.expose_indicator_functions = expose_indicator_functions + + # parse other arguments + p = AWSEMParameters(**parameters) + if p.min_sequence_separation_contact is None: + p.min_sequence_separation_contact = 1 + if p.min_sequence_separation_rho is None: + p.min_sequence_separation_rho = 1 + if p.min_sequence_separation_electrostatics is None: + p.min_sequence_separation_electrostatics = 1 + for field, value in p: setattr(self, field, value) + self.p = p + + # set gamma if isinstance(self.p.gamma, Gamma): gamma = self.p.gamma elif isinstance(self.p.gamma, Path): @@ -96,38 +104,117 @@ def __init__(self, self.direct_gamma = gamma['Direct'][0] self.protein_gamma = gamma['Protein'][0] self.water_gamma = gamma['Water'][0] - self.burial_in_context=self.p.burial_in_context + # set other attributes + self.burial_in_context = self.p.burial_in_context + self.aa_freq = frustration.compute_aa_freq(self.sequence) + self.contact_freq = frustration.compute_contact_freq(self.sequence) + if self.p.k_electrostatics == 0: + self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) + self.distance_cutoff=None # the distance matrix isn't guaranteed to exist in all subclasses, + # but it doesn't hurt to define the distance_cutoff attribute-- + # it's just like any other parameter, such as sequence_cutoff, + # that only matters if we need to compute a mask from a distance matrix + else: + self.sequence_cutoff=self.p.min_sequence_separation_contact + self.distance_cutoff=self.p.distance_cutoff_contact # the distance matrix isn't guaranteed to exist in all subclasses, + # but it doesn't hurt to define the distance_cutoff attribute-- + # it's just like any other parameter, such as sequence_cutoff, + # that only matters if we need to compute a mask from a distance matrix # ?????? self._decoy_fluctuation = {} # don't know what this does self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ - # sequence details - if sequence is None: - self.sequence=pdb_structure.sequence - else: - self.sequence=sequence - self.aa_freq = frustration.compute_aa_freq(self.sequence) - self.contact_freq = frustration.compute_contact_freq(self.sequence) + def setup_model(self): + self.calculate_indicators() + self.calculate_energy_and_potts() - # structure details - self.expose_indicator_functions = expose_indicator_functions - self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict - self.pdb_structure = pdb_structure + def calculate_indicators(self): + raise NotImplementedError("Subclasses must this method") - - # allows us to update coordinates only by passing in a pdb file - @property - def pdb_structure(self): - return self._pdb_structure - @pdb_structure.setter - def pdb_structure(self,pdb_structure): + def calculate_energy_and_potts(self): + + J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) + h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) + + # compute burial and contact energies + self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * self.burial_indicator[:, np.newaxis, :] + direct = self.direct_indicator * self.direct_gamma[J_index[2], J_index[3]] + water_mediated = self.water_indicator * self.water_gamma[J_index[2], J_index[3]] + protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] + contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] + + # Compute electrostatics and add to contact energy + if self.p.k_electrostatics!=0: + electrostatics_energy = self.electrostatics_gamma * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis] + contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) + + self.contact_energy = contact_energy + + # Compute potts model + self.potts_model = {} + self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] + self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] + # Set the gap energy to zero + self.potts_model['h'][:, 0] = 0 + self.potts_model['J'][:, :, 0, :] = 0 + self.potts_model['J'][:, :, :, 0] = 0 + self._native_energy=None # don't know what this does + + + def compute_configurational_decoy_statistics(self): + raise NotImplementedError("Subclasses must define this method") + + def compute_configurational_energies(self): + raise NotImplementedError("Subclasses must define this method") + + def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): + mean_decoy_energy, std_decoy_energy = self.compute_configurational_decoy_statistics(n_decoys=n_decoys,aa_freq=aa_freq) + return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) + + + + + +class AWSEM(AWSEMBase): + + def __init__(self, + pdb_structure: object, + sequence: str =None, + expose_indicator_functions: bool=False, + **parameters)->object: + # assume the user wanted the sequence from the pdb structure if not given + if not sequence: + sequence = pdb_structure.sequence + # load structure-independent parameters and methods + super().__init__(sequence, expose_indicator_functions, **parameters) + # set up strucure + self.setup_structure(pdb_structure) + # calculate masks + if self.burial_in_context==True: + selected_matrix=self.full_pdb_distance_matrix + else: + selected_matrix=self.distance_matrix + self.sequence_mask_rho = frustration.compute_mask(selected_matrix, + maximum_contact_distance=None, + minimum_sequence_separation = self.p.min_sequence_separation_rho) + self.sequence_mask_contact = frustration.compute_mask(self.distance_matrix, + maximum_contact_distance=self.p.distance_cutoff_contact, + minimum_sequence_separation = self.p.min_sequence_separation_contact) + self.electrostatics_mask = frustration.compute_mask(self.distance_matrix, + maximum_contact_distance=None, + minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) + self.mask = frustration.compute_mask(self.distance_matrix, + maximum_contact_distance=self.distance_cutoff, + minimum_sequence_separation = self.sequence_cutoff) + self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function + self.setup_model() + + def setup_structure(self, pdb_structure): # check structure selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') resid = selection_CB.getResindices() N=len(resid) - if N != len(self.sequence): - raise ValueError("The pdb is incomplete. Try setting 'repair_pdb=True' when constructing the Structure object.") self.resid = resid self.N = N # set structure-dependent proterties @@ -136,34 +223,38 @@ def pdb_structure(self,pdb_structure): self.chain=pdb_structure.chain self.pdb_file=pdb_structure.pdb_file self.init_index_shift=pdb_structure.init_index_shift + self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict self.distance_matrix=pdb_structure.distance_matrix self.full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix - # reset indicator functions, energies, and potts model - self.compute_indicators_energies_potts_model() - def compute_indicators_energies_potts_model(self): - if self.burial_in_context==True: - selected_matrix=self.full_pdb_distance_matrix - else: - selected_matrix=self.distance_matrix - sequence_mask_rho = frustration.compute_mask(selected_matrix, - maximum_contact_distance=None, - minimum_sequence_separation = self.p.min_sequence_separation_rho) - sequence_mask_contact = frustration.compute_mask(self.distance_matrix, - maximum_contact_distance=self.p.distance_cutoff_contact, - minimum_sequence_separation = self.p.min_sequence_separation_contact) + @property + def pdb_structure(self): + return self._pdb_structure + @pdb_structure.setter + def pdb_structure(self,pdb_structure): + # reset structural attributes + self.setup_structure(pdb_structure) + # check that our new structure is compatible with our old one + if self.N != len(self.sequence): + raise ValueError("The pdb is incomplete. Try setting 'repair_pdb=True' when constructing the Structure object.") + self.calculate_indicators() + def change_conformation(alternative_pdb_structure): + # this function is an alias for the pdb_structure setter + self.pdb_structure = alternative_pdb_structure + + def calculate_indicators(self): # Calculate rho rho = 0.25 - rho *= (1 + np.tanh(self.p.eta * (selected_matrix- self.p.r_min))) - rho *= (1 + np.tanh(self.p.eta * (self.p.r_max - selected_matrix))) - rho *= sequence_mask_rho + rho *= (1 + np.tanh(self.p.eta * (self.selected_matrix - self.p.r_min))) + rho *= (1 + np.tanh(self.p.eta * (self.p.r_max - self.selected_matrix))) + rho *= self.sequence_mask_rho self.rho=rho #Calculate sigma water rho_r = (rho).sum(axis=1) if self.full_pdb_distance_matrix.shape!=self.distance_matrix.shape: if self.burial_in_context==True: - self.init_index_shift=pdb_structure.init_index_shift - self.fin_index_shift=pdb_structure.fin_index_shift + self.init_index_shift=self.pdb_structure.init_index_shift + self.fin_index_shift=self.pdb_structure.fin_index_shift rho_r=rho_r[self.init_index_shift:self.fin_index_shift] self.rho_r=rho_r rho_b = np.expand_dims(rho_r, 1) @@ -179,81 +270,54 @@ def compute_indicators_energies_potts_model(self): water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] # store indicators and gammas for our particular sequence as attributes - if self.expose_indicator_functions: - self.indicators=[] - self.indicators.append(burial_indicator[:,0]) - self.indicators.append(burial_indicator[:,1]) - self.indicators.append(burial_indicator[:,2]) - self.indicators.append(direct_indicator[:,:,0,0]*sequence_mask_contact) - self.indicators.append(protein_indicator[:,:,0,0]*sequence_mask_contact) - self.indicators.append(water_indicator[:,:,0,0]*sequence_mask_contact) - - self.gamma_array=[] - temp_burial_gamma=self.burial_gamma[self.aa_map_awsem_list] - temp_burial_gamma[0]=0 - temp_burial_gamma *= -0.5 * self.p.k_contact - self.gamma_array.append(temp_burial_gamma[:,0]) - self.gamma_array.append(temp_burial_gamma[:,1]) - self.gamma_array.append(temp_burial_gamma[:,2]) - - for contact_gamma in [self.direct_gamma, self.protein_gamma, self.water_gamma]: - temp_gamma = contact_gamma[self.aa_map_awsem_x, self.aa_map_awsem_y].copy() - temp_gamma[0, :] = 0 - temp_gamma[:, 0] = 0 - temp_gamma *= -0.5 * self.k_contact - self.gamma_array.append(temp_gamma) - - self.burial_indicator = burial_indicator - self.direct_indicator = direct_indicator - self.water_indicator = water_indicator - self.protein_indicator = protein_indicator - - - J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) - h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) - - # compute burial and contact energies - self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] - direct = direct_indicator * self.direct_gamma[J_index[2], J_index[3]] - water_mediated = water_indicator * self.water_gamma[J_index[2], J_index[3]] - protein_mediated = protein_indicator * self.protein_gamma[J_index[2], J_index[3]] - contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - # Compute electrostatics and add to contact energy - if self.p.k_electrostatics!=0: - self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) - self.distance_cutoff=None - electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) + self.indicators=[] + self.indicators.append(burial_indicator[:,0]) + self.indicators.append(burial_indicator[:,1]) + self.indicators.append(burial_indicator[:,2]) + self.indicators.append(direct_indicator[:,:,0,0]*self.sequence_mask_contact) + self.indicators.append(protein_indicator[:,:,0,0]*self.sequence_mask_contact) + self.indicators.append(water_indicator[:,:,0,0]*self.sequence_mask_contact) + self.gamma_array=[] + temp_burial_gamma=self.burial_gamma[self.aa_map_awsem_list] + temp_burial_gamma[0]=0 + temp_burial_gamma *= -0.5 * self.p.k_contact + self.gamma_array.append(temp_burial_gamma[:,0]) + self.gamma_array.append(temp_burial_gamma[:,1]) + self.gamma_array.append(temp_burial_gamma[:,2]) + for contact_gamma in [self.direct_gamma, self.protein_gamma, self.water_gamma]: + temp_gamma = contact_gamma[self.aa_map_awsem_x, self.aa_map_awsem_y].copy() + temp_gamma[0, :] = 0 + temp_gamma[:, 0] = 0 + temp_gamma *= -0.5 * self.k_contact + self.gamma_array.append(temp_gamma) + self.burial_indicator = burial_indicator # probably could get rid of either this or indicators list + self.direct_indicator = direct_indicator # probably could get rid of either this or indicators list + self.water_indicator = water_indicator # probably could get rid of either this or indicators list + self.protein_indicator = protein_indicator # probably could get rid of either this or indicators list + if self.p.k_electrostatics != 0: # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] - electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * electrostatics_mask - electrostatics_energy = -self.p.k_electrostatics * (charges2[np.newaxis,np.newaxis,:,:]*electrostatics_indicator[:,:,np.newaxis,np.newaxis]) - contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) - if self.expose_indicator_functions: - self.indicators.append(electrostatics_indicator) - temp_gamma=0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] - temp_gamma[0,:]=0 - temp_gamma[:,0]=0 - self.gamma_array.append(temp_gamma) - else: - self.sequence_cutoff=self.p.min_sequence_separation_contact - self.distance_cutoff=self.p.distance_cutoff_contact - self.mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.distance_cutoff, minimum_sequence_separation = self.sequence_cutoff) - self.contact_energy = contact_energy - - # Compute potts model - self.potts_model = {} - self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] - self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] - # Set the gap energy to zero - self.potts_model['h'][:, 0] = 0 - self.potts_model['J'][:, :, 0, :] = 0 - self.potts_model['J'][:, :, :, 0] = 0 - self._native_energy=None # don't know what this does - - def change_conformation(alternative_pdb_structure): - # this function is an alias for the pdb_structure setter - self.pdb_structure = alternative_pdb_structure + electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask + self.indicators.append(electrostatics_indicator) + self.electrostatics_indicator = electrostatics_indicator # probably could get rid of either this or indicators list + self.electrostatics_gamma = -self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y][1:,1:] + temp_gamma = 0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] + temp_gamma[0,:]=0 + temp_gamma[:,0]=0 + self.gamma_array.append(temp_gamma) + + def calculate_energy_and_potts(self): + super().calculate_energy_and_potts() + if not self.expose_indicator_functions: + del self.burial_indicator + del self.direct_indicator + del self.water_indicator + del self.protein_indicator + if "electrostatics_indicator" in dir(self): + # won't exist if electrostatics are turned off + del self.electrostatics_indicator + del self.indicators def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] @@ -370,9 +434,37 @@ def compute_configurational_energies(self): # import pandas as pd return configurational_energies #, pd.DataFrame(decoy_data, columns=decoy_data_columns) - def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): - mean_decoy_energy, std_decoy_energy = self.compute_configurational_decoy_statistics(n_decoys=n_decoys,aa_freq=aa_freq) - return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) + +class AWSEMIndicators(AWSEMBase): + + def __init__(self, + indicators: list, + sequence: str, # sequence is optional if we initialize from a Structure but not here + expose_indicator_functions: bool=False, + **parameters)->object: + """ + A stripped-down version of the AWSEM class that can be initialized from a set of indicator functions + + Parameters + ---------- + indicators : list + List of numpy.ndarray holding the indicator functions, with different decoys stacked along axis 0 + sequence : str + The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. + expose_indicator_functions: bool + If set to True, indicator functions of the contact and burial energy terms can be accessed by user. + + Returns + ------- + AWSEMIndicators object + + """ + super().__init__(sequence, expose_indicator_functions, **parameters) + self.indicators = indicators + self.setup_model() + + def calculate_indicators(self): + pass # the function was initialized with indicators, so there's nothing to do class DecoyEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer @@ -408,11 +500,14 @@ def __init__(self, contact_protein_indicators = [] contact_water_indicators = [] electrostatics_indicators = [] + # the AWSEM class takes care of the indicator calculation (including masking) for us + # AWSEM normally accepts an amino acid sequence argument, but we don't need that here + # However, we do need to pass through parameters used to generate the indicator functions + awsem_obj = AWSEM(pdb_structures[0], expose_indicator_functions=True, **parameters) for pdb_structure in pdb_structures: - # the AWSEM class takes care of the indicator calculation (including masking) for us - # AWSEM normally accepts an amino acid sequence argument, but we don't need that here - # However, we do need to pass through parameters used to generate the indicator functions - all_indicators = AWSEM(pdb_structure, expose_indicator_functions=True, **parameters).indicators + awsem_obj.pdb_structure = pdb_structures[pdb_structure] # we can use the pdb_structure setter to update structural + # stuff without fully re-initializing the object + all_indicators = awsem_obj.indicators burial_low_density_indicators.append(all_indicators[0]) burial_medium_density_indicators.append(all_indicators[1]) burial_high_density_indicators.append(all_indicators[2]) @@ -428,3 +523,11 @@ def __init__(self, np.array(contact_protein_indicators), np.array(contact_water_indicators), np.array(electrostatics_indicators),] + # Although a DecoyEnsemble does not have an energy or frustration in the same sense as an AWSEM, + # it is helpful to attach gamma parameters to the DecoyEnsemble so that DecoyEnergyAverage, etc. + # can read them through class inheritance in the same way that AwsemEnergyAverage, etc. read the gammas + # from the AWSEM class. + # We take the gammas directly from the AWSEM that we used to get our indicators. + # Technically, we generated one AWSEM for each pdb_structure, but the parameters are identical, + # so we can for convenience read the gammas from the most recently initialized awsem_obj + self.gamma_array = awsem_obj.gamma_array diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 2ecf44c..6ce30a8 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -7,7 +7,7 @@ from frustratometer.classes import Frustratometer from frustratometer.classes import Structure -from frustratometer.classes import AWSEM, AWSEMEnsemble +from frustratometer.classes import AWSEM, DecoyEnsemble from frustratometer.optimization.EnergyTerm import EnergyTerm from frustratometer.optimization.inner_product import compute_all_region_means from frustratometer.optimization.inner_product import build_mean_inner_product_matrix @@ -512,6 +512,171 @@ def regression_test(self, seq_index): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" +class EnsembleEnergyStatistics(EnergyTerm): + """ + An abstract class (but not using the ABC framework) for ensemble statistics that are distributive in the gammas, + meaning the quantity that we want to calculate, (_decoys), is equal to (gammas * _decoys). + This allows us to average the indicator functions over all decoys just once, and then conduct the MC search as we would for + a single sequence. + + When this class was initially written, the intended use cases are for EnsembleEnergyAverage and EnsembleEnergyStd. + These classes just need to call the EnsembleEnergyStatistics __init__ and define the collapse_indicators function. + """ + def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): + # + # boilerplate from Awsem*Average classes + if not type(model)==DecoyEnsemble: + raise TypeError("EnsembleEnergyAverage may only be initialized from a DecoyEnsemble") + self.model=model + self._use_numba=use_numba + self.alphabet=alphabet + self.reindex_dca=[_AA.index(aa) for aa in alphabet] + self.alphabet_size=len(alphabet) + #TODO: Fix the gamma matrix to account for elecrostatics-- i think this is already done! + self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) + # + # collapse over decoys (axis 0 of our indicator arrays) + self.indicators = self.collapse_indicators(model.indicators) + + self.initialize_functions() + + def initialize_functions(self): + len_alphabet=self.alphabet_size + gamma=self.gamma + + # Precompute the mean of the indicators + means = [np.average(indicator_type, axis=-1) for indicator_type in self.indicators] + + def compute_energy(seq_index): + counts = np.zeros(len_alphabet, dtype=np.int64) + for val in seq_index: + counts[val] += 1 + + # Calculate phi_mean + phi_mean = np.zeros(len_alphabet*len_indicators1D + len_alphabet**2*len_indicators2D) + + # 1D indicators + c=0 + for i in range(len_indicators1D): + for j in range(len_alphabet): + phi_mean[c] = indicator_means[i] * counts[j] + c += 1 + + # 2D indicators + for i in range(len_indicators2D): + for j in range(len_alphabet): + for k in range(len_alphabet): + t=1 if j==k else 0 + phi_mean[c] = indicator_means[i+ len_indicators1D] * counts[j] * (counts[k] - t) + c += 1 + + # Calculate energy + energy = 0 + for i in range(phi_len): + energy += gamma[i] * phi_mean[i] + + return energy + + def denergy_mutation(seq_index, pos, aa): + counts = np.zeros(len_alphabet, dtype=np.int64) + for val in seq_index: + counts[val] += 1 + aa_old=seq_index[pos] + if aa_old==aa: + return 0. + + # Calculate phi_mean + + dphi_mean = np.zeros(len_alphabet*len_indicators1D + len_alphabet**2*len_indicators2D) + + # 1D indicators + for i in range(len_indicators1D): + dphi_mean[i*len_alphabet + aa_old] -= indicator_means[i] + dphi_mean[i*len_alphabet + aa] += indicator_means[i] + + offset = len_alphabet*len_indicators1D + for i in range(len_indicators2D): + for j in range(len_alphabet): + k=aa_old + if j==k: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] -= 2 * indicator_means[i + len_indicators1D] * (counts[j]-1) + elif j==aa: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i+ len_indicators1D] * (counts[k] - counts[j] -1) + else: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] -= indicator_means[i + len_indicators1D] * counts[j] + dphi_mean[offset + i*len_alphabet**2 + k*len_alphabet + j] -= indicator_means[i + len_indicators1D] * counts[j] + k=aa + if j==k: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += 2 * indicator_means[i + len_indicators1D] * counts[j] + elif j==aa_old: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i+ len_indicators1D] * (counts[j] - counts[k] -1) + else: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i + len_indicators1D] * counts[j] + dphi_mean[offset + i*len_alphabet**2 + k*len_alphabet + j] += indicator_means[i + len_indicators1D] * counts[j] + + # Calculate energy + denergy = 0 + for i in range(phi_len): + denergy += gamma[i] * dphi_mean[i] + + return denergy + + self.compute_energy = compute_energy + self.compute_denergy_mutation = denergy_mutation + + awsem_energy = AwsemEnergy(use_numba=self.use_numba, model=self.model, alphabet=self.alphabet).energy_function + + def compute_energy_sample(seq_index,n_decoys=100000): + """ Function to compute the variance of the energy of permutations of a sequence using random shuffling. + This function is much faster than compute_energy_permutation but is an approximation""" + energies=np.zeros(n_decoys) + shuffled_index=seq_index.copy() + for i in numba.prange(n_decoys): + energies[i]=awsem_energy(shuffled_index[np.random.permutation(len(shuffled_index))]) + return np.mean(energies) + + def compute_energy_permutation(seq_index): + """ Function to compute the variance of the energy of all permutations of a sequence + Caution: This function is very slow for normal sequences """ + from itertools import permutations + decoy_sequences = np.array(list(permutations(seq_index))) + energies=np.zeros(len(decoy_sequences)) + for i in numba.prange(len(decoy_sequences)): + energies[i]=awsem_energy(decoy_sequences[i]) + return np.mean(energies) + + def collapse_indicators(uncollapsed_indicators): + raise NotImplementedError("EnsembleEnergyStatistics is an abstract class. Use a subclass implementing this function.") + + self.compute_energy_sample=self.numbify(compute_energy_sample,parallel=True) + self.compute_energy_permutation=compute_energy_permutation + + def regression_test(self, seq_index): + expected_energy=self.compute_energy_permutation(seq_index) + energy=self.compute_energy(seq_index) + assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" + +class EnsembleEnergyAverage(EnsembleEnergyStatistics): + """ + See documentation for EnsembleEnergyStatistics + """ + def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): + super().__init__(model, use_numba, alphabet) + + def collapse_indicators(uncollapsed_indicators): + self.indicators = [np.average(indicator, axis=0) for indicator in uncollapsed_indicators] + +class EnsembleEnergyStd(EnsembleEnergyStatistics): + """ + See documentation for EnsembleEnergyStatistics + """ + def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): + super().__init__(model, use_numba, alphabet) + + def collapse_indicators(uncollapsed_indicators): + self.indicators = [np.std(indicator, axis=0) for indicator in uncollapsed_indicators] + + class AwsemEnergyVariance(EnergyTerm): def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self._use_numba=use_numba @@ -1111,10 +1276,8 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): pdb_list = ["tests/data/1r69.pdb"] pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) - ensemble = AWSEMEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) - ########################################################### - ensemble.mask = np.ones((63,63)) - ######################################################### + ensemble = DecoyEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=10) + reduced_alphabet = 'ADEFGHIKLMNQRSTVWY' From d0b23f35b20c58ded1bd80eb9f47c5487788dedd Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 10 Jul 2025 13:19:41 -0500 Subject: [PATCH 09/20] tests passing now (except for the dca tests requiring hmmer, which have been failing for a while) --- frustratometer/classes/AWSEM.py | 45 ++++++++++++++++++--- frustratometer/optimization/optimization.py | 11 ++--- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 38288db..1462754 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -7,7 +7,7 @@ from pydantic.types import Path from typing import List,Optional,Union,Generator -__all__ = ['AWSEM','DecoyEnsemble'] +__all__ = ['AWSEM','AWSEMIndicators','DecoyEnsemble'] class AWSEMParameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) @@ -74,6 +74,7 @@ def __init__(self, """ # set sequence based on argument + self.N = len(sequence) self.sequence = sequence # set indicator function exposure based on argument @@ -109,7 +110,7 @@ def __init__(self, self.burial_in_context = self.p.burial_in_context self.aa_freq = frustration.compute_aa_freq(self.sequence) self.contact_freq = frustration.compute_contact_freq(self.sequence) - if self.p.k_electrostatics == 0: + if self.p.k_electrostatics != 0: self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) self.distance_cutoff=None # the distance matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- @@ -204,9 +205,14 @@ def __init__(self, self.electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) + with open('my_data.txt','w') as f: + f.write(f"self.distance_cutoff: {self.distance_cutoff}\n") + f.write(f"self.sequence_cutoff: {self.sequence_cutoff}\n") + np.save('my_distance_matrix.npy',self.distance_matrix) self.mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.distance_cutoff, minimum_sequence_separation = self.sequence_cutoff) + np.save('my_mask_new.npy',self.mask) self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function self.setup_model() @@ -301,7 +307,7 @@ def calculate_indicators(self): electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask self.indicators.append(electrostatics_indicator) self.electrostatics_indicator = electrostatics_indicator # probably could get rid of either this or indicators list - self.electrostatics_gamma = -self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y][1:,1:] + self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] temp_gamma = 0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] temp_gamma[0,:]=0 temp_gamma[:,0]=0 @@ -439,6 +445,10 @@ class AWSEMIndicators(AWSEMBase): def __init__(self, indicators: list, + burial_indicator, + direct_indicator, + protein_indicator, + water_indicator, sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, **parameters)->object: @@ -460,7 +470,12 @@ def __init__(self, """ super().__init__(sequence, expose_indicator_functions, **parameters) + self.N self.indicators = indicators + self.burial_indicator = burial_indicator + self.direct_indicator = direct_indicator + self.protein_indicator = protein_indicator + self.water_indicator = water_indicator self.setup_model() def calculate_indicators(self): @@ -500,13 +515,18 @@ def __init__(self, contact_protein_indicators = [] contact_water_indicators = [] electrostatics_indicators = [] + burial_indicators_other = [] + direct_indicators_other = [] + protein_indicators_other = [] + water_indicators_other = [] + electrostatics_indicators_other = [] # the AWSEM class takes care of the indicator calculation (including masking) for us # AWSEM normally accepts an amino acid sequence argument, but we don't need that here # However, we do need to pass through parameters used to generate the indicator functions - awsem_obj = AWSEM(pdb_structures[0], expose_indicator_functions=True, **parameters) + awsem_obj = AWSEM(next(pdb_structures), expose_indicator_functions=True, **parameters) for pdb_structure in pdb_structures: - awsem_obj.pdb_structure = pdb_structures[pdb_structure] # we can use the pdb_structure setter to update structural - # stuff without fully re-initializing the object + awsem_obj.pdb_structure = pdb_structure # we can use the pdb_structure setter to update structural + # stuff without fully re-initializing the object all_indicators = awsem_obj.indicators burial_low_density_indicators.append(all_indicators[0]) burial_medium_density_indicators.append(all_indicators[1]) @@ -515,6 +535,11 @@ def __init__(self, contact_protein_indicators.append(all_indicators[4]) contact_water_indicators.append(all_indicators[5]) electrostatics_indicators.append(all_indicators[6]) + burial_indicators_other.append(awsem_obj.electrostatics_indicator) + direct_indicators_other.append(awsem_obj.electrostatics_indicator) + protein_indicators_other.append(awsem_obj.electrostatics_indicator) + water_indicators_other.append(awsem_obj.electrostatics_indicator) + electrostatics_indicators_other.append(awsem_obj.electrostatics_indicator) # the idea here is that we have different conformers of a chain of a particular length; # if not, we'll get a ValueError when trying to initialize numpy arrays from a ragged set of lists self.indicators = [np.array(burial_low_density_indicators), @@ -523,6 +548,11 @@ def __init__(self, np.array(contact_protein_indicators), np.array(contact_water_indicators), np.array(electrostatics_indicators),] + self.burial_indicator = np.average(np.array(burial_indicators_other),axis=0) + self.direct_indicator = np.average(np.array(direct_indicators_other),axis=0) + self.protein_indicator = np.average(np.array(protein_indicators_other),axis=0) + self.water_indicator = np.average(np.array(water_indicators_other),axis=0) + self.electrostatics_indicator = np.average(np.array(electrostatics_indicators_other),axis=0) # Although a DecoyEnsemble does not have an energy or frustration in the same sense as an AWSEM, # it is helpful to attach gamma parameters to the DecoyEnsemble so that DecoyEnergyAverage, etc. # can read them through class inheritance in the same way that AwsemEnergyAverage, etc. read the gammas @@ -531,3 +561,6 @@ def __init__(self, # Technically, we generated one AWSEM for each pdb_structure, but the parameters are identical, # so we can for convenience read the gammas from the most recently initialized awsem_obj self.gamma_array = awsem_obj.gamma_array + + def average(self): + return [np.average(indicator, axis=0) for indicator in self.indicators], self.burial_indicator, self.direct_indicator, self.protein_indicator, self.water_indicator \ No newline at end of file diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 6ce30a8..bc2abe2 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -7,7 +7,7 @@ from frustratometer.classes import Frustratometer from frustratometer.classes import Structure -from frustratometer.classes import AWSEM, DecoyEnsemble +from frustratometer.classes import AWSEM, AWSEMIndicators, DecoyEnsemble from frustratometer.optimization.EnergyTerm import EnergyTerm from frustratometer.optimization.inner_product import compute_all_region_means from frustratometer.optimization.inner_product import build_mean_inner_product_matrix @@ -1274,14 +1274,15 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): if __name__ == '__main__': - pdb_list = ["tests/data/1r69.pdb"] + pdb_list = ["tests/data/1r69.pdb","tests/data/1r69.pdb","tests/data/1r69.pdb"] pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) - ensemble = DecoyEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=10) - + ensemble = DecoyEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=10) + indicators, burial_indicators, direct_indicators, protein_indicators, water_indicators = ensemble.average() + average = AWSEMIndicators(indicators, burial_indicators, direct_indicators, protein_indicators, water_indicators,"SISSRVKSKRIQLGLNQAELAQKVGTTQQSIEQLENGKTKRPRFLPELASALGVSVDWLLNGT") reduced_alphabet = 'ADEFGHIKLMNQRSTVWY' - awsem_energy = AwsemEnergy(ensemble, alphabet=reduced_alphabet) + awsem_energy = AwsemEnergy(average, alphabet=reduced_alphabet) heterogeneity = Heterogeneity(exact=False, use_numba=True) energy_mix = awsem_energy - 10*heterogeneity From 9fe70c2c1f7f5e4ce74683fce7e2f9da729651bb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 10 Jul 2025 17:10:59 -0500 Subject: [PATCH 10/20] tests still passing (except for that hmmer issue) and AWSEMIndicators/DecoyEnsemble now working as expected --- frustratometer/classes/AWSEM.py | 100 ++++++++------------ frustratometer/classes/__init__.py | 2 +- frustratometer/optimization/optimization.py | 7 +- 3 files changed, 45 insertions(+), 64 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 1462754..00358d0 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -37,7 +37,6 @@ class AWSEMParameters(BaseModel): r_maxII: float = Field(9.5, description="Maximum distance for mediated contact potential. (Angstrom)") eta_sigma: float = Field(7.0, description="Sharpness of the density-based switching function between protein-mediated and water-mediated contacts.") - #Membrane membrane_gamma: Union[Path,Gamma] = Field(_path/'data'/'AWSEM_membrane_2015.json', description="File or Gamma object containing the membrane Gamma values (for membrane proteins)") eta_switching: int = Field(10, description="Switching distance for the membrane switching function") @@ -46,6 +45,8 @@ class AWSEMParameters(BaseModel): min_sequence_separation_electrostatics: Optional[int] = Field(1, description="Minimum sequence separation for electrostatics calculation.") k_electrostatics: float = Field(17.3636, description="Coefficient for electrostatic interactions. (kJ/mol)") electrostatics_screening_length: float = Field(10, description="Screening length for electrostatic interactions. (Angstrom)") + charges: np.array = Field(np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]), description="Charge on each residue type") + # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] class AWSEMBase(Frustratometer): @@ -110,18 +111,21 @@ def __init__(self, self.burial_in_context = self.p.burial_in_context self.aa_freq = frustration.compute_aa_freq(self.sequence) self.contact_freq = frustration.compute_contact_freq(self.sequence) + charges2 = self.p.charges[:,np.newaxis] * self.p.charges[np.newaxis,:] if self.p.k_electrostatics != 0: self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) self.distance_cutoff=None # the distance matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- # it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix + self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] else: self.sequence_cutoff=self.p.min_sequence_separation_contact self.distance_cutoff=self.p.distance_cutoff_contact # the distance matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- # it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix + self.charges2 = charges2 # ?????? self._decoy_fluctuation = {} # don't know what this does self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ @@ -301,14 +305,10 @@ def calculate_indicators(self): self.water_indicator = water_indicator # probably could get rid of either this or indicators list self.protein_indicator = protein_indicator # probably could get rid of either this or indicators list if self.p.k_electrostatics != 0: - # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] - charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) - charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask self.indicators.append(electrostatics_indicator) self.electrostatics_indicator = electrostatics_indicator # probably could get rid of either this or indicators list - self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] - temp_gamma = 0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] + temp_gamma = 0.5 * self.p.k_electrostatics * self.charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] temp_gamma[0,:]=0 temp_gamma[:,0]=0 self.gamma_array.append(temp_gamma) @@ -444,11 +444,11 @@ def compute_configurational_energies(self): class AWSEMIndicators(AWSEMBase): def __init__(self, - indicators: list, burial_indicator, direct_indicator, protein_indicator, water_indicator, + electrostatics_indicator, sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, **parameters)->object: @@ -470,23 +470,23 @@ def __init__(self, """ super().__init__(sequence, expose_indicator_functions, **parameters) - self.N - self.indicators = indicators self.burial_indicator = burial_indicator self.direct_indicator = direct_indicator self.protein_indicator = protein_indicator self.water_indicator = water_indicator + self.electrostatics_indicator = electrostatics_indicator + self.sequence_mask_contact = np.full((self.N,self.N), True) + self.mask = np.full((self.N,self.N), True) + # mask should have been applied when calculating the indicator functions, + # so we set it such that no further masking is performed self.setup_model() def calculate_indicators(self): pass # the function was initialized with indicators, so there's nothing to do -class DecoyEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer - # also, note that the functions compute_configurational_decoy_statistics, - # compute_configurational_energies, and configuration_frustration are - # present in the AWSEM class but removed here, since we don't expect to - # compute frustration on an entire ensemble +class DecoyEnsemble(): + def __init__(self, pdb_structures: Generator[object,None,None], **parameters)->object: @@ -508,18 +508,11 @@ def __init__(self, The numpy arrays' first axes vary the structure, while the second axis and third axis, if it exists, hold the (appropriately masked) indicator functions for each residue or pair of residues """ - burial_low_density_indicators = [] - burial_medium_density_indicators = [] - burial_high_density_indicators = [] - contact_direct_indicators = [] - contact_protein_indicators = [] - contact_water_indicators = [] + burial_indicators = [] + direct_indicators = [] + protein_indicators = [] + water_indicators = [] electrostatics_indicators = [] - burial_indicators_other = [] - direct_indicators_other = [] - protein_indicators_other = [] - water_indicators_other = [] - electrostatics_indicators_other = [] # the AWSEM class takes care of the indicator calculation (including masking) for us # AWSEM normally accepts an amino acid sequence argument, but we don't need that here # However, we do need to pass through parameters used to generate the indicator functions @@ -527,40 +520,27 @@ def __init__(self, for pdb_structure in pdb_structures: awsem_obj.pdb_structure = pdb_structure # we can use the pdb_structure setter to update structural # stuff without fully re-initializing the object - all_indicators = awsem_obj.indicators - burial_low_density_indicators.append(all_indicators[0]) - burial_medium_density_indicators.append(all_indicators[1]) - burial_high_density_indicators.append(all_indicators[2]) - contact_direct_indicators.append(all_indicators[3]) - contact_protein_indicators.append(all_indicators[4]) - contact_water_indicators.append(all_indicators[5]) - electrostatics_indicators.append(all_indicators[6]) - burial_indicators_other.append(awsem_obj.electrostatics_indicator) - direct_indicators_other.append(awsem_obj.electrostatics_indicator) - protein_indicators_other.append(awsem_obj.electrostatics_indicator) - water_indicators_other.append(awsem_obj.electrostatics_indicator) - electrostatics_indicators_other.append(awsem_obj.electrostatics_indicator) - # the idea here is that we have different conformers of a chain of a particular length; - # if not, we'll get a ValueError when trying to initialize numpy arrays from a ragged set of lists - self.indicators = [np.array(burial_low_density_indicators), - np.array(burial_medium_density_indicators), - np.array(contact_direct_indicators), - np.array(contact_protein_indicators), - np.array(contact_water_indicators), - np.array(electrostatics_indicators),] - self.burial_indicator = np.average(np.array(burial_indicators_other),axis=0) - self.direct_indicator = np.average(np.array(direct_indicators_other),axis=0) - self.protein_indicator = np.average(np.array(protein_indicators_other),axis=0) - self.water_indicator = np.average(np.array(water_indicators_other),axis=0) - self.electrostatics_indicator = np.average(np.array(electrostatics_indicators_other),axis=0) - # Although a DecoyEnsemble does not have an energy or frustration in the same sense as an AWSEM, - # it is helpful to attach gamma parameters to the DecoyEnsemble so that DecoyEnergyAverage, etc. - # can read them through class inheritance in the same way that AwsemEnergyAverage, etc. read the gammas - # from the AWSEM class. - # We take the gammas directly from the AWSEM that we used to get our indicators. - # Technically, we generated one AWSEM for each pdb_structure, but the parameters are identical, - # so we can for convenience read the gammas from the most recently initialized awsem_obj - self.gamma_array = awsem_obj.gamma_array + burial_indicators.append(awsem_obj.burial_indicator) + direct_indicators.append(awsem_obj.direct_indicator) + protein_indicators.append(awsem_obj.protein_indicator) + water_indicators.append(awsem_obj.water_indicator) + if hasattr(awsem_obj, 'electrostatics_indicator'): + electrostatics_indicators.append(awsem_obj.electrostatics_indicator) + # Stack and average indicators, ensuring correct shape for calculate_energy_and_potts + self.burial_indicator = np.mean(np.stack(burial_indicators, axis=0), axis=0) # (N, 3) + self.direct_indicator = np.mean(np.stack(direct_indicators, axis=0), axis=0) # (N, N, 1, 1) + self.protein_indicator = np.mean(np.stack(protein_indicators, axis=0), axis=0) # (N, N, 1, 1) + self.water_indicator = np.mean(np.stack(water_indicators, axis=0), axis=0) # (N, N, 1, 1) + if electrostatics_indicators: + self.electrostatics_indicator = np.mean(np.stack(electrostatics_indicators, axis=0), axis=0) # (N, N) + else: + self.electrostatics_indicator = None + # Attach gamma parameters from the AWSEM object + self.burial_gamma = awsem_obj.burial_gamma + self.direct_gamma = awsem_obj.direct_gamma + self.protein_gamma = awsem_obj.protein_gamma + self.water_gamma = awsem_obj.water_gamma + self.electrostatics_gamma = getattr(awsem_obj, 'electrostatics_gamma', None) def average(self): - return [np.average(indicator, axis=0) for indicator in self.indicators], self.burial_indicator, self.direct_indicator, self.protein_indicator, self.water_indicator \ No newline at end of file + return self.burial_indicator, self.direct_indicator, self.protein_indicator, self.water_indicator, self.electrostatics_indicator \ No newline at end of file diff --git a/frustratometer/classes/__init__.py b/frustratometer/classes/__init__.py index 4ff9b4b..fd8b79b 100644 --- a/frustratometer/classes/__init__.py +++ b/frustratometer/classes/__init__.py @@ -7,7 +7,7 @@ """ from .DCA import DCA -from .AWSEM import AWSEM, DecoyEnsemble +from .AWSEM import AWSEM, AWSEMIndicators, DecoyEnsemble from .Structure import Structure from .Map import Map from .Gamma import Gamma diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index bc2abe2..34f93c7 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -1277,9 +1277,10 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): pdb_list = ["tests/data/1r69.pdb","tests/data/1r69.pdb","tests/data/1r69.pdb"] pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) ensemble = DecoyEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=10) - indicators, burial_indicators, direct_indicators, protein_indicators, water_indicators = ensemble.average() - average = AWSEMIndicators(indicators, burial_indicators, direct_indicators, protein_indicators, water_indicators,"SISSRVKSKRIQLGLNQAELAQKVGTTQQSIEQLENGKTKRPRFLPELASALGVSVDWLLNGT") - + burial_indicators, direct_indicators, protein_indicators, water_indicators, electrostatics_indicators = ensemble.average() + average = AWSEMIndicators(burial_indicators, direct_indicators, protein_indicators, water_indicators, electrostatics_indicators, + "SISSRVKSKRIQLGLNQAELAQKVGTTQQSIEQLENGKTKRPRFLPELASALGVSVDWLLNGT") + reduced_alphabet = 'ADEFGHIKLMNQRSTVWY' awsem_energy = AwsemEnergy(average, alphabet=reduced_alphabet) From d3386c9dee09cf4be56c0cbcaae44f4e6ef7bfbb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 10 Jul 2025 17:18:38 -0500 Subject: [PATCH 11/20] got rid of unused classes --- frustratometer/optimization/optimization.py | 164 -------------------- 1 file changed, 164 deletions(-) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 34f93c7..416f195 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -512,170 +512,6 @@ def regression_test(self, seq_index): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" -class EnsembleEnergyStatistics(EnergyTerm): - """ - An abstract class (but not using the ABC framework) for ensemble statistics that are distributive in the gammas, - meaning the quantity that we want to calculate, (_decoys), is equal to (gammas * _decoys). - This allows us to average the indicator functions over all decoys just once, and then conduct the MC search as we would for - a single sequence. - - When this class was initially written, the intended use cases are for EnsembleEnergyAverage and EnsembleEnergyStd. - These classes just need to call the EnsembleEnergyStatistics __init__ and define the collapse_indicators function. - """ - def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): - # - # boilerplate from Awsem*Average classes - if not type(model)==DecoyEnsemble: - raise TypeError("EnsembleEnergyAverage may only be initialized from a DecoyEnsemble") - self.model=model - self._use_numba=use_numba - self.alphabet=alphabet - self.reindex_dca=[_AA.index(aa) for aa in alphabet] - self.alphabet_size=len(alphabet) - #TODO: Fix the gamma matrix to account for elecrostatics-- i think this is already done! - self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) - # - # collapse over decoys (axis 0 of our indicator arrays) - self.indicators = self.collapse_indicators(model.indicators) - - self.initialize_functions() - - def initialize_functions(self): - len_alphabet=self.alphabet_size - gamma=self.gamma - - # Precompute the mean of the indicators - means = [np.average(indicator_type, axis=-1) for indicator_type in self.indicators] - - def compute_energy(seq_index): - counts = np.zeros(len_alphabet, dtype=np.int64) - for val in seq_index: - counts[val] += 1 - - # Calculate phi_mean - phi_mean = np.zeros(len_alphabet*len_indicators1D + len_alphabet**2*len_indicators2D) - - # 1D indicators - c=0 - for i in range(len_indicators1D): - for j in range(len_alphabet): - phi_mean[c] = indicator_means[i] * counts[j] - c += 1 - - # 2D indicators - for i in range(len_indicators2D): - for j in range(len_alphabet): - for k in range(len_alphabet): - t=1 if j==k else 0 - phi_mean[c] = indicator_means[i+ len_indicators1D] * counts[j] * (counts[k] - t) - c += 1 - - # Calculate energy - energy = 0 - for i in range(phi_len): - energy += gamma[i] * phi_mean[i] - - return energy - - def denergy_mutation(seq_index, pos, aa): - counts = np.zeros(len_alphabet, dtype=np.int64) - for val in seq_index: - counts[val] += 1 - aa_old=seq_index[pos] - if aa_old==aa: - return 0. - - # Calculate phi_mean - - dphi_mean = np.zeros(len_alphabet*len_indicators1D + len_alphabet**2*len_indicators2D) - - # 1D indicators - for i in range(len_indicators1D): - dphi_mean[i*len_alphabet + aa_old] -= indicator_means[i] - dphi_mean[i*len_alphabet + aa] += indicator_means[i] - - offset = len_alphabet*len_indicators1D - for i in range(len_indicators2D): - for j in range(len_alphabet): - k=aa_old - if j==k: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] -= 2 * indicator_means[i + len_indicators1D] * (counts[j]-1) - elif j==aa: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i+ len_indicators1D] * (counts[k] - counts[j] -1) - else: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] -= indicator_means[i + len_indicators1D] * counts[j] - dphi_mean[offset + i*len_alphabet**2 + k*len_alphabet + j] -= indicator_means[i + len_indicators1D] * counts[j] - k=aa - if j==k: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += 2 * indicator_means[i + len_indicators1D] * counts[j] - elif j==aa_old: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i+ len_indicators1D] * (counts[j] - counts[k] -1) - else: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i + len_indicators1D] * counts[j] - dphi_mean[offset + i*len_alphabet**2 + k*len_alphabet + j] += indicator_means[i + len_indicators1D] * counts[j] - - # Calculate energy - denergy = 0 - for i in range(phi_len): - denergy += gamma[i] * dphi_mean[i] - - return denergy - - self.compute_energy = compute_energy - self.compute_denergy_mutation = denergy_mutation - - awsem_energy = AwsemEnergy(use_numba=self.use_numba, model=self.model, alphabet=self.alphabet).energy_function - - def compute_energy_sample(seq_index,n_decoys=100000): - """ Function to compute the variance of the energy of permutations of a sequence using random shuffling. - This function is much faster than compute_energy_permutation but is an approximation""" - energies=np.zeros(n_decoys) - shuffled_index=seq_index.copy() - for i in numba.prange(n_decoys): - energies[i]=awsem_energy(shuffled_index[np.random.permutation(len(shuffled_index))]) - return np.mean(energies) - - def compute_energy_permutation(seq_index): - """ Function to compute the variance of the energy of all permutations of a sequence - Caution: This function is very slow for normal sequences """ - from itertools import permutations - decoy_sequences = np.array(list(permutations(seq_index))) - energies=np.zeros(len(decoy_sequences)) - for i in numba.prange(len(decoy_sequences)): - energies[i]=awsem_energy(decoy_sequences[i]) - return np.mean(energies) - - def collapse_indicators(uncollapsed_indicators): - raise NotImplementedError("EnsembleEnergyStatistics is an abstract class. Use a subclass implementing this function.") - - self.compute_energy_sample=self.numbify(compute_energy_sample,parallel=True) - self.compute_energy_permutation=compute_energy_permutation - - def regression_test(self, seq_index): - expected_energy=self.compute_energy_permutation(seq_index) - energy=self.compute_energy(seq_index) - assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" - -class EnsembleEnergyAverage(EnsembleEnergyStatistics): - """ - See documentation for EnsembleEnergyStatistics - """ - def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): - super().__init__(model, use_numba, alphabet) - - def collapse_indicators(uncollapsed_indicators): - self.indicators = [np.average(indicator, axis=0) for indicator in uncollapsed_indicators] - -class EnsembleEnergyStd(EnsembleEnergyStatistics): - """ - See documentation for EnsembleEnergyStatistics - """ - def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): - super().__init__(model, use_numba, alphabet) - - def collapse_indicators(uncollapsed_indicators): - self.indicators = [np.std(indicator, axis=0) for indicator in uncollapsed_indicators] - class AwsemEnergyVariance(EnergyTerm): def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): From 88834d3c174edc1c040e17b60ab964b3523b6d35 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 10 Jul 2025 17:27:29 -0500 Subject: [PATCH 12/20] updated documentation --- frustratometer/classes/AWSEM.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 00358d0..c58cb1a 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -444,11 +444,11 @@ def compute_configurational_energies(self): class AWSEMIndicators(AWSEMBase): def __init__(self, - burial_indicator, - direct_indicator, - protein_indicator, - water_indicator, - electrostatics_indicator, + burial_indicator: np.ndarray, + direct_indicator: np.ndarray, + protein_indicator: np.ndarray, + water_indicator: np.ndarray, + electrostatics_indicator: Union[np.ndarray, None], sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, **parameters)->object: @@ -457,8 +457,17 @@ def __init__(self, Parameters ---------- - indicators : list - List of numpy.ndarray holding the indicator functions, with different decoys stacked along axis 0 + burial_indicator : np.ndarray + Burial indicator array, most likely accessed using the burial_indicator attribute of an AWSEM + direct_indicator : np.ndarray + Direct indicator array, most likely accessed using the direct_indicator attribute of an AWSEM + protein_indicator : np.ndarray + Protein indicator array, most likely accessed using the protein_indicator attribute of an AWSEM + water_indicator : np.ndarray + Water indicator array, most likely accessed using the water_indicator attribute of an AWSEM + electrostatics_indicator : Union[np.ndarray, None] + Electrostatics indicator array, most likely accessed using the electrostatics_indicator attribute of an AWSEM. + May be None is electrostatics were turned off (k_electrostatics=0). sequence : str The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. expose_indicator_functions: bool @@ -503,10 +512,7 @@ def __init__(self, Returns ------- - DecoyEnsemble object, which holds a list of 7 numpy arrays, ordered by indicator function type: - [low density (rho), medium density (rho), high density (rho), direct, protein, water, electrostatics]. - The numpy arrays' first axes vary the structure, while the second axis and third axis, if it exists, - hold the (appropriately masked) indicator functions for each residue or pair of residues + DecoyEnsemble object, which holds indicator arrays and gammas computed by the AWSEM class. """ burial_indicators = [] direct_indicators = [] From 2a88e03a9be80aaa4d98bc8f0664df73b02681a2 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 10:08:31 -0500 Subject: [PATCH 13/20] refactored DecoyEnsemble to minimize memory usage and implemented avg/std classes --- frustratometer/classes/AWSEM.py | 168 ++++++++++++++++++++++++++------ 1 file changed, 136 insertions(+), 32 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index c58cb1a..bf90fa8 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -178,9 +178,6 @@ def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) - - - class AWSEM(AWSEMBase): def __init__(self, @@ -223,7 +220,7 @@ def __init__(self, def setup_structure(self, pdb_structure): # check structure selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') - resid = selection_CB.getResindices() + resid = list(set(selection_CB.getResindices())) # sometimes, it decides to split a single residue in 2 N=len(resid) self.resid = resid self.N = N @@ -246,6 +243,7 @@ def pdb_structure(self,pdb_structure): self.setup_structure(pdb_structure) # check that our new structure is compatible with our old one if self.N != len(self.sequence): + import pdb; pdb.set_trace() raise ValueError("The pdb is incomplete. Try setting 'repair_pdb=True' when constructing the Structure object.") self.calculate_indicators() def change_conformation(alternative_pdb_structure): @@ -512,41 +510,147 @@ def __init__(self, Returns ------- - DecoyEnsemble object, which holds indicator arrays and gammas computed by the AWSEM class. + DecoyEnsemble object, which holds indicator arrays (and gammas???) computed by the AWSEM class. """ - burial_indicators = [] - direct_indicators = [] - protein_indicators = [] - water_indicators = [] - electrostatics_indicators = [] # the AWSEM class takes care of the indicator calculation (including masking) for us # AWSEM normally accepts an amino acid sequence argument, but we don't need that here # However, we do need to pass through parameters used to generate the indicator functions - awsem_obj = AWSEM(next(pdb_structures), expose_indicator_functions=True, **parameters) + awsem_obj = AWSEM(next(pdb_structures), expose_indicator_functions=True, repair_pdb=True, **parameters) for pdb_structure in pdb_structures: awsem_obj.pdb_structure = pdb_structure # we can use the pdb_structure setter to update structural # stuff without fully re-initializing the object - burial_indicators.append(awsem_obj.burial_indicator) - direct_indicators.append(awsem_obj.direct_indicator) - protein_indicators.append(awsem_obj.protein_indicator) - water_indicators.append(awsem_obj.water_indicator) - if hasattr(awsem_obj, 'electrostatics_indicator'): - electrostatics_indicators.append(awsem_obj.electrostatics_indicator) + with open('burial_indicators.npy','ab') as f: + np.save(f,awsem_obj.burial_indicator) + with open('direct_indicators.npy','ab') as f: + np.save(f,awsem_obj.direct_indicator) + with open('protein_indicators.npy','ab') as f: + np.save(f,awsem_obj.protein_indicator) + with open('water_indicators.npy','ab') as f: + np.save(f,awsem_obj.water_indicator) + with open('electrostatics_indicators.npy','ab') as f: + if hasattr(awsem_obj, 'electrostatics_indicator'): + np.save(f,awsem_obj.electrostatics_indicator) + else: + np.save(f,None) # Stack and average indicators, ensuring correct shape for calculate_energy_and_potts - self.burial_indicator = np.mean(np.stack(burial_indicators, axis=0), axis=0) # (N, 3) - self.direct_indicator = np.mean(np.stack(direct_indicators, axis=0), axis=0) # (N, N, 1, 1) - self.protein_indicator = np.mean(np.stack(protein_indicators, axis=0), axis=0) # (N, N, 1, 1) - self.water_indicator = np.mean(np.stack(water_indicators, axis=0), axis=0) # (N, N, 1, 1) - if electrostatics_indicators: - self.electrostatics_indicator = np.mean(np.stack(electrostatics_indicators, axis=0), axis=0) # (N, N) - else: - self.electrostatics_indicator = None - # Attach gamma parameters from the AWSEM object - self.burial_gamma = awsem_obj.burial_gamma - self.direct_gamma = awsem_obj.direct_gamma - self.protein_gamma = awsem_obj.protein_gamma - self.water_gamma = awsem_obj.water_gamma - self.electrostatics_gamma = getattr(awsem_obj, 'electrostatics_gamma', None) + self.burial_indicators = self.get_indicators("burial_indicators.npy") + self.direct_indicators = self.get_indicators("direct_indicators.npy") + self.protein_indicators = self.get_indicators("protein_indicators.npy") + self.water_indicators = self.get_indicators("water_indicators.npy") + self.electrostatics_indicators = self.get_indicators("electrostatics_indicators.npy") + + # averages are needed to compute standard deviation, so it's useful to have them as attributes + self.avg_burial = None + self.avg_direct = None + self.avg_prot = None + self.avg_wat = None + self.avg_elect = None + + ## Attach gamma parameters from the AWSEM object + ## Kind of off-topic from my current use of this class + #self.burial_gamma = awsem_obj.burial_gamma + #self.direct_gamma = awsem_obj.direct_gamma + #self.protein_gamma = awsem_obj.protein_gamma + #self.water_gamma = awsem_obj.water_gamma + #self.electrostatics_gamma = getattr(awsem_obj, 'electrostatics_gamma', None) + + # this might help with memory + del awsem_obj + + def get_indicators(self, filename): + # expecting a numpy file + with open(filename, 'rb') as f: + while True: + try: + yield np.load(f, allow_pickle=True) # needed to load None if not electrostatics + except EOFError: + break def average(self): - return self.burial_indicator, self.direct_indicator, self.protein_indicator, self.water_indicator, self.electrostatics_indicator \ No newline at end of file + # average burial computation from generator + avg_burial = 0 + counter = 0 + for array in self.burial_indicators: + counter += 1 + burial_indicator += array + avg_burial /= counter + self.avg_burial = avg_burial + # average direct computation from generator + avg_direct = 0 + counter = 0 + for array in self.direct_indicators: + counter += 1 + direct_indicator += array + avg_direct /= counter + self.avg_direct = avg_direct + # average prot computation from generator + avg_prot = 0 + counter = 0 + for array in self.protein_indicators: + counter += 1 + protein_indicator += array + avg_prot /= counter + self.avg_prot = avg_prot + # average wat computation from generator + avg_wat = 0 + counter = 0 + for array in self.water_indicators: + counter += 1 + water_indicator += array + avg_wat /= counter + self.avg_wat = avg_wat + # average elec computation from generator + # if not defined, set to zero, which will have no impact + if self.electrostatics_indicators == None: + avg_elec = 0 + else: + avg_elec = 0 + counter = 0 + for array in self.electrostatics_indicators: + counter += 1 + avg_elec += array + avg_elec /= counter + self.avg_elec = avg_elec + return self.avg_burial, self.avg_direct, self.avg_prot, self.avg_wat, self.avg_elec + + def std(self): + if self.avg_burial is None or self.avg_direct is None or \ + self.avg_prot is None or self.avg_wat is None or \ + self.avg_elec is None: + self.average() # compute averages if not already done + # std burial computation from generator and previously computed average + std_burial = 0 + counter = 0 + for array in self.burial_indicators: + counter += 1 + std_burial += (array - self.avg_burial) ** 2 + std_burial = np.sqrt(std_burial / counter) + # std direct computation from generator and previously computed average + std_direct = 0 + counter = 0 + for array in self.direct_indicators: + counter += 1 + std_direct += (array - self.avg_direct) ** 2 + std_direct = np.sqrt(std_direct / counter) + # std prot computation from generator and previously computed average + std_prot = 0 + counter = 0 + for array in self.protein_indicators: + counter += 1 + std_prot += (array - self.avg_prot) ** 2 + std_prot = np.sqrt(std_prot / counter) + # std wat computation from generator and previously computed average + std_wat = 0 + counter = 0 + for array in self.water_indicators: + counter += 1 + std_wat += (array - self.avg_wat) ** 2 + std_wat = np.sqrt(std_wat / counter) + # std elec computation from generator and previously computed average + std_elec = 0 + counter = 0 + for array in self.electrostatics_indicators: + counter += 1 + std_elec += (array - self.avg_elec) ** 2 + std_elec = np.sqrt(std_elec / counter) + return std_burial, std_direct, std_prot, std_wat, std_elec \ No newline at end of file From 790adce77afd970a7ef1315032f5283c299d5907 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 11:36:03 -0500 Subject: [PATCH 14/20] got pdb.py from my amyloid_atlas branch, which allows prody for loading the structures. this probably wasn't necessary --- frustratometer/pdb/pdb.py | 87 +++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/frustratometer/pdb/pdb.py b/frustratometer/pdb/pdb.py index 853e6e6..2feeb65 100644 --- a/frustratometer/pdb/pdb.py +++ b/frustratometer/pdb/pdb.py @@ -37,7 +37,8 @@ def download(pdbID: str,directory: Union[Path,str]=Path.cwd()) -> Path: return pdb_file def get_sequence(pdb_file: str, - chain: str + chain: str, + return_start_mask: bool=False ) -> str: """ Get a protein sequence from a pdb file @@ -46,8 +47,10 @@ def get_sequence(pdb_file: str, ---------- pdb_file : str, PDB file location. - chain: str, - Chain ID of the selected protein. + chain: str or list, + Chain ID(s) of the selected protein. + return_start_mask: bool, + Return binary mask list indicating whether each position is the start of a chain Returns ------- @@ -58,37 +61,63 @@ def get_sequence(pdb_file: str, Get a protein sequence from a PDB file :param pdb: PDB file location - :param chain: chain name of PDB file to get sequence + :param chain: chain name(s) of PDB file to get sequence :return: protein sequence """ - if ".cif" in str(pdb_file): - parser = MMCIFParser() - else: - parser = PDBParser() - structure = parser.get_structure('name', pdb_file) + if ".cif" in str(pdb_file): # BIOPYTHON + parser = MMCIFParser() # BIOPYTHON + else: # BIOPYTHON + parser = PDBParser() # BIOPYTHON + structure = parser.get_structure('name', pdb_file) #BIOPYTHON + #structure = prody.parsePDB(str(pdb_file)) # PRODY + #hv = structure.getHierView() # PRODY if chain==None: - all_chains=[i.get_id() for i in structure.get_chains()] + all_chains=[i.get_id() for i in structure.get_chains()] # BIOPYTHON + #all_chains = [structure_chain.getChid() for structure_chain in hv] # PRODY else: - all_chains=[chain] + if type(chain) == list: + all_chains = chain + elif type(chain) == str: + all_chains = [id for id in chain if id != " "] # remove spaces if present in string + else: + raise TypeError(f"chain must be list or str but was {type(chain)}") sequence = "" - for chain in all_chains: - c = structure[0][chain] + start_mask = [] + for single_chain in all_chains: + c = structure[0][single_chain] # BIOPYTHON + #c = hv[single_chain] # PRODY chain_seq = "" for residue in c: - is_regular_res = residue.has_id('CA') and residue.has_id('O') - res_id = residue.get_id()[0] - if (res_id==' ' or res_id=='H_MSE' or res_id=='H_M3L' or res_id=='H_CAS') and is_regular_res: - residue_name = residue.get_resname() + is_regular_res = residue.has_id('CA') and residue.has_id('O') # BIOPYTHON + #atom_names = [atom.getName() for atom in residue] # PRODY + #is_regular_res = ("CA" in atom_names and "O" in atom_names) # PRODY + res_id = residue.get_id()[0] #BIOPYTHON + if (res_id==' ' or res_id=='H_MSE' or res_id=='H_M3L' or res_id=='H_CAS') and is_regular_res: # BIOPYTHON + # i don't know what H_HSE, H_M3L, and H_CAS are doing + # because they aren't in three_to_one, so those should throw an error + # long story short, I don't think we have to worry about them when switching from biopython to prody + #if is_regular_res: # PRODY + residue_name = residue.get_resname() # BIOPYTHON + #residue_name = residue.getResname() # PRODY chain_seq += three_to_one[residue_name] + if chain_seq == "": # empty chain, like a nucleic acid chain (see 8ZWK) + continue # FYI, currently, a non-empty chain with certain invalid residues will throw an error at the three_to_one[residue_name] above sequence += chain_seq - return sequence + start_mask.append(1) + for _ in range(1,len(chain_seq)): + start_mask.append(0) + if return_start_mask: + return (sequence,start_mask) + else: + return sequence def get_distance_matrix(pdb_file: Union[Path,str], chain: str, - method: str = 'CB' + method: str = 'CB', + return_distance_midpoints: bool = False, ) -> np.array: """ Calculate the distance matrix of the specified atoms in a PDB file. @@ -106,6 +135,10 @@ def get_distance_matrix(pdb_file: Union[Path,str], 'CA' for using only the CA atom, 'minimum' for using the minimum distance between all atoms in each residue, 'CB_force' computes a new coordinate for the CB atom based on the CA, C, and N atoms and uses CB distance even for glycine. + return_distance_midpoints: bool + Whether to return a matrix of the same shape as distance_matrix representing the same contacts as distance_matrix + that indicates the absolute coordinates of the midpoint between the pair of atoms. This helps us compute the pair distribution + functions of the different classes of contacts. So this matrix isn't really a matrix because each "element" has 3 channels: x, y, and z Returns: np.array: The distance matrix of the selected atoms. @@ -121,7 +154,7 @@ def get_distance_matrix(pdb_file: Union[Path,str], if method == 'CA': coords = structure.select('protein and name CA' + chain_selection).getCoords() elif method == 'CB': - coords = structure.select('(protein and (name CB) or (resname GLY and name CA))' + chain_selection).getCoords() + coords = structure.select('(protein and (name CB) or (resname GLY IGL and name CA))' + chain_selection).getCoords() elif method == 'minimum': selection = structure.select('protein' + chain_selection) coords = selection.getCoords() @@ -163,8 +196,20 @@ def get_distance_matrix(pdb_file: Union[Path,str], if len(coords) == 0: raise IndexError('Empty selection for distance map') + # coords should be a numpy array of shape (N,3) distance_matrix = sdist.squareform(sdist.pdist(coords)) - return distance_matrix + assert distance_matrix.shape[0] == distance_matrix.shape[1] + if return_distance_midpoints: + midpoint_matrix = np.zeros((distance_matrix.shape[0],distance_matrix.shape[1],3)) + for i in range(distance_matrix.shape[0]): + for j in range(distance_matrix.shape[1]): + midpoint_matrix[i,j,:] = (coords[None,i,:] + coords[None,j,:])/2 + # check that indexing is consistent with distance_matrix + assert np.allclose(np.linalg.norm(coords[i,:]-coords[j,:]),distance_matrix[i,j]) + assert np.allclose(midpoint_matrix,midpoint_matrix.transpose(1,0,2)) # check symmetry + return distance_matrix, midpoint_matrix + else: + return distance_matrix def full_to_filtered_aligned_mapping(aligned_sequence: str, From 6068d3c152a2786ffec549c138a6a8c7f54fa8cb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 11:36:46 -0500 Subject: [PATCH 15/20] undid unnecessary modification to structure processing in AWSEM --- frustratometer/classes/AWSEM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index bf90fa8..3817658 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -220,7 +220,7 @@ def __init__(self, def setup_structure(self, pdb_structure): # check structure selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') - resid = list(set(selection_CB.getResindices())) # sometimes, it decides to split a single residue in 2 + resid = selection_CB.getResindices() N=len(resid) self.resid = resid self.N = N From 6d5586b09070472f895d687fe671f985e79e671a Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 11:37:22 -0500 Subject: [PATCH 16/20] changed name of DecoyEnsemble.average to DecoyEnsemble.avg --- frustratometer/classes/AWSEM.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 3817658..5ba8e17 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -566,7 +566,7 @@ def get_indicators(self, filename): except EOFError: break - def average(self): + def avg(self): # average burial computation from generator avg_burial = 0 counter = 0 @@ -617,7 +617,7 @@ def std(self): if self.avg_burial is None or self.avg_direct is None or \ self.avg_prot is None or self.avg_wat is None or \ self.avg_elec is None: - self.average() # compute averages if not already done + self.avg() # compute averages if not already done # std burial computation from generator and previously computed average std_burial = 0 counter = 0 From 1c1a5effa9e55eb816eda23be8f75bb2ff76db16 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 12:59:00 -0500 Subject: [PATCH 17/20] fix wrong variable used --- frustratometer/classes/AWSEM.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 5ba8e17..5691498 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -566,13 +566,14 @@ def get_indicators(self, filename): except EOFError: break + # average indicator functions over all decoys def avg(self): # average burial computation from generator avg_burial = 0 counter = 0 for array in self.burial_indicators: counter += 1 - burial_indicator += array + avg_burial += array avg_burial /= counter self.avg_burial = avg_burial # average direct computation from generator @@ -580,7 +581,7 @@ def avg(self): counter = 0 for array in self.direct_indicators: counter += 1 - direct_indicator += array + avg_direct += array avg_direct /= counter self.avg_direct = avg_direct # average prot computation from generator @@ -588,7 +589,7 @@ def avg(self): counter = 0 for array in self.protein_indicators: counter += 1 - protein_indicator += array + avg_prot += array avg_prot /= counter self.avg_prot = avg_prot # average wat computation from generator @@ -596,7 +597,7 @@ def avg(self): counter = 0 for array in self.water_indicators: counter += 1 - water_indicator += array + avg_wat += array avg_wat /= counter self.avg_wat = avg_wat # average elec computation from generator @@ -613,6 +614,7 @@ def avg(self): self.avg_elec = avg_elec return self.avg_burial, self.avg_direct, self.avg_prot, self.avg_wat, self.avg_elec + # standard deviation of indicator functions over all decoys def std(self): if self.avg_burial is None or self.avg_direct is None or \ self.avg_prot is None or self.avg_wat is None or \ From 3b64349b2d571c745d0d7ee4f37e570995e5b00b Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 13:01:52 -0500 Subject: [PATCH 18/20] reinitialize indicator function generator each time attribute is accessed --- frustratometer/classes/AWSEM.py | 36 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 5691498..ca4ac5b 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -532,20 +532,15 @@ def __init__(self, np.save(f,awsem_obj.electrostatics_indicator) else: np.save(f,None) - # Stack and average indicators, ensuring correct shape for calculate_energy_and_potts - self.burial_indicators = self.get_indicators("burial_indicators.npy") - self.direct_indicators = self.get_indicators("direct_indicators.npy") - self.protein_indicators = self.get_indicators("protein_indicators.npy") - self.water_indicators = self.get_indicators("water_indicators.npy") - self.electrostatics_indicators = self.get_indicators("electrostatics_indicators.npy") - - # averages are needed to compute standard deviation, so it's useful to have them as attributes + # averages are needed to compute standard deviation + # having these be attributes allows them to be easily passed + # from the avg method to the std method self.avg_burial = None self.avg_direct = None self.avg_prot = None self.avg_wat = None self.avg_elect = None - + ################################################ ## Attach gamma parameters from the AWSEM object ## Kind of off-topic from my current use of this class #self.burial_gamma = awsem_obj.burial_gamma @@ -553,10 +548,31 @@ def __init__(self, #self.protein_gamma = awsem_obj.protein_gamma #self.water_gamma = awsem_obj.water_gamma #self.electrostatics_gamma = getattr(awsem_obj, 'electrostatics_gamma', None) - + ################################################ # this might help with memory del awsem_obj + # To manage memory, we need the indicator attributes to be generators (see self.get_indicators). + # But to ensure that we can iterate over them more than once, we need to + # be able to reinitialize the generators. We accomplish this with properties + @property + def burial_indicators(self): + return self.get_indicators("burial_indicators.npy") + @property + def direct_indicators(self): + return self.get_indicators("direct_indicators.npy") + @property + def protein_indicators(self): + return self.get_indicators("protein_indicators.npy") + @property + def water_indicators(self): + return self.get_indicators("water_indicators.npy") + @property + def electrostatics_indicators(self): + return self.get_indicators("electrostatics_indicators.npy") + + # allows us to process indicators without holding them all in memory + # this requires that every method that acts on the indicators iterates over them def get_indicators(self, filename): # expecting a numpy file with open(filename, 'rb') as f: From 3a281d72044b9c2cd85f77c7ec6e337c43696296 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 18:31:43 -0500 Subject: [PATCH 19/20] take absolute value of gamma (sqrt(gamma**2)) for approximate variance calculation --- frustratometer/classes/AWSEM.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index ca4ac5b..4802135 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -449,6 +449,7 @@ def __init__(self, electrostatics_indicator: Union[np.ndarray, None], sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, + absolute_value_gamma: bool=False, **parameters)->object: """ A stripped-down version of the AWSEM class that can be initialized from a set of indicator functions @@ -470,7 +471,9 @@ def __init__(self, The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. - + absolute_value_gamma: bool + If True, replace gammas with their absolute values. This is helpful for the standard deviation approximation + Returns ------- AWSEMIndicators object @@ -484,6 +487,13 @@ def __init__(self, self.electrostatics_indicator = electrostatics_indicator self.sequence_mask_contact = np.full((self.N,self.N), True) self.mask = np.full((self.N,self.N), True) + if absolute_value_gamma: + self.burial_gamma = np.abs(self.burial_gamma) + self.direct_gamma = np.abs(self.direct_gamma) + self.protein_gamma = np.abs(self.protein_gamma) + self.water_gamma = np.abs(self.water_gamma) + self.electrostatics_gamma = np.abs(self.electrostatics_gamma) + self.absolute_value_gamma = absolute_value_gamma # mask should have been applied when calculating the indicator functions, # so we set it such that no further masking is performed self.setup_model() From 26795fe64a89cd81c40ed5d8f93f49cbf83d17b9 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 12 Jul 2025 16:00:42 -0500 Subject: [PATCH 20/20] i guess our sign convention caused taking the absolute value of the gammas to make all energies negative. so now, we take the negative of the absolute values of the gammas to make all energies positive --- frustratometer/classes/AWSEM.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 4802135..5131fbd 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -488,11 +488,11 @@ def __init__(self, self.sequence_mask_contact = np.full((self.N,self.N), True) self.mask = np.full((self.N,self.N), True) if absolute_value_gamma: - self.burial_gamma = np.abs(self.burial_gamma) - self.direct_gamma = np.abs(self.direct_gamma) - self.protein_gamma = np.abs(self.protein_gamma) - self.water_gamma = np.abs(self.water_gamma) - self.electrostatics_gamma = np.abs(self.electrostatics_gamma) + self.burial_gamma = -np.abs(self.burial_gamma) + self.direct_gamma = -np.abs(self.direct_gamma) + self.protein_gamma = -np.abs(self.protein_gamma) + self.water_gamma = -np.abs(self.water_gamma) + self.electrostatics_gamma = -np.abs(self.electrostatics_gamma) self.absolute_value_gamma = absolute_value_gamma # mask should have been applied when calculating the indicator functions, # so we set it such that no further masking is performed @@ -681,4 +681,4 @@ def std(self): counter += 1 std_elec += (array - self.avg_elec) ** 2 std_elec = np.sqrt(std_elec / counter) - return std_burial, std_direct, std_prot, std_wat, std_elec \ No newline at end of file + return std_burial, std_direct, std_prot, std_wat, std_elec