From 34e409439dbe9f785884d65a88ed32f5d0f4d617 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 9 Apr 2025 14:08:08 -0500 Subject: [PATCH 01/76] saving changes --- frustratometer/classes/AWSEM.py | 300 +++++++++++++++----- frustratometer/classes/__init__.py | 2 +- frustratometer/optimization/optimization.py | 124 ++------ 3 files changed, 246 insertions(+), 180 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 770f4ad0..72f6f9ae 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -5,9 +5,9 @@ from .Gamma import Gamma from pydantic import BaseModel, Field, ConfigDict from pydantic.types import Path -from typing import List,Optional,Union +from typing import List,Optional,Union,Generator -__all__ = ['AWSEM'] +__all__ = ['AWSEM','AWSEMEnsemble'] class AWSEMParameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) @@ -75,73 +75,88 @@ def __init__(self, AWSEM object """ - #Set attributes - p = AWSEMParameters(**parameters) - if p.min_sequence_separation_contact is None: - p.min_sequence_separation_contact = 1 - if p.min_sequence_separation_rho is None: - p.min_sequence_separation_rho = 1 - if p.min_sequence_separation_electrostatics is None: - p.min_sequence_separation_electrostatics = 1 - - for field, value in p: + #Set parameters attributes + self.p = AWSEMParameters(**parameters) + if self.p.min_sequence_separation_contact is None: + self.p.min_sequence_separation_contact = 1 + if self.p.min_sequence_separation_rho is None: + self.p.min_sequence_separation_rho = 1 + if self.p.min_sequence_separation_electrostatics is None: + self.p.min_sequence_separation_electrostatics = 1 + for field, value in self.p: setattr(self, field, value) - - #Gamma parameters - if isinstance(p.gamma, Gamma): - gamma = p.gamma - elif isinstance(p.gamma, Path): - gamma = Gamma(p.gamma) + if isinstance(self.p.gamma, Gamma): + gamma = self.p.gamma + elif 
isinstance(self.p.gamma, Path): + gamma = Gamma(self.p.gamma) else: raise ValueError("Gamma parameter must be a path or a Gamma object.") - self.gamma=gamma self.burial_gamma = gamma['Burial'].T self.direct_gamma = gamma['Direct'][0] self.protein_gamma = gamma['Protein'][0] self.water_gamma = gamma['Water'][0] - self.burial_in_context=p.burial_in_context + self.burial_in_context=self.p.burial_in_context - #Structure details - self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict + # ?????? + self._decoy_fluctuation = {} # don't know what this does + self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ + + # sequence details if sequence is None: self.sequence=pdb_structure.sequence else: self.sequence=sequence + self.aa_freq = frustration.compute_aa_freq(self.sequence) + self.contact_freq = frustration.compute_contact_freq(self.sequence) + + # structure details + self.expose_indicator_functions = expose_indicator_functions + self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict + self.pdb_structure = pdb_structure + + + @property + def pdb_structure(self): + return self._pdb_structure + @pdb_structure.setter + def pdb_structure(self,pdb_structure): + # check structure + selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') + resid = selection_CB.getResindices() + N=len(resid) + if N != len(self.sequence): + raise ValueError("The pdb is incomplete. 
Try setting 'repair_pdb=True' when constructing the Structure object.") + self.resid = resid + self.N = N + # set structure-dependent proterties + self.pdb_structure = pdb_structure self.structure=pdb_structure.structure self.chain=pdb_structure.chain self.pdb_file=pdb_structure.pdb_file self.init_index_shift=pdb_structure.init_index_shift self.distance_matrix=pdb_structure.distance_matrix self.full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix - selection_CB = self.structure.select('name CB or (resname GLY IGL and name CA)') - - resid = selection_CB.getResindices() - self.resid=resid - self.N=len(self.resid) - assert self.N == len(self.sequence), "The pdb is incomplete. Try setting 'repair_pdb=True' when constructing the Structure object." + # reset indicator functions, energies, and potts model + self.compute_indicators_energies_potts_model() + def compute_indicators_energies_potts_model(): if self.burial_in_context==True: selected_matrix=self.full_pdb_distance_matrix else: selected_matrix=self.distance_matrix sequence_mask_rho = frustration.compute_mask(selected_matrix, maximum_contact_distance=None, - minimum_sequence_separation = p.min_sequence_separation_rho) + minimum_sequence_separation = self.p.min_sequence_separation_rho) sequence_mask_contact = frustration.compute_mask(self.distance_matrix, - maximum_contact_distance=p.distance_cutoff_contact, - minimum_sequence_separation = p.min_sequence_separation_contact) - - self._decoy_fluctuation = {} - self.minimally_frustrated_threshold=.78 - + maximum_contact_distance=self.p.distance_cutoff_contact, + minimum_sequence_separation = self.p.min_sequence_separation_contact) # Calculate rho rho = 0.25 - rho *= (1 + np.tanh(p.eta * (selected_matrix- p.r_min))) - rho *= (1 + np.tanh(p.eta * (p.r_max - selected_matrix))) + rho *= (1 + np.tanh(self.p.eta * (selected_matrix- self.p.r_min))) + rho *= (1 + np.tanh(self.p.eta * (self.p.r_max - selected_matrix))) rho *= sequence_mask_rho self.rho=rho - 
#Calculate sigma water rho_r = (rho).sum(axis=1) if self.full_pdb_distance_matrix.shape!=self.distance_matrix.shape: @@ -153,23 +168,21 @@ def __init__(self, rho_b = np.expand_dims(rho_r, 1) rho1 = np.expand_dims(rho_r, 0) rho2 = np.expand_dims(rho_r, 1) - sigma_water = 0.25 * (1 - np.tanh(p.eta_sigma * (rho1 - p.rho_0))) * (1 - np.tanh(p.eta_sigma * (rho2 - p.rho_0))) + sigma_water = 0.25 * (1 - np.tanh(self.p.eta_sigma * (rho1 - self.p.rho_0))) * (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0))) sigma_protein = 1 - sigma_water - #Calculate theta and indicators - theta = 0.25 * (1 + np.tanh(p.eta * (self.distance_matrix - p.r_min))) * (1 + np.tanh(p.eta * (p.r_max - self.distance_matrix))) - thetaII = 0.25 * (1 + np.tanh(p.eta * (self.distance_matrix - p.r_minII))) * (1 + np.tanh(p.eta * (p.r_maxII - self.distance_matrix))) - burial_indicator = np.tanh(p.burial_kappa * (rho_b - p.burial_ro_min)) + np.tanh(p.burial_kappa * (p.burial_ro_max - rho_b)) + theta = 0.25 * (1 + np.tanh(self.p.eta * (self.distance_matrix - self.p.r_min))) * (1 + np.tanh(self.p.eta * (self.p.r_max - self.distance_matrix))) + thetaII = 0.25 * (1 + np.tanh(self.p.eta * (self.distance_matrix - self.p.r_minII))) * (1 + np.tanh(self.p.eta * (self.p.r_maxII - self.distance_matrix))) + burial_indicator = np.tanh(self.p.burial_kappa * (rho_b - self.p.burial_ro_min)) + np.tanh(self.p.burial_kappa * (self.p.burial_ro_max - rho_b)) direct_indicator = theta[:, :, np.newaxis, np.newaxis] water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] - - if expose_indicator_functions: + # store indicators and gammas for our particular sequence as attributes + if self.expose_indicator_functions: self.indicators=[] self.indicators.append(burial_indicator[:,0]) self.indicators.append(burial_indicator[:,1]) self.indicators.append(burial_indicator[:,2]) - 
self.indicators.append(direct_indicator[:,:,0,0]*sequence_mask_contact) self.indicators.append(protein_indicator[:,:,0,0]*sequence_mask_contact) self.indicators.append(water_indicator[:,:,0,0]*sequence_mask_contact) @@ -194,60 +207,52 @@ def __init__(self, self.water_indicator = water_indicator self.protein_indicator = protein_indicator - + J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) - - #Burial energy - burial_energy = 0.5 * p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] - self.burial_energy = burial_energy - - #Contact energy + + # compute burial and contact energies + self.burial_energy = 0.5 * p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] direct = direct_indicator * self.direct_gamma[J_index[2], J_index[3]] water_mediated = water_indicator * self.water_gamma[J_index[2], J_index[3]] protein_mediated = protein_indicator * self.protein_gamma[J_index[2], J_index[3]] - contact_energy = p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - - # Compute electrostatics - if p.k_electrostatics!=0: - self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, p.min_sequence_separation_contact) + self.contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] + # Compute electrostatics and add to contact energy + if self.p.k_electrostatics!=0: + self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) self.distance_cutoff=None - - - electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=p.min_sequence_separation_electrostatics) + electrostatics_mask = 
frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] - - electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / p.electrostatics_screening_length) * electrostatics_mask - electrostatics_energy = -p.k_electrostatics * (charges2[np.newaxis,np.newaxis,:,:]*electrostatics_indicator[:,:,np.newaxis,np.newaxis]) - + electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * electrostatics_mask + electrostatics_energy = -self.p.k_electrostatics * (charges2[np.newaxis,np.newaxis,:,:]*electrostatics_indicator[:,:,np.newaxis,np.newaxis]) contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) - if expose_indicator_functions: + if self.expose_indicator_functions: self.indicators.append(electrostatics_indicator) - temp_gamma=0.5 * p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] + temp_gamma=0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] temp_gamma[0,:]=0 temp_gamma[:,0]=0 self.gamma_array.append(temp_gamma) else: - self.sequence_cutoff=p.min_sequence_separation_contact - self.distance_cutoff=p.distance_cutoff_contact + self.sequence_cutoff=self.p.min_sequence_separation_contact + self.distance_cutoff=self.p.distance_cutoff_contact self.mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.distance_cutoff, minimum_sequence_separation = self.sequence_cutoff) - self.contact_energy = contact_energy - # Compute fast properties - self.aa_freq = frustration.compute_aa_freq(self.sequence) - self.contact_freq = 
frustration.compute_contact_freq(self.sequence) + # Compute potts model self.potts_model = {} self.potts_model['h'] = burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] self.potts_model['J'] = contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] - # Set the gap energy to zero self.potts_model['h'][:, 0] = 0 self.potts_model['J'][:, :, 0, :] = 0 self.potts_model['J'][:, :, :, 0] = 0 - self._native_energy=None + self._native_energy=None # don't know what this does + + def change_conformation(alternative_pdb_structure): + # this function is an alias for the pdb_structure setter + self.pdb_structure = alternative_pdb_structure def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] @@ -366,4 +371,147 @@ def compute_configurational_energies(self): def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): mean_decoy_energy, std_decoy_energy = self.compute_configurational_decoy_statistics(n_decoys=n_decoys,aa_freq=aa_freq) - return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) \ No newline at end of file + return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) + + +class AWSEMEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer + # also, note that the functions compute_configurational_decoy_statistics, + # compute_configurational_energies, and configuration_frustration are + # present in the AWSEM class but removed here, since we don't expect to + # compute frustration on an entire ensemble + #Mapping to DCA + q = 20 + aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] #A gap has no energy + aa_map_awsem_x, aa_map_awsem_y = np.meshgrid(aa_map_awsem_list, aa_map_awsem_list, indexing='ij') + + def __init__(self, + pdb_structures: 
Generator[object,None,None], + **parameters)->object: + """ + Generate AWSEMEnsemble object + + Parameters + ---------- + pdb_structures : Generator[object,None,None] + yields Structure objects representing decoy structures + + Returns + ------- + AWSEMEnsemble object + """ + + #Set attributes + p = AWSEMParameters(**parameters) + if p.min_sequence_separation_contact is None: + p.min_sequence_separation_contact = 1 + if p.min_sequence_separation_rho is None: + p.min_sequence_separation_rho = 1 + if p.min_sequence_separation_electrostatics is None: + p.min_sequence_separation_electrostatics = 1 + + for field, value in p: + setattr(self, field, value) + + #Gamma parameters + if isinstance(p.gamma, Gamma): + gamma = p.gamma + elif isinstance(p.gamma, Path): + gamma = Gamma(p.gamma) + else: + raise ValueError("Gamma parameter must be a path or a Gamma object.") + + self.gamma=gamma + self.burial_gamma = gamma['Burial'].T + self.direct_gamma = gamma['Direct'][0] + self.protein_gamma = gamma['Protein'][0] + self.water_gamma = gamma['Water'][0] + self.burial_in_context=p.burial_in_context # need to be careful here--the same choice will have to apply to all structures, + # the way this code is currently written + self.indicators = [] # we're always going to expose indicator functions for this class + #self._decoy_fluctuation = {} # not sure what this does + if p.k_electrostatics!=0: + self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, p.min_sequence_separation_contact) + self.distance_cutoff=None + else: + self.sequence_cutoff=p.min_sequence_separation_contact + self.distance_cutoff=p.distance_cutoff_contact + + Ns = [] # number of residues in each structure + for pdb_structure in pdb_structures: + self.indicators.append(AWSEM(pdb_structure,expose_indicator_functions=True).indicators) + """ + #Structure details + # we can exclude most of the details present in the AWSEM class + structure=pdb_structure.structure + 
init_index_shift=pdb_structure.init_index_shift + distance_matrix=pdb_structure.distance_matrix + full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix + selection_CB = structure.select('name CB or (resname GLY IGL and name CA)') + + resid = selection_CB.getResindices() + N = len(resid) + Ns.append(N) # we'll check this later do make sure every structure has the same number of residues + + if self.burial_in_context==True: + selected_matrix=full_pdb_distance_matrix # use a matrix that includes extra residues to compute local density and contacts + # like the case where we're trying to design a protein that binds + # to another protein, and those residues affect the local environment even if they're + # not part of the sequence space that we're sampling + else: + selected_matrix=distance_matrix + sequence_mask_rho = frustration.compute_mask(selected_matrix, + maximum_contact_distance=None, + minimum_sequence_separation = p.min_sequence_separation_rho) + sequence_mask_contact = frustration.compute_mask(distance_matrix, + maximum_contact_distance=p.distance_cutoff_contact, + minimum_sequence_separation = p.min_sequence_separation_contact) + + # Calculate rho + rho = 0.25 + rho *= (1 + np.tanh(p.eta * (selected_matrix- p.r_min))) + rho *= (1 + np.tanh(p.eta * (p.r_max - selected_matrix))) + rho *= sequence_mask_rho + + #Calculate sigma water + rho_r = (rho).sum(axis=1) + if full_pdb_distance_matrix.shape!=distance_matrix.shape: + if self.burial_in_context==True: + init_index_shift=pdb_structure.init_index_shift + fin_index_shift=pdb_structure.fin_index_shift + rho_r=rho_r[init_index_shift:fin_index_shift] + rho_b = np.expand_dims(rho_r, 1) + rho1 = np.expand_dims(rho_r, 0) + rho2 = np.expand_dims(rho_r, 1) + sigma_water = 0.25 * (1 - np.tanh(p.eta_sigma * (rho1 - p.rho_0))) * (1 - np.tanh(p.eta_sigma * (rho2 - p.rho_0))) + sigma_protein = 1 - sigma_water + + #Calculate theta and indicators + theta = 0.25 * (1 + np.tanh(p.eta * (distance_matrix - p.r_min))) * 
(1 + np.tanh(p.eta * (p.r_max - distance_matrix))) + thetaII = 0.25 * (1 + np.tanh(p.eta * (distance_matrix - p.r_minII))) * (1 + np.tanh(p.eta * (p.r_maxII - distance_matrix))) + burial_indicator = np.tanh(p.burial_kappa * (rho_b - p.burial_ro_min)) + np.tanh(p.burial_kappa * (p.burial_ro_max - rho_b)) + direct_indicator = theta[:, :, np.newaxis, np.newaxis] + water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] + protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] + + self.indicators.append([]) + self.indicators[-1].append(burial_indicator[:,0]) + self.indicators[-1].append(burial_indicator[:,1]) + self.indicators[-1].append(burial_indicator[:,2]) + self.indicators[-1].append(direct_indicator[:,:,0,0]*sequence_mask_contact) + self.indicators[-1].append(protein_indicator[:,:,0,0]*sequence_mask_contact) + self.indicators[-1].append(water_indicator[:,:,0,0]*sequence_mask_contact) + + # Compute electrostatics + if p.k_electrostatics!=0: + electrostatics_mask = frustration.compute_mask(distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=p.min_sequence_separation_electrostatics) + # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] + charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) + charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] + electrostatics_indicator = 1 / (distance_matrix + 1E-6) * np.exp(-distance_matrix / p.electrostatics_screening_length) * electrostatics_mask + self.indicators[-1].append(electrostatics_indicator) + """ + + self._native_energy=None # not sure what this does + + #assert len(list(set(Ns))) == 1, f"Not all structures had the same number of residues! 
Numbers of residues found were {set(Ns)}" + #self.N = Ns[0] # doesn't matter which one we choose, they're all the same if we passed the assert diff --git a/frustratometer/classes/__init__.py b/frustratometer/classes/__init__.py index 6621727b..70dec2da 100644 --- a/frustratometer/classes/__init__.py +++ b/frustratometer/classes/__init__.py @@ -7,7 +7,7 @@ """ from .DCA import DCA -from .AWSEM import AWSEM +from .AWSEM import AWSEM, AWSEMEnsemble from .Structure import Structure from .Map import Map from .Gamma import Gamma diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 0bdc5e59..3ade9f99 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -7,7 +7,7 @@ from frustratometer.classes import Frustratometer from frustratometer.classes import Structure -from frustratometer.classes import AWSEM +from frustratometer.classes import AWSEM, AWSEMEnsemble from frustratometer.optimization.EnergyTerm import EnergyTerm from frustratometer.optimization.inner_product import compute_all_region_means from frustratometer.optimization.inner_product import build_mean_inner_product_matrix @@ -1109,111 +1109,29 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): if __name__ == '__main__': - native_pdb = "tests/data/1r69.pdb" - - structure_bound = Structure.full_pdb(native_pdb, chain=None) - structure_free = Structure.full_pdb(native_pdb, "A") - - model_bound = AWSEM(structure_bound, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) - model_free = AWSEM(structure_free, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) - reduced_alphabet = 'ADEFHIKLMNQRSTVWY' - - print(model_bound.sequence) - print(model_free.sequence) - - # binding_region=np.array([1, 2, 3, 4, 26, 27, 28, 29, 30, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 68, 69, 70, 90, 91, 92, 93, 
94, 95, 96, 97, 109, 110, 111, 112, 113, 114, 115, 116, 117, 127, 128, 129, 130, 131, 132, 133, 134, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 190, 191, 192, 193, 194, 195, 196, 197])-1 - energy_bound = AwsemEnergySelected(model_bound, alphabet=reduced_alphabet, selection=np.array(range(len(model_free.sequence)))) - energy_free = AwsemEnergySelected(model_free, alphabet=reduced_alphabet) - energy_average = AwsemEnergyAverage(model_free, alphabet=reduced_alphabet) - energy_std = AwsemEnergyStd(model_free, alphabet=reduced_alphabet) - energy_std_100 = AwsemEnergyStd(model_free, alphabet=reduced_alphabet, n_decoys=100) - energy_std_1000 = AwsemEnergyStd(model_free, alphabet=reduced_alphabet, n_decoys=1000) - energy_std_10000 = AwsemEnergyStd(model_free, alphabet=reduced_alphabet, n_decoys=10000) - energy_variance = AwsemEnergyVariance(model_free, alphabet=reduced_alphabet) + pdb_list = ["tests/data/1r69.pdb"] + pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) + ensemble = AWSEMEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) + ########################################################### + # temporary changes for testing it while it's not yet fully implemented + assert len(ensemble.indicators)==1, ensemble.indicators + #assert len(ensemble.potts_model['h'])==1, ensemble.potts_model['h'] + #assert len(ensemble.potts_model['J'])==1, ensemble.potts_model['J'] + ensemble.indicators = ensemble.indicators[0] + #ensemble.potts_model['h'] = ensemble.potts_model['h'][0] + #ensemble.potts_model['J'] = ensemble.potts_model['J'][0] + ensemble.mask = np.ones((63,63)) + ######################################################### + + reduced_alphabet = 'ADEFGHIKLMNQRSTVWY' + + awsem_energy = AwsemEnergy(ensemble, alphabet=reduced_alphabet) heterogeneity = Heterogeneity(exact=False, use_numba=True) - similarity = Similarity(model_free.sequence, use_numba=True) - - # energy_mix = 
energy_free - 20 * heterogeneity - energy_mix = (energy_free - energy_average) / energy_std - # energy_mix = energy_bound - energy_free - - energy_mixes = {"EnergyFree": energy_free, - "EnergyBound": energy_bound, - "Heterogeneity": heterogeneity, - "EnergyAverage": energy_average, - "EnergyStd_ndecoys100": energy_std_100, - "EnergyStd_ndecoys1000": energy_std_1000, - "EnergyStd_ndecoys10000": energy_std_10000, - "EnergyStd": energy_std, - "Zscore_ndecoys10000":(energy_free - energy_average) / energy_std_10000, - "Zscore":(energy_free - energy_average) / energy_std, - "EnergyVariance": energy_variance, - "Binding": (energy_bound - energy_free), - "Similarity": similarity, - "Ivan":energy_bound - 40 * heterogeneity, - "Takada": (energy_bound - energy_average) / energy_std, - "Ivan_binding":(energy_bound - energy_free) - 40 * heterogeneity, - "Takada_binding":(energy_free - energy_average) / energy_std + (energy_bound - energy_free), - "Ivan_Takada_binding": (energy_free - energy_average) / energy_std + (energy_bound - energy_free) - 40 * heterogeneity, - "Corrected_Takada": (energy_bound - energy_average) / (energy_std+5), - "Corrected_Takada_binding":(energy_free - energy_average) / (energy_std+5) + (energy_bound - energy_free), - "Ivan_Corrected_Takada_binding": (energy_free - energy_average) / (energy_std+5) + (energy_bound - energy_free) - 40 * heterogeneity, - "Ivan_bindidng similarity": (energy_bound - energy_free) - 40 * heterogeneity - 100*similarity, - "Corrected_Takada_binding_similarity":(energy_free - energy_average) / (energy_std+5) + (energy_bound - energy_free) - 100*similarity, - "Ivan_bindidng_similarityv2": (energy_bound - energy_free) - 40 * heterogeneity} - - for energy_name,energy_term in energy_mixes.items(): - print (f"Energy term: {energy_name}") - energy_term.benchmark(seq_indices=np.random.randint(0, len(reduced_alphabet), size=(100,len(structure_free.sequence)))) - if "ndecoys" not in energy_name: - 
energy_term.test(seq_index=np.random.randint(0, len(reduced_alphabet), size=len(structure_free.sequence))) - - monte_carlo = MonteCarlo(sequence = structure_free.sequence, energy=energy_term, alphabet=reduced_alphabet) - monte_carlo.benchmark_montecarlo_steps(n_repeats=3,n_steps=10000) - monte_carlo.benchmark_parallel_montecarlo_steps(n_repeats=3, n_steps=10000, n_replicas=8) - - - - # Profiling of the parallel tempering - import cProfile - import pstats - import io - monte_carlo = MonteCarlo(sequence=model_free.sequence, energy=energy_mix, alphabet=reduced_alphabet) - # evaluation_energies={"EnergyFree": energy_free, "Heterogeneity": heterogeneity, - # "EnergyAverage": energy_average, "EnergyStd": energy_std, - # "Similarity": similarity, "Zscore":(energy_free - energy_average) / energy_std}) - - monte_carlo.benchmark_montecarlo_steps(n_repeats=3, n_steps=100) - for n_replicas in [1, 2, 4, 8, 16]: - print(f"Running parallel tempering with {n_replicas} replicas") - monte_carlo.benchmark_parallel_montecarlo_steps(n_repeats=3, n_steps=100, n_replicas=n_replicas) - - for n_replicas in [1, 2, 4, 8, 16]: - print(f"Running parallel tempering with {n_replicas} replicas") - monte_carlo.benchmark_parallel_montecarlo_steps(n_repeats=3, n_steps=1000, n_replicas=n_replicas) - - monte_carlo.find_optimal_replicas(max_replicas=32, n_repeats=5, n_steps=1000) - monte_carlo.find_optimal_replicas(max_replicas=8, n_repeats=5, n_steps=10000) - monte_carlo.find_optimal_replicas(max_replicas=8, n_repeats=5, n_steps=100000) - - # # Run the profiler - # profiler = cProfile.Profile() - # profiler.enable() - - # monte_carlo.parallel_tempering(temperatures=np.logspace(3,-4,8), n_steps=1E4, n_steps_per_cycle=1E2) - # profiler.disable() - - # # Print the stats - # s = io.StringIO() - # ps = pstats.Stats(profiler, stream=s).sort_stats('cumulative') - # ps.print_stats() - # ps.dump_stats('parallel_temperingv2.prof') - - - - + energy_mix = awsem_energy - 10*heterogeneity + monte_carlo = 
MonteCarlo(sequence = "SISSRVKSKRIQLGLNQAELAQKVGTTQQSIEQLENGKTKRPRFLPELASALGVSVDWLLNGT", energy=energy_mix, alphabet=reduced_alphabet) + monte_carlo.annealing(n_steps=1000) From fa23e340479b01d49e51c36e44d1237eb1f13314 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 8 Jul 2025 17:16:00 -0500 Subject: [PATCH 02/76] fixed typo --- frustratometer/classes/AWSEM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 72f6f9ae..15b103d3 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -130,7 +130,7 @@ def pdb_structure(self,pdb_structure): self.resid = resid self.N = N # set structure-dependent proterties - self.pdb_structure = pdb_structure + self._pdb_structure = pdb_structure self.structure=pdb_structure.structure self.chain=pdb_structure.chain self.pdb_file=pdb_structure.pdb_file From fce6e82744a21d05a8afc6b9d3ba5ab75a93facb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 8 Jul 2025 17:34:39 -0500 Subject: [PATCH 03/76] added missing 'self' references --- frustratometer/classes/AWSEM.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 15b103d3..b84513d2 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -140,7 +140,7 @@ def pdb_structure(self,pdb_structure): # reset indicator functions, energies, and potts model self.compute_indicators_energies_potts_model() - def compute_indicators_energies_potts_model(): + def compute_indicators_energies_potts_model(self): if self.burial_in_context==True: selected_matrix=self.full_pdb_distance_matrix else: @@ -190,7 +190,7 @@ def compute_indicators_energies_potts_model(): self.gamma_array=[] temp_burial_gamma=self.burial_gamma[self.aa_map_awsem_list] temp_burial_gamma[0]=0 - temp_burial_gamma *= -0.5 * p.k_contact + temp_burial_gamma *= -0.5 * self.p.k_contact 
self.gamma_array.append(temp_burial_gamma[:,0]) self.gamma_array.append(temp_burial_gamma[:,1]) self.gamma_array.append(temp_burial_gamma[:,2]) @@ -212,14 +212,14 @@ def compute_indicators_energies_potts_model(): h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) # compute burial and contact energies - self.burial_energy = 0.5 * p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] + self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] direct = direct_indicator * self.direct_gamma[J_index[2], J_index[3]] water_mediated = water_indicator * self.water_gamma[J_index[2], J_index[3]] protein_mediated = protein_indicator * self.protein_gamma[J_index[2], J_index[3]] - self.contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] + contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] # Compute electrostatics and add to contact energy if self.p.k_electrostatics!=0: - self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) + self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) self.distance_cutoff=None electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] @@ -242,8 +242,8 @@ def compute_indicators_energies_potts_model(): # Compute potts model self.potts_model = {} - self.potts_model['h'] = burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] - self.potts_model['J'] = contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, 
self.aa_map_awsem_y] + self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] + self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] # Set the gap energy to zero self.potts_model['h'][:, 0] = 0 self.potts_model['J'][:, :, 0, :] = 0 From 74839593d908ddd36ceb92a13d51adcba961f6f8 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 8 Jul 2025 20:54:21 -0500 Subject: [PATCH 04/76] refactored; basic setup of decoy ensemble is working now --- frustratometer/classes/AWSEM.py | 108 +++++--------------- frustratometer/optimization/optimization.py | 7 -- 2 files changed, 27 insertions(+), 88 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index b84513d2..7c945703 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -115,7 +115,8 @@ def __init__(self, self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict self.pdb_structure = pdb_structure - + + # allows us to update coordinates only by passing in a pdb file @property def pdb_structure(self): return self._pdb_structure @@ -397,7 +398,8 @@ def __init__(self, Returns ------- - AWSEMEnsemble object + AWSEMEnsemble object, which holds a list of indicator functions for each decoy structure, + as calculated by the AWSEM class. 
""" #Set attributes @@ -427,7 +429,6 @@ def __init__(self, self.water_gamma = gamma['Water'][0] self.burial_in_context=p.burial_in_context # need to be careful here--the same choice will have to apply to all structures, # the way this code is currently written - self.indicators = [] # we're always going to expose indicator functions for this class #self._decoy_fluctuation = {} # not sure what this does if p.k_electrostatics!=0: self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, p.min_sequence_separation_contact) @@ -436,82 +437,27 @@ def __init__(self, self.sequence_cutoff=p.min_sequence_separation_contact self.distance_cutoff=p.distance_cutoff_contact - Ns = [] # number of residues in each structure + burial_low_density_indicators = [] + burial_medium_density_indicators = [] + burial_high_density_indicators = [] + contact_direct_indicators = [] + contact_protein_indicators = [] + contact_water_indicators = [] + electrostatics_indicators = [] for pdb_structure in pdb_structures: - self.indicators.append(AWSEM(pdb_structure,expose_indicator_functions=True).indicators) - """ - #Structure details - # we can exclude most of the details present in the AWSEM class - structure=pdb_structure.structure - init_index_shift=pdb_structure.init_index_shift - distance_matrix=pdb_structure.distance_matrix - full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix - selection_CB = structure.select('name CB or (resname GLY IGL and name CA)') - - resid = selection_CB.getResindices() - N = len(resid) - Ns.append(N) # we'll check this later do make sure every structure has the same number of residues - - if self.burial_in_context==True: - selected_matrix=full_pdb_distance_matrix # use a matrix that includes extra residues to compute local density and contacts - # like the case where we're trying to design a protein that binds - # to another protein, and those residues affect the local environment even if they're - # not part of the sequence space that we're 
sampling - else: - selected_matrix=distance_matrix - sequence_mask_rho = frustration.compute_mask(selected_matrix, - maximum_contact_distance=None, - minimum_sequence_separation = p.min_sequence_separation_rho) - sequence_mask_contact = frustration.compute_mask(distance_matrix, - maximum_contact_distance=p.distance_cutoff_contact, - minimum_sequence_separation = p.min_sequence_separation_contact) - - # Calculate rho - rho = 0.25 - rho *= (1 + np.tanh(p.eta * (selected_matrix- p.r_min))) - rho *= (1 + np.tanh(p.eta * (p.r_max - selected_matrix))) - rho *= sequence_mask_rho - - #Calculate sigma water - rho_r = (rho).sum(axis=1) - if full_pdb_distance_matrix.shape!=distance_matrix.shape: - if self.burial_in_context==True: - init_index_shift=pdb_structure.init_index_shift - fin_index_shift=pdb_structure.fin_index_shift - rho_r=rho_r[init_index_shift:fin_index_shift] - rho_b = np.expand_dims(rho_r, 1) - rho1 = np.expand_dims(rho_r, 0) - rho2 = np.expand_dims(rho_r, 1) - sigma_water = 0.25 * (1 - np.tanh(p.eta_sigma * (rho1 - p.rho_0))) * (1 - np.tanh(p.eta_sigma * (rho2 - p.rho_0))) - sigma_protein = 1 - sigma_water - - #Calculate theta and indicators - theta = 0.25 * (1 + np.tanh(p.eta * (distance_matrix - p.r_min))) * (1 + np.tanh(p.eta * (p.r_max - distance_matrix))) - thetaII = 0.25 * (1 + np.tanh(p.eta * (distance_matrix - p.r_minII))) * (1 + np.tanh(p.eta * (p.r_maxII - distance_matrix))) - burial_indicator = np.tanh(p.burial_kappa * (rho_b - p.burial_ro_min)) + np.tanh(p.burial_kappa * (p.burial_ro_max - rho_b)) - direct_indicator = theta[:, :, np.newaxis, np.newaxis] - water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] - protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] - - self.indicators.append([]) - self.indicators[-1].append(burial_indicator[:,0]) - self.indicators[-1].append(burial_indicator[:,1]) - self.indicators[-1].append(burial_indicator[:,2]) - 
self.indicators[-1].append(direct_indicator[:,:,0,0]*sequence_mask_contact) - self.indicators[-1].append(protein_indicator[:,:,0,0]*sequence_mask_contact) - self.indicators[-1].append(water_indicator[:,:,0,0]*sequence_mask_contact) - - # Compute electrostatics - if p.k_electrostatics!=0: - electrostatics_mask = frustration.compute_mask(distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=p.min_sequence_separation_electrostatics) - # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] - charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) - charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] - electrostatics_indicator = 1 / (distance_matrix + 1E-6) * np.exp(-distance_matrix / p.electrostatics_screening_length) * electrostatics_mask - self.indicators[-1].append(electrostatics_indicator) - """ - - self._native_energy=None # not sure what this does - - #assert len(list(set(Ns))) == 1, f"Not all structures had the same number of residues! 
Numbers of residues found were {set(Ns)}" - #self.N = Ns[0] # doesn't matter which one we choose, they're all the same if we passed the assert + all_indicators = AWSEM(pdb_structure,expose_indicator_functions=True).indicators + burial_low_density_indicators.append(all_indicators[0]) + burial_medium_density_indicators.append(all_indicators[1]) + burial_high_density_indicators.append(all_indicators[2]) + contact_direct_indicators.append(all_indicators[3]) + contact_protein_indicators.append(all_indicators[4]) + contact_water_indicators.append(all_indicators[5]) + electrostatics_indicators.append(all_indicators[6]) + # indicator function order: burial low density, burial medium density, burial high density, + # direct, protein, water, electrostatics + self.indicators = [np.array(burial_low_density_indicators), + np.array(burial_medium_density_indicators), + np.array(contact_direct_indicators), + np.array(contact_protein_indicators), + np.array(contact_water_indicators), + np.array(electrostatics_indicators),] diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 3ade9f99..2ecf44cd 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -1113,13 +1113,6 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) ensemble = AWSEMEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) ########################################################### - # temporary changes for testing it while it's not yet fully implemented - assert len(ensemble.indicators)==1, ensemble.indicators - #assert len(ensemble.potts_model['h'])==1, ensemble.potts_model['h'] - #assert len(ensemble.potts_model['J'])==1, ensemble.potts_model['J'] - ensemble.indicators = ensemble.indicators[0] - #ensemble.potts_model['h'] = 
ensemble.potts_model['h'][0] - #ensemble.potts_model['J'] = ensemble.potts_model['J'][0] ensemble.mask = np.ones((63,63)) ######################################################### From c9715d710bbb38f4fafa9d19e18426b085ebf907 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 8 Jul 2025 20:55:26 -0500 Subject: [PATCH 05/76] changed 'AWSEMEnsemble' class name to 'DecoyEnsemble' --- frustratometer/classes/AWSEM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 7c945703..2c1c1db9 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -375,7 +375,7 @@ def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) -class AWSEMEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer +class DecoyEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer # also, note that the functions compute_configurational_decoy_statistics, # compute_configurational_energies, and configuration_frustration are # present in the AWSEM class but removed here, since we don't expect to From 641dfb997aac9d0ca4a7e333a667a02031920bb7 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 9 Jul 2025 10:33:53 -0500 Subject: [PATCH 06/76] stripped DecoyEnsemble down to the bare minimum --- frustratometer/classes/AWSEM.py | 65 ++++++++---------------------- frustratometer/classes/__init__.py | 2 +- 2 files changed, 17 insertions(+), 50 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 2c1c1db9..468dc96a 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -7,7 +7,7 @@ from pydantic.types import Path from typing import List,Optional,Union,Generator -__all__ = ['AWSEM','AWSEMEnsemble'] +__all__ = ['AWSEM','DecoyEnsemble'] class 
AWSEMParameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) @@ -379,64 +379,28 @@ class DecoyEnsemble(): # don't think it's necessary for this one to inherit from # also, note that the functions compute_configurational_decoy_statistics, # compute_configurational_energies, and configuration_frustration are # present in the AWSEM class but removed here, since we don't expect to - # compute frustration on an entire ensemble - #Mapping to DCA - q = 20 - aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] #A gap has no energy - aa_map_awsem_x, aa_map_awsem_y = np.meshgrid(aa_map_awsem_list, aa_map_awsem_list, indexing='ij') - + # compute frustration on an entire ensemble def __init__(self, pdb_structures: Generator[object,None,None], **parameters)->object: """ - Generate AWSEMEnsemble object + Generate DecoyEnsemble object Parameters ---------- pdb_structures : Generator[object,None,None] yields Structure objects representing decoy structures + other parameters: + masks and cutoffs affecting the AWSEM class's indicator function calculations; + they must be the same for all structures; burial_in_context also available, but use at your own risk Returns ------- - AWSEMEnsemble object, which holds a list of indicator functions for each decoy structure, - as calculated by the AWSEM class. + DecoyEnsemble object, which holds a list of 7 numpy arrays, ordered by indicator function type: + [low density (rho), medium density (rho), high density (rho), direct, protein, water, electrostatics]. 
+ The numpy arrays' first axes vary the structure, while the second axis and third axis, if it exists, + hold the (appropriately masked) indicator functions for each residue or pair of residues """ - - #Set attributes - p = AWSEMParameters(**parameters) - if p.min_sequence_separation_contact is None: - p.min_sequence_separation_contact = 1 - if p.min_sequence_separation_rho is None: - p.min_sequence_separation_rho = 1 - if p.min_sequence_separation_electrostatics is None: - p.min_sequence_separation_electrostatics = 1 - - for field, value in p: - setattr(self, field, value) - - #Gamma parameters - if isinstance(p.gamma, Gamma): - gamma = p.gamma - elif isinstance(p.gamma, Path): - gamma = Gamma(p.gamma) - else: - raise ValueError("Gamma parameter must be a path or a Gamma object.") - - self.gamma=gamma - self.burial_gamma = gamma['Burial'].T - self.direct_gamma = gamma['Direct'][0] - self.protein_gamma = gamma['Protein'][0] - self.water_gamma = gamma['Water'][0] - self.burial_in_context=p.burial_in_context # need to be careful here--the same choice will have to apply to all structures, - # the way this code is currently written - #self._decoy_fluctuation = {} # not sure what this does - if p.k_electrostatics!=0: - self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, p.min_sequence_separation_contact) - self.distance_cutoff=None - else: - self.sequence_cutoff=p.min_sequence_separation_contact - self.distance_cutoff=p.distance_cutoff_contact - burial_low_density_indicators = [] burial_medium_density_indicators = [] burial_high_density_indicators = [] @@ -445,7 +409,10 @@ def __init__(self, contact_water_indicators = [] electrostatics_indicators = [] for pdb_structure in pdb_structures: - all_indicators = AWSEM(pdb_structure,expose_indicator_functions=True).indicators + # the AWSEM class takes care of the indicator calculation (including masking) for us + # AWSEM normally accepts an amino acid sequence argument, but we don't need that here + # However, 
we do need to pass through parameters used to generate the indicator functions + all_indicators = AWSEM(pdb_structure, expose_indicator_functions=True, **parameters).indicators burial_low_density_indicators.append(all_indicators[0]) burial_medium_density_indicators.append(all_indicators[1]) burial_high_density_indicators.append(all_indicators[2]) @@ -453,8 +420,8 @@ def __init__(self, contact_protein_indicators.append(all_indicators[4]) contact_water_indicators.append(all_indicators[5]) electrostatics_indicators.append(all_indicators[6]) - # indicator function order: burial low density, burial medium density, burial high density, - # direct, protein, water, electrostatics + # the idea here is that we have different conformers of a chain of a particular length; + # if not, we'll get a ValueError when trying to initialize numpy arrays from a ragged set of lists self.indicators = [np.array(burial_low_density_indicators), np.array(burial_medium_density_indicators), np.array(contact_direct_indicators), diff --git a/frustratometer/classes/__init__.py b/frustratometer/classes/__init__.py index 70dec2da..4ff9b4b5 100644 --- a/frustratometer/classes/__init__.py +++ b/frustratometer/classes/__init__.py @@ -7,7 +7,7 @@ """ from .DCA import DCA -from .AWSEM import AWSEM, AWSEMEnsemble +from .AWSEM import AWSEM, DecoyEnsemble from .Structure import Structure from .Map import Map from .Gamma import Gamma From 107d780cd9ac792c5cba2eca7b421cba064044d7 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 9 Jul 2025 20:33:16 -0500 Subject: [PATCH 07/76] updated call to Structure in test_optimization --- tests/test_optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_optimization.py b/tests/test_optimization.py index 7b939a17..2f1414bc 100644 --- a/tests/test_optimization.py +++ b/tests/test_optimization.py @@ -347,7 +347,7 @@ def test_diff_mean_inner_product_1_by_1(n_elements = 10): def model(request): native_pdb = "tests/data/1bfz.pdb" 
distance_cutoff_contact, min_sequence_separation_contact, k_electrostatics = request.param - structure = Structure.full_pdb(native_pdb, "A") + structure = Structure(native_pdb, "A") model = AWSEM(structure, distance_cutoff_contact=distance_cutoff_contact, min_sequence_separation_contact=min_sequence_separation_contact, expose_indicator_functions=True, k_electrostatics=k_electrostatics) return model From 2bb3b832b908e12017dada49f3afcb3517660530 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 9 Jul 2025 22:20:08 -0500 Subject: [PATCH 08/76] refactored, passing most tests but failing some due to inaccurate electrostatics energy calculation --- frustratometer/classes/AWSEM.py | 371 +++++++++++++------- frustratometer/optimization/optimization.py | 173 ++++++++- 2 files changed, 405 insertions(+), 139 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 468dc96a..38288dba 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -47,15 +47,15 @@ class AWSEMParameters(BaseModel): k_electrostatics: float = Field(17.3636, description="Coefficient for electrostatic interactions. (kJ/mol)") electrostatics_screening_length: float = Field(10, description="Screening length for electrostatic interactions. (Angstrom)") -class AWSEM(Frustratometer): +class AWSEMBase(Frustratometer): + #Mapping to DCA q = 20 aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] #A gap has no energy aa_map_awsem_x, aa_map_awsem_y = np.meshgrid(aa_map_awsem_list, aa_map_awsem_list, indexing='ij') def __init__(self, - pdb_structure: object, - sequence: str =None, + sequence: str, expose_indicator_functions: bool=False, **parameters)->object: """ @@ -63,10 +63,8 @@ def __init__(self, Parameters ---------- - pdb_structure : object - Structure object generated by Structure class - sequence : str - The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. 
+ sequence: str + The amino acid sequence expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. @@ -75,16 +73,26 @@ def __init__(self, AWSEM object """ - #Set parameters attributes - self.p = AWSEMParameters(**parameters) - if self.p.min_sequence_separation_contact is None: - self.p.min_sequence_separation_contact = 1 - if self.p.min_sequence_separation_rho is None: - self.p.min_sequence_separation_rho = 1 - if self.p.min_sequence_separation_electrostatics is None: - self.p.min_sequence_separation_electrostatics = 1 - for field, value in self.p: + # set sequence based on argument + self.sequence = sequence + + # set indicator function exposure based on argument + # i guess not exposing indicator functions saves memory? + self.expose_indicator_functions = expose_indicator_functions + + # parse other arguments + p = AWSEMParameters(**parameters) + if p.min_sequence_separation_contact is None: + p.min_sequence_separation_contact = 1 + if p.min_sequence_separation_rho is None: + p.min_sequence_separation_rho = 1 + if p.min_sequence_separation_electrostatics is None: + p.min_sequence_separation_electrostatics = 1 + for field, value in p: setattr(self, field, value) + self.p = p + + # set gamma if isinstance(self.p.gamma, Gamma): gamma = self.p.gamma elif isinstance(self.p.gamma, Path): @@ -96,38 +104,117 @@ def __init__(self, self.direct_gamma = gamma['Direct'][0] self.protein_gamma = gamma['Protein'][0] self.water_gamma = gamma['Water'][0] - self.burial_in_context=self.p.burial_in_context + # set other attributes + self.burial_in_context = self.p.burial_in_context + self.aa_freq = frustration.compute_aa_freq(self.sequence) + self.contact_freq = frustration.compute_contact_freq(self.sequence) + if self.p.k_electrostatics == 0: + self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) + self.distance_cutoff=None # the distance matrix 
isn't guaranteed to exist in all subclasses, + # but it doesn't hurt to define the distance_cutoff attribute-- + # it's just like any other parameter, such as sequence_cutoff, + # that only matters if we need to compute a mask from a distance matrix + else: + self.sequence_cutoff=self.p.min_sequence_separation_contact + self.distance_cutoff=self.p.distance_cutoff_contact # the distance matrix isn't guaranteed to exist in all subclasses, + # but it doesn't hurt to define the distance_cutoff attribute-- + # it's just like any other parameter, such as sequence_cutoff, + # that only matters if we need to compute a mask from a distance matrix # ?????? self._decoy_fluctuation = {} # don't know what this does self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ - # sequence details - if sequence is None: - self.sequence=pdb_structure.sequence - else: - self.sequence=sequence - self.aa_freq = frustration.compute_aa_freq(self.sequence) - self.contact_freq = frustration.compute_contact_freq(self.sequence) + def setup_model(self): + self.calculate_indicators() + self.calculate_energy_and_potts() - # structure details - self.expose_indicator_functions = expose_indicator_functions - self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict - self.pdb_structure = pdb_structure + def calculate_indicators(self): + raise NotImplementedError("Subclasses must this method") - - # allows us to update coordinates only by passing in a pdb file - @property - def pdb_structure(self): - return self._pdb_structure - @pdb_structure.setter - def pdb_structure(self,pdb_structure): + def calculate_energy_and_potts(self): + + J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) + h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) + + # compute burial and contact energies + self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] 
* self.burial_indicator[:, np.newaxis, :] + direct = self.direct_indicator * self.direct_gamma[J_index[2], J_index[3]] + water_mediated = self.water_indicator * self.water_gamma[J_index[2], J_index[3]] + protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] + contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] + + # Compute electrostatics and add to contact energy + if self.p.k_electrostatics!=0: + electrostatics_energy = self.electrostatics_gamma * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis] + contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) + + self.contact_energy = contact_energy + + # Compute potts model + self.potts_model = {} + self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] + self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] + # Set the gap energy to zero + self.potts_model['h'][:, 0] = 0 + self.potts_model['J'][:, :, 0, :] = 0 + self.potts_model['J'][:, :, :, 0] = 0 + self._native_energy=None # don't know what this does + + + def compute_configurational_decoy_statistics(self): + raise NotImplementedError("Subclasses must define this method") + + def compute_configurational_energies(self): + raise NotImplementedError("Subclasses must define this method") + + def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): + mean_decoy_energy, std_decoy_energy = self.compute_configurational_decoy_statistics(n_decoys=n_decoys,aa_freq=aa_freq) + return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) + + + + + +class AWSEM(AWSEMBase): + + def __init__(self, + pdb_structure: object, + sequence: str =None, + expose_indicator_functions: bool=False, + **parameters)->object: + # assume the user wanted the sequence from the pdb 
structure if not given + if not sequence: + sequence = pdb_structure.sequence + # load structure-independent parameters and methods + super().__init__(sequence, expose_indicator_functions, **parameters) + # set up strucure + self.setup_structure(pdb_structure) + # calculate masks + if self.burial_in_context==True: + selected_matrix=self.full_pdb_distance_matrix + else: + selected_matrix=self.distance_matrix + self.sequence_mask_rho = frustration.compute_mask(selected_matrix, + maximum_contact_distance=None, + minimum_sequence_separation = self.p.min_sequence_separation_rho) + self.sequence_mask_contact = frustration.compute_mask(self.distance_matrix, + maximum_contact_distance=self.p.distance_cutoff_contact, + minimum_sequence_separation = self.p.min_sequence_separation_contact) + self.electrostatics_mask = frustration.compute_mask(self.distance_matrix, + maximum_contact_distance=None, + minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) + self.mask = frustration.compute_mask(self.distance_matrix, + maximum_contact_distance=self.distance_cutoff, + minimum_sequence_separation = self.sequence_cutoff) + self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function + self.setup_model() + + def setup_structure(self, pdb_structure): # check structure selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') resid = selection_CB.getResindices() N=len(resid) - if N != len(self.sequence): - raise ValueError("The pdb is incomplete. 
Try setting 'repair_pdb=True' when constructing the Structure object.") self.resid = resid self.N = N # set structure-dependent proterties @@ -136,34 +223,38 @@ def pdb_structure(self,pdb_structure): self.chain=pdb_structure.chain self.pdb_file=pdb_structure.pdb_file self.init_index_shift=pdb_structure.init_index_shift + self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict self.distance_matrix=pdb_structure.distance_matrix self.full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix - # reset indicator functions, energies, and potts model - self.compute_indicators_energies_potts_model() - def compute_indicators_energies_potts_model(self): - if self.burial_in_context==True: - selected_matrix=self.full_pdb_distance_matrix - else: - selected_matrix=self.distance_matrix - sequence_mask_rho = frustration.compute_mask(selected_matrix, - maximum_contact_distance=None, - minimum_sequence_separation = self.p.min_sequence_separation_rho) - sequence_mask_contact = frustration.compute_mask(self.distance_matrix, - maximum_contact_distance=self.p.distance_cutoff_contact, - minimum_sequence_separation = self.p.min_sequence_separation_contact) + @property + def pdb_structure(self): + return self._pdb_structure + @pdb_structure.setter + def pdb_structure(self,pdb_structure): + # reset structural attributes + self.setup_structure(pdb_structure) + # check that our new structure is compatible with our old one + if self.N != len(self.sequence): + raise ValueError("The pdb is incomplete. 
Try setting 'repair_pdb=True' when constructing the Structure object.") + self.calculate_indicators() + def change_conformation(alternative_pdb_structure): + # this function is an alias for the pdb_structure setter + self.pdb_structure = alternative_pdb_structure + + def calculate_indicators(self): # Calculate rho rho = 0.25 - rho *= (1 + np.tanh(self.p.eta * (selected_matrix- self.p.r_min))) - rho *= (1 + np.tanh(self.p.eta * (self.p.r_max - selected_matrix))) - rho *= sequence_mask_rho + rho *= (1 + np.tanh(self.p.eta * (self.selected_matrix - self.p.r_min))) + rho *= (1 + np.tanh(self.p.eta * (self.p.r_max - self.selected_matrix))) + rho *= self.sequence_mask_rho self.rho=rho #Calculate sigma water rho_r = (rho).sum(axis=1) if self.full_pdb_distance_matrix.shape!=self.distance_matrix.shape: if self.burial_in_context==True: - self.init_index_shift=pdb_structure.init_index_shift - self.fin_index_shift=pdb_structure.fin_index_shift + self.init_index_shift=self.pdb_structure.init_index_shift + self.fin_index_shift=self.pdb_structure.fin_index_shift rho_r=rho_r[self.init_index_shift:self.fin_index_shift] self.rho_r=rho_r rho_b = np.expand_dims(rho_r, 1) @@ -179,81 +270,54 @@ def compute_indicators_energies_potts_model(self): water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] # store indicators and gammas for our particular sequence as attributes - if self.expose_indicator_functions: - self.indicators=[] - self.indicators.append(burial_indicator[:,0]) - self.indicators.append(burial_indicator[:,1]) - self.indicators.append(burial_indicator[:,2]) - self.indicators.append(direct_indicator[:,:,0,0]*sequence_mask_contact) - self.indicators.append(protein_indicator[:,:,0,0]*sequence_mask_contact) - self.indicators.append(water_indicator[:,:,0,0]*sequence_mask_contact) - - self.gamma_array=[] - 
temp_burial_gamma=self.burial_gamma[self.aa_map_awsem_list] - temp_burial_gamma[0]=0 - temp_burial_gamma *= -0.5 * self.p.k_contact - self.gamma_array.append(temp_burial_gamma[:,0]) - self.gamma_array.append(temp_burial_gamma[:,1]) - self.gamma_array.append(temp_burial_gamma[:,2]) - - for contact_gamma in [self.direct_gamma, self.protein_gamma, self.water_gamma]: - temp_gamma = contact_gamma[self.aa_map_awsem_x, self.aa_map_awsem_y].copy() - temp_gamma[0, :] = 0 - temp_gamma[:, 0] = 0 - temp_gamma *= -0.5 * self.k_contact - self.gamma_array.append(temp_gamma) - - self.burial_indicator = burial_indicator - self.direct_indicator = direct_indicator - self.water_indicator = water_indicator - self.protein_indicator = protein_indicator - - - J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) - h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) - - # compute burial and contact energies - self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] - direct = direct_indicator * self.direct_gamma[J_index[2], J_index[3]] - water_mediated = water_indicator * self.water_gamma[J_index[2], J_index[3]] - protein_mediated = protein_indicator * self.protein_gamma[J_index[2], J_index[3]] - contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - # Compute electrostatics and add to contact energy - if self.p.k_electrostatics!=0: - self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) - self.distance_cutoff=None - electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) + self.indicators=[] + self.indicators.append(burial_indicator[:,0]) + 
self.indicators.append(burial_indicator[:,1]) + self.indicators.append(burial_indicator[:,2]) + self.indicators.append(direct_indicator[:,:,0,0]*self.sequence_mask_contact) + self.indicators.append(protein_indicator[:,:,0,0]*self.sequence_mask_contact) + self.indicators.append(water_indicator[:,:,0,0]*self.sequence_mask_contact) + self.gamma_array=[] + temp_burial_gamma=self.burial_gamma[self.aa_map_awsem_list] + temp_burial_gamma[0]=0 + temp_burial_gamma *= -0.5 * self.p.k_contact + self.gamma_array.append(temp_burial_gamma[:,0]) + self.gamma_array.append(temp_burial_gamma[:,1]) + self.gamma_array.append(temp_burial_gamma[:,2]) + for contact_gamma in [self.direct_gamma, self.protein_gamma, self.water_gamma]: + temp_gamma = contact_gamma[self.aa_map_awsem_x, self.aa_map_awsem_y].copy() + temp_gamma[0, :] = 0 + temp_gamma[:, 0] = 0 + temp_gamma *= -0.5 * self.k_contact + self.gamma_array.append(temp_gamma) + self.burial_indicator = burial_indicator # probably could get rid of either this or indicators list + self.direct_indicator = direct_indicator # probably could get rid of either this or indicators list + self.water_indicator = water_indicator # probably could get rid of either this or indicators list + self.protein_indicator = protein_indicator # probably could get rid of either this or indicators list + if self.p.k_electrostatics != 0: # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] - electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * electrostatics_mask - electrostatics_energy = -self.p.k_electrostatics * (charges2[np.newaxis,np.newaxis,:,:]*electrostatics_indicator[:,:,np.newaxis,np.newaxis]) - contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) 
- if self.expose_indicator_functions: - self.indicators.append(electrostatics_indicator) - temp_gamma=0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] - temp_gamma[0,:]=0 - temp_gamma[:,0]=0 - self.gamma_array.append(temp_gamma) - else: - self.sequence_cutoff=self.p.min_sequence_separation_contact - self.distance_cutoff=self.p.distance_cutoff_contact - self.mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.distance_cutoff, minimum_sequence_separation = self.sequence_cutoff) - self.contact_energy = contact_energy - - # Compute potts model - self.potts_model = {} - self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] - self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] - # Set the gap energy to zero - self.potts_model['h'][:, 0] = 0 - self.potts_model['J'][:, :, 0, :] = 0 - self.potts_model['J'][:, :, :, 0] = 0 - self._native_energy=None # don't know what this does - - def change_conformation(alternative_pdb_structure): - # this function is an alias for the pdb_structure setter - self.pdb_structure = alternative_pdb_structure + electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask + self.indicators.append(electrostatics_indicator) + self.electrostatics_indicator = electrostatics_indicator # probably could get rid of either this or indicators list + self.electrostatics_gamma = -self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y][1:,1:] + temp_gamma = 0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] + temp_gamma[0,:]=0 + temp_gamma[:,0]=0 + self.gamma_array.append(temp_gamma) + + def calculate_energy_and_potts(self): + super().calculate_energy_and_potts() + if not self.expose_indicator_functions: + del self.burial_indicator + del self.direct_indicator + del 
self.water_indicator + del self.protein_indicator + if "electrostatics_indicator" in dir(self): + # won't exist if electrostatics are turned off + del self.electrostatics_indicator + del self.indicators def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] @@ -370,9 +434,37 @@ def compute_configurational_energies(self): # import pandas as pd return configurational_energies #, pd.DataFrame(decoy_data, columns=decoy_data_columns) - def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): - mean_decoy_energy, std_decoy_energy = self.compute_configurational_decoy_statistics(n_decoys=n_decoys,aa_freq=aa_freq) - return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) + +class AWSEMIndicators(AWSEMBase): + + def __init__(self, + indicators: list, + sequence: str, # sequence is optional if we initialize from a Structure but not here + expose_indicator_functions: bool=False, + **parameters)->object: + """ + A stripped-down version of the AWSEM class that can be initialized from a set of indicator functions + + Parameters + ---------- + indicators : list + List of numpy.ndarray holding the indicator functions, with different decoys stacked along axis 0 + sequence : str + The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. + expose_indicator_functions: bool + If set to True, indicator functions of the contact and burial energy terms can be accessed by user. 
+ + Returns + ------- + AWSEMIndicators object + + """ + super().__init__(sequence, expose_indicator_functions, **parameters) + self.indicators = indicators + self.setup_model() + + def calculate_indicators(self): + pass # the function was initialized with indicators, so there's nothing to do class DecoyEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer @@ -408,11 +500,14 @@ def __init__(self, contact_protein_indicators = [] contact_water_indicators = [] electrostatics_indicators = [] + # the AWSEM class takes care of the indicator calculation (including masking) for us + # AWSEM normally accepts an amino acid sequence argument, but we don't need that here + # However, we do need to pass through parameters used to generate the indicator functions + awsem_obj = AWSEM(pdb_structures[0], expose_indicator_functions=True, **parameters) for pdb_structure in pdb_structures: - # the AWSEM class takes care of the indicator calculation (including masking) for us - # AWSEM normally accepts an amino acid sequence argument, but we don't need that here - # However, we do need to pass through parameters used to generate the indicator functions - all_indicators = AWSEM(pdb_structure, expose_indicator_functions=True, **parameters).indicators + awsem_obj.pdb_structure = pdb_structures[pdb_structure] # we can use the pdb_structure setter to update structural + # stuff without fully re-initializing the object + all_indicators = awsem_obj.indicators burial_low_density_indicators.append(all_indicators[0]) burial_medium_density_indicators.append(all_indicators[1]) burial_high_density_indicators.append(all_indicators[2]) @@ -428,3 +523,11 @@ def __init__(self, np.array(contact_protein_indicators), np.array(contact_water_indicators), np.array(electrostatics_indicators),] + # Although a DecoyEnsemble does not have an energy or frustration in the same sense as an AWSEM, + # it is helpful to attach gamma parameters to the DecoyEnsemble so that 
DecoyEnergyAverage, etc. + # can read them through class inheritance in the same way that AwsemEnergyAverage, etc. read the gammas + # from the AWSEM class. + # We take the gammas directly from the AWSEM that we used to get our indicators. + # Technically, we generated one AWSEM for each pdb_structure, but the parameters are identical, + # so we can for convenience read the gammas from the most recently initialized awsem_obj + self.gamma_array = awsem_obj.gamma_array diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 2ecf44cd..6ce30a82 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -7,7 +7,7 @@ from frustratometer.classes import Frustratometer from frustratometer.classes import Structure -from frustratometer.classes import AWSEM, AWSEMEnsemble +from frustratometer.classes import AWSEM, DecoyEnsemble from frustratometer.optimization.EnergyTerm import EnergyTerm from frustratometer.optimization.inner_product import compute_all_region_means from frustratometer.optimization.inner_product import build_mean_inner_product_matrix @@ -512,6 +512,171 @@ def regression_test(self, seq_index): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" +class EnsembleEnergyStatistics(EnergyTerm): + """ + An abstract class (but not using the ABC framework) for ensemble statistics that are distributive in the gammas, + meaning the quantity that we want to calculate, (_decoys), is equal to (gammas * _decoys). + This allows us to average the indicator functions over all decoys just once, and then conduct the MC search as we would for + a single sequence. + + When this class was initially written, the intended use cases are for EnsembleEnergyAverage and EnsembleEnergyStd. + These classes just need to call the EnsembleEnergyStatistics __init__ and define the collapse_indicators function. 
+ """ + def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): + # + # boilerplate from Awsem*Average classes + if not type(model)==DecoyEnsemble: + raise TypeError("EnsembleEnergyAverage may only be initialized from a DecoyEnsemble") + self.model=model + self._use_numba=use_numba + self.alphabet=alphabet + self.reindex_dca=[_AA.index(aa) for aa in alphabet] + self.alphabet_size=len(alphabet) + #TODO: Fix the gamma matrix to account for elecrostatics-- i think this is already done! + self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) + # + # collapse over decoys (axis 0 of our indicator arrays) + self.indicators = self.collapse_indicators(model.indicators) + + self.initialize_functions() + + def initialize_functions(self): + len_alphabet=self.alphabet_size + gamma=self.gamma + + # Precompute the mean of the indicators + means = [np.average(indicator_type, axis=-1) for indicator_type in self.indicators] + + def compute_energy(seq_index): + counts = np.zeros(len_alphabet, dtype=np.int64) + for val in seq_index: + counts[val] += 1 + + # Calculate phi_mean + phi_mean = np.zeros(len_alphabet*len_indicators1D + len_alphabet**2*len_indicators2D) + + # 1D indicators + c=0 + for i in range(len_indicators1D): + for j in range(len_alphabet): + phi_mean[c] = indicator_means[i] * counts[j] + c += 1 + + # 2D indicators + for i in range(len_indicators2D): + for j in range(len_alphabet): + for k in range(len_alphabet): + t=1 if j==k else 0 + phi_mean[c] = indicator_means[i+ len_indicators1D] * counts[j] * (counts[k] - t) + c += 1 + + # Calculate energy + energy = 0 + for i in range(phi_len): + energy += gamma[i] * phi_mean[i] + + return energy + + def denergy_mutation(seq_index, pos, aa): + counts = np.zeros(len_alphabet, dtype=np.int64) + for val in seq_index: + counts[val] += 1 + aa_old=seq_index[pos] + if aa_old==aa: + return 0. 
+ + # Calculate phi_mean + + dphi_mean = np.zeros(len_alphabet*len_indicators1D + len_alphabet**2*len_indicators2D) + + # 1D indicators + for i in range(len_indicators1D): + dphi_mean[i*len_alphabet + aa_old] -= indicator_means[i] + dphi_mean[i*len_alphabet + aa] += indicator_means[i] + + offset = len_alphabet*len_indicators1D + for i in range(len_indicators2D): + for j in range(len_alphabet): + k=aa_old + if j==k: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] -= 2 * indicator_means[i + len_indicators1D] * (counts[j]-1) + elif j==aa: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i+ len_indicators1D] * (counts[k] - counts[j] -1) + else: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] -= indicator_means[i + len_indicators1D] * counts[j] + dphi_mean[offset + i*len_alphabet**2 + k*len_alphabet + j] -= indicator_means[i + len_indicators1D] * counts[j] + k=aa + if j==k: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += 2 * indicator_means[i + len_indicators1D] * counts[j] + elif j==aa_old: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i+ len_indicators1D] * (counts[j] - counts[k] -1) + else: + dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i + len_indicators1D] * counts[j] + dphi_mean[offset + i*len_alphabet**2 + k*len_alphabet + j] += indicator_means[i + len_indicators1D] * counts[j] + + # Calculate energy + denergy = 0 + for i in range(phi_len): + denergy += gamma[i] * dphi_mean[i] + + return denergy + + self.compute_energy = compute_energy + self.compute_denergy_mutation = denergy_mutation + + awsem_energy = AwsemEnergy(use_numba=self.use_numba, model=self.model, alphabet=self.alphabet).energy_function + + def compute_energy_sample(seq_index,n_decoys=100000): + """ Function to compute the variance of the energy of permutations of a sequence using random shuffling. 
+ This function is much faster than compute_energy_permutation but is an approximation""" + energies=np.zeros(n_decoys) + shuffled_index=seq_index.copy() + for i in numba.prange(n_decoys): + energies[i]=awsem_energy(shuffled_index[np.random.permutation(len(shuffled_index))]) + return np.mean(energies) + + def compute_energy_permutation(seq_index): + """ Function to compute the variance of the energy of all permutations of a sequence + Caution: This function is very slow for normal sequences """ + from itertools import permutations + decoy_sequences = np.array(list(permutations(seq_index))) + energies=np.zeros(len(decoy_sequences)) + for i in numba.prange(len(decoy_sequences)): + energies[i]=awsem_energy(decoy_sequences[i]) + return np.mean(energies) + + def collapse_indicators(uncollapsed_indicators): + raise NotImplementedError("EnsembleEnergyStatistics is an abstract class. Use a subclass implementing this function.") + + self.compute_energy_sample=self.numbify(compute_energy_sample,parallel=True) + self.compute_energy_permutation=compute_energy_permutation + + def regression_test(self, seq_index): + expected_energy=self.compute_energy_permutation(seq_index) + energy=self.compute_energy(seq_index) + assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" + +class EnsembleEnergyAverage(EnsembleEnergyStatistics): + """ + See documentation for EnsembleEnergyStatistics + """ + def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): + super().__init__(model, use_numba, alphabet) + + def collapse_indicators(uncollapsed_indicators): + self.indicators = [np.average(indicator, axis=0) for indicator in uncollapsed_indicators] + +class EnsembleEnergyStd(EnsembleEnergyStatistics): + """ + See documentation for EnsembleEnergyStatistics + """ + def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): + super().__init__(model, use_numba, alphabet) + + def collapse_indicators(uncollapsed_indicators): + 
self.indicators = [np.std(indicator, axis=0) for indicator in uncollapsed_indicators] + + class AwsemEnergyVariance(EnergyTerm): def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self._use_numba=use_numba @@ -1111,10 +1276,8 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): pdb_list = ["tests/data/1r69.pdb"] pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) - ensemble = AWSEMEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=2, expose_indicator_functions=True) - ########################################################### - ensemble.mask = np.ones((63,63)) - ######################################################### + ensemble = DecoyEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=10) + reduced_alphabet = 'ADEFGHIKLMNQRSTVWY' From d0b23f35b20c58ded1bd80eb9f47c5487788dedd Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 10 Jul 2025 13:19:41 -0500 Subject: [PATCH 09/76] tests passing now (except for the dca tests requiring hmmer, which have been failing for a while) --- frustratometer/classes/AWSEM.py | 45 ++++++++++++++++++--- frustratometer/optimization/optimization.py | 11 ++--- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 38288dba..1462754b 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -7,7 +7,7 @@ from pydantic.types import Path from typing import List,Optional,Union,Generator -__all__ = ['AWSEM','DecoyEnsemble'] +__all__ = ['AWSEM','AWSEMIndicators','DecoyEnsemble'] class AWSEMParameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) @@ -74,6 +74,7 @@ def __init__(self, """ # set sequence based on argument + self.N = len(sequence) self.sequence = sequence # set indicator function exposure based on argument @@ -109,7 +110,7 @@ def __init__(self, 
self.burial_in_context = self.p.burial_in_context self.aa_freq = frustration.compute_aa_freq(self.sequence) self.contact_freq = frustration.compute_contact_freq(self.sequence) - if self.p.k_electrostatics == 0: + if self.p.k_electrostatics != 0: self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) self.distance_cutoff=None # the distance matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- @@ -204,9 +205,14 @@ def __init__(self, self.electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) + with open('my_data.txt','w') as f: + f.write(f"self.distance_cutoff: {self.distance_cutoff}\n") + f.write(f"self.sequence_cutoff: {self.sequence_cutoff}\n") + np.save('my_distance_matrix.npy',self.distance_matrix) self.mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.distance_cutoff, minimum_sequence_separation = self.sequence_cutoff) + np.save('my_mask_new.npy',self.mask) self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function self.setup_model() @@ -301,7 +307,7 @@ def calculate_indicators(self): electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask self.indicators.append(electrostatics_indicator) self.electrostatics_indicator = electrostatics_indicator # probably could get rid of either this or indicators list - self.electrostatics_gamma = -self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y][1:,1:] + self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] temp_gamma = 0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] temp_gamma[0,:]=0 temp_gamma[:,0]=0 @@ -439,6 +445,10 
@@ class AWSEMIndicators(AWSEMBase): def __init__(self, indicators: list, + burial_indicator, + direct_indicator, + protein_indicator, + water_indicator, sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, **parameters)->object: @@ -460,7 +470,12 @@ def __init__(self, """ super().__init__(sequence, expose_indicator_functions, **parameters) + self.N self.indicators = indicators + self.burial_indicator = burial_indicator + self.direct_indicator = direct_indicator + self.protein_indicator = protein_indicator + self.water_indicator = water_indicator self.setup_model() def calculate_indicators(self): @@ -500,13 +515,18 @@ def __init__(self, contact_protein_indicators = [] contact_water_indicators = [] electrostatics_indicators = [] + burial_indicators_other = [] + direct_indicators_other = [] + protein_indicators_other = [] + water_indicators_other = [] + electrostatics_indicators_other = [] # the AWSEM class takes care of the indicator calculation (including masking) for us # AWSEM normally accepts an amino acid sequence argument, but we don't need that here # However, we do need to pass through parameters used to generate the indicator functions - awsem_obj = AWSEM(pdb_structures[0], expose_indicator_functions=True, **parameters) + awsem_obj = AWSEM(next(pdb_structures), expose_indicator_functions=True, **parameters) for pdb_structure in pdb_structures: - awsem_obj.pdb_structure = pdb_structures[pdb_structure] # we can use the pdb_structure setter to update structural - # stuff without fully re-initializing the object + awsem_obj.pdb_structure = pdb_structure # we can use the pdb_structure setter to update structural + # stuff without fully re-initializing the object all_indicators = awsem_obj.indicators burial_low_density_indicators.append(all_indicators[0]) burial_medium_density_indicators.append(all_indicators[1]) @@ -515,6 +535,11 @@ def __init__(self, 
contact_protein_indicators.append(all_indicators[4]) contact_water_indicators.append(all_indicators[5]) electrostatics_indicators.append(all_indicators[6]) + burial_indicators_other.append(awsem_obj.electrostatics_indicator) + direct_indicators_other.append(awsem_obj.electrostatics_indicator) + protein_indicators_other.append(awsem_obj.electrostatics_indicator) + water_indicators_other.append(awsem_obj.electrostatics_indicator) + electrostatics_indicators_other.append(awsem_obj.electrostatics_indicator) # the idea here is that we have different conformers of a chain of a particular length; # if not, we'll get a ValueError when trying to initialize numpy arrays from a ragged set of lists self.indicators = [np.array(burial_low_density_indicators), @@ -523,6 +548,11 @@ def __init__(self, np.array(contact_protein_indicators), np.array(contact_water_indicators), np.array(electrostatics_indicators),] + self.burial_indicator = np.average(np.array(burial_indicators_other),axis=0) + self.direct_indicator = np.average(np.array(direct_indicators_other),axis=0) + self.protein_indicator = np.average(np.array(protein_indicators_other),axis=0) + self.water_indicator = np.average(np.array(water_indicators_other),axis=0) + self.electrostatics_indicator = np.average(np.array(electrostatics_indicators_other),axis=0) # Although a DecoyEnsemble does not have an energy or frustration in the same sense as an AWSEM, # it is helpful to attach gamma parameters to the DecoyEnsemble so that DecoyEnergyAverage, etc. # can read them through class inheritance in the same way that AwsemEnergyAverage, etc. 
read the gammas @@ -531,3 +561,6 @@ def __init__(self, # Technically, we generated one AWSEM for each pdb_structure, but the parameters are identical, # so we can for convenience read the gammas from the most recently initialized awsem_obj self.gamma_array = awsem_obj.gamma_array + + def average(self): + return [np.average(indicator, axis=0) for indicator in self.indicators], self.burial_indicator, self.direct_indicator, self.protein_indicator, self.water_indicator \ No newline at end of file diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 6ce30a82..bc2abe24 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -7,7 +7,7 @@ from frustratometer.classes import Frustratometer from frustratometer.classes import Structure -from frustratometer.classes import AWSEM, DecoyEnsemble +from frustratometer.classes import AWSEM, AWSEMIndicators, DecoyEnsemble from frustratometer.optimization.EnergyTerm import EnergyTerm from frustratometer.optimization.inner_product import compute_all_region_means from frustratometer.optimization.inner_product import build_mean_inner_product_matrix @@ -1274,14 +1274,15 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): if __name__ == '__main__': - pdb_list = ["tests/data/1r69.pdb"] + pdb_list = ["tests/data/1r69.pdb","tests/data/1r69.pdb","tests/data/1r69.pdb"] pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) - ensemble = DecoyEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=10) - + ensemble = DecoyEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=10) + indicators, burial_indicators, direct_indicators, protein_indicators, water_indicators = ensemble.average() + average = AWSEMIndicators(indicators, burial_indicators, direct_indicators, protein_indicators, 
water_indicators,"SISSRVKSKRIQLGLNQAELAQKVGTTQQSIEQLENGKTKRPRFLPELASALGVSVDWLLNGT") reduced_alphabet = 'ADEFGHIKLMNQRSTVWY' - awsem_energy = AwsemEnergy(ensemble, alphabet=reduced_alphabet) + awsem_energy = AwsemEnergy(average, alphabet=reduced_alphabet) heterogeneity = Heterogeneity(exact=False, use_numba=True) energy_mix = awsem_energy - 10*heterogeneity From 9fe70c2c1f7f5e4ce74683fce7e2f9da729651bb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 10 Jul 2025 17:10:59 -0500 Subject: [PATCH 10/76] tests still passing (except for that hmmer issue) and AWSEMIndicators/DecoyEnsemble now working as expected --- frustratometer/classes/AWSEM.py | 100 ++++++++------------ frustratometer/classes/__init__.py | 2 +- frustratometer/optimization/optimization.py | 7 +- 3 files changed, 45 insertions(+), 64 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 1462754b..00358d0a 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -37,7 +37,6 @@ class AWSEMParameters(BaseModel): r_maxII: float = Field(9.5, description="Maximum distance for mediated contact potential. (Angstrom)") eta_sigma: float = Field(7.0, description="Sharpness of the density-based switching function between protein-mediated and water-mediated contacts.") - #Membrane membrane_gamma: Union[Path,Gamma] = Field(_path/'data'/'AWSEM_membrane_2015.json', description="File or Gamma object containing the membrane Gamma values (for membrane proteins)") eta_switching: int = Field(10, description="Switching distance for the membrane switching function") @@ -46,6 +45,8 @@ class AWSEMParameters(BaseModel): min_sequence_separation_electrostatics: Optional[int] = Field(1, description="Minimum sequence separation for electrostatics calculation.") k_electrostatics: float = Field(17.3636, description="Coefficient for electrostatic interactions. 
(kJ/mol)") electrostatics_screening_length: float = Field(10, description="Screening length for electrostatic interactions. (Angstrom)") + charges: np.array = Field(np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]), description="Charge on each residue type") + # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] class AWSEMBase(Frustratometer): @@ -110,18 +111,21 @@ def __init__(self, self.burial_in_context = self.p.burial_in_context self.aa_freq = frustration.compute_aa_freq(self.sequence) self.contact_freq = frustration.compute_contact_freq(self.sequence) + charges2 = self.p.charges[:,np.newaxis] * self.p.charges[np.newaxis,:] if self.p.k_electrostatics != 0: self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) self.distance_cutoff=None # the distance matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- # it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix + self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] else: self.sequence_cutoff=self.p.min_sequence_separation_contact self.distance_cutoff=self.p.distance_cutoff_contact # the distance matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- # it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix + self.charges2 = charges2 # ?????? 
self._decoy_fluctuation = {} # don't know what this does self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ @@ -301,14 +305,10 @@ def calculate_indicators(self): self.water_indicator = water_indicator # probably could get rid of either this or indicators list self.protein_indicator = protein_indicator # probably could get rid of either this or indicators list if self.p.k_electrostatics != 0: - # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] - charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) - charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask self.indicators.append(electrostatics_indicator) self.electrostatics_indicator = electrostatics_indicator # probably could get rid of either this or indicators list - self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] - temp_gamma = 0.5 * self.p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] + temp_gamma = 0.5 * self.p.k_electrostatics * self.charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] temp_gamma[0,:]=0 temp_gamma[:,0]=0 self.gamma_array.append(temp_gamma) @@ -444,11 +444,11 @@ def compute_configurational_energies(self): class AWSEMIndicators(AWSEMBase): def __init__(self, - indicators: list, burial_indicator, direct_indicator, protein_indicator, water_indicator, + electrostatics_indicator, sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, **parameters)->object: @@ -470,23 +470,23 @@ def __init__(self, """ super().__init__(sequence, expose_indicator_functions, **parameters) - self.N - self.indicators = indicators self.burial_indicator = burial_indicator self.direct_indicator = 
direct_indicator self.protein_indicator = protein_indicator self.water_indicator = water_indicator + self.electrostatics_indicator = electrostatics_indicator + self.sequence_mask_contact = np.full((self.N,self.N), True) + self.mask = np.full((self.N,self.N), True) + # mask should have been applied when calculating the indicator functions, + # so we set it such that no further masking is performed self.setup_model() def calculate_indicators(self): pass # the function was initialized with indicators, so there's nothing to do -class DecoyEnsemble(): # don't think it's necessary for this one to inherit from Frustratometer - # also, note that the functions compute_configurational_decoy_statistics, - # compute_configurational_energies, and configuration_frustration are - # present in the AWSEM class but removed here, since we don't expect to - # compute frustration on an entire ensemble +class DecoyEnsemble(): + def __init__(self, pdb_structures: Generator[object,None,None], **parameters)->object: @@ -508,18 +508,11 @@ def __init__(self, The numpy arrays' first axes vary the structure, while the second axis and third axis, if it exists, hold the (appropriately masked) indicator functions for each residue or pair of residues """ - burial_low_density_indicators = [] - burial_medium_density_indicators = [] - burial_high_density_indicators = [] - contact_direct_indicators = [] - contact_protein_indicators = [] - contact_water_indicators = [] + burial_indicators = [] + direct_indicators = [] + protein_indicators = [] + water_indicators = [] electrostatics_indicators = [] - burial_indicators_other = [] - direct_indicators_other = [] - protein_indicators_other = [] - water_indicators_other = [] - electrostatics_indicators_other = [] # the AWSEM class takes care of the indicator calculation (including masking) for us # AWSEM normally accepts an amino acid sequence argument, but we don't need that here # However, we do need to pass through parameters used to generate the 
indicator functions @@ -527,40 +520,27 @@ def __init__(self, for pdb_structure in pdb_structures: awsem_obj.pdb_structure = pdb_structure # we can use the pdb_structure setter to update structural # stuff without fully re-initializing the object - all_indicators = awsem_obj.indicators - burial_low_density_indicators.append(all_indicators[0]) - burial_medium_density_indicators.append(all_indicators[1]) - burial_high_density_indicators.append(all_indicators[2]) - contact_direct_indicators.append(all_indicators[3]) - contact_protein_indicators.append(all_indicators[4]) - contact_water_indicators.append(all_indicators[5]) - electrostatics_indicators.append(all_indicators[6]) - burial_indicators_other.append(awsem_obj.electrostatics_indicator) - direct_indicators_other.append(awsem_obj.electrostatics_indicator) - protein_indicators_other.append(awsem_obj.electrostatics_indicator) - water_indicators_other.append(awsem_obj.electrostatics_indicator) - electrostatics_indicators_other.append(awsem_obj.electrostatics_indicator) - # the idea here is that we have different conformers of a chain of a particular length; - # if not, we'll get a ValueError when trying to initialize numpy arrays from a ragged set of lists - self.indicators = [np.array(burial_low_density_indicators), - np.array(burial_medium_density_indicators), - np.array(contact_direct_indicators), - np.array(contact_protein_indicators), - np.array(contact_water_indicators), - np.array(electrostatics_indicators),] - self.burial_indicator = np.average(np.array(burial_indicators_other),axis=0) - self.direct_indicator = np.average(np.array(direct_indicators_other),axis=0) - self.protein_indicator = np.average(np.array(protein_indicators_other),axis=0) - self.water_indicator = np.average(np.array(water_indicators_other),axis=0) - self.electrostatics_indicator = np.average(np.array(electrostatics_indicators_other),axis=0) - # Although a DecoyEnsemble does not have an energy or frustration in the same sense as an AWSEM, 
- # it is helpful to attach gamma parameters to the DecoyEnsemble so that DecoyEnergyAverage, etc. - # can read them through class inheritance in the same way that AwsemEnergyAverage, etc. read the gammas - # from the AWSEM class. - # We take the gammas directly from the AWSEM that we used to get our indicators. - # Technically, we generated one AWSEM for each pdb_structure, but the parameters are identical, - # so we can for convenience read the gammas from the most recently initialized awsem_obj - self.gamma_array = awsem_obj.gamma_array + burial_indicators.append(awsem_obj.burial_indicator) + direct_indicators.append(awsem_obj.direct_indicator) + protein_indicators.append(awsem_obj.protein_indicator) + water_indicators.append(awsem_obj.water_indicator) + if hasattr(awsem_obj, 'electrostatics_indicator'): + electrostatics_indicators.append(awsem_obj.electrostatics_indicator) + # Stack and average indicators, ensuring correct shape for calculate_energy_and_potts + self.burial_indicator = np.mean(np.stack(burial_indicators, axis=0), axis=0) # (N, 3) + self.direct_indicator = np.mean(np.stack(direct_indicators, axis=0), axis=0) # (N, N, 1, 1) + self.protein_indicator = np.mean(np.stack(protein_indicators, axis=0), axis=0) # (N, N, 1, 1) + self.water_indicator = np.mean(np.stack(water_indicators, axis=0), axis=0) # (N, N, 1, 1) + if electrostatics_indicators: + self.electrostatics_indicator = np.mean(np.stack(electrostatics_indicators, axis=0), axis=0) # (N, N) + else: + self.electrostatics_indicator = None + # Attach gamma parameters from the AWSEM object + self.burial_gamma = awsem_obj.burial_gamma + self.direct_gamma = awsem_obj.direct_gamma + self.protein_gamma = awsem_obj.protein_gamma + self.water_gamma = awsem_obj.water_gamma + self.electrostatics_gamma = getattr(awsem_obj, 'electrostatics_gamma', None) def average(self): - return [np.average(indicator, axis=0) for indicator in self.indicators], self.burial_indicator, self.direct_indicator, 
self.protein_indicator, self.water_indicator \ No newline at end of file + return self.burial_indicator, self.direct_indicator, self.protein_indicator, self.water_indicator, self.electrostatics_indicator \ No newline at end of file diff --git a/frustratometer/classes/__init__.py b/frustratometer/classes/__init__.py index 4ff9b4b5..fd8b79b8 100644 --- a/frustratometer/classes/__init__.py +++ b/frustratometer/classes/__init__.py @@ -7,7 +7,7 @@ """ from .DCA import DCA -from .AWSEM import AWSEM, DecoyEnsemble +from .AWSEM import AWSEM, AWSEMIndicators, DecoyEnsemble from .Structure import Structure from .Map import Map from .Gamma import Gamma diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index bc2abe24..34f93c78 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -1277,9 +1277,10 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): pdb_list = ["tests/data/1r69.pdb","tests/data/1r69.pdb","tests/data/1r69.pdb"] pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) ensemble = DecoyEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=10) - indicators, burial_indicators, direct_indicators, protein_indicators, water_indicators = ensemble.average() - average = AWSEMIndicators(indicators, burial_indicators, direct_indicators, protein_indicators, water_indicators,"SISSRVKSKRIQLGLNQAELAQKVGTTQQSIEQLENGKTKRPRFLPELASALGVSVDWLLNGT") - + burial_indicators, direct_indicators, protein_indicators, water_indicators, electrostatics_indicators = ensemble.average() + average = AWSEMIndicators(burial_indicators, direct_indicators, protein_indicators, water_indicators, electrostatics_indicators, + "SISSRVKSKRIQLGLNQAELAQKVGTTQQSIEQLENGKTKRPRFLPELASALGVSVDWLLNGT") + reduced_alphabet = 'ADEFGHIKLMNQRSTVWY' awsem_energy = AwsemEnergy(average, alphabet=reduced_alphabet) From 
d3386c9dee09cf4be56c0cbcaae44f4e6ef7bfbb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 10 Jul 2025 17:18:38 -0500 Subject: [PATCH 11/76] got rid of unused classes --- frustratometer/optimization/optimization.py | 164 -------------------- 1 file changed, 164 deletions(-) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 34f93c78..416f195f 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -512,170 +512,6 @@ def regression_test(self, seq_index): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" -class EnsembleEnergyStatistics(EnergyTerm): - """ - An abstract class (but not using the ABC framework) for ensemble statistics that are distributive in the gammas, - meaning the quantity that we want to calculate, (_decoys), is equal to (gammas * _decoys). - This allows us to average the indicator functions over all decoys just once, and then conduct the MC search as we would for - a single sequence. - - When this class was initially written, the intended use cases are for EnsembleEnergyAverage and EnsembleEnergyStd. - These classes just need to call the EnsembleEnergyStatistics __init__ and define the collapse_indicators function. - """ - def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): - # - # boilerplate from Awsem*Average classes - if not type(model)==DecoyEnsemble: - raise TypeError("EnsembleEnergyAverage may only be initialized from a DecoyEnsemble") - self.model=model - self._use_numba=use_numba - self.alphabet=alphabet - self.reindex_dca=[_AA.index(aa) for aa in alphabet] - self.alphabet_size=len(alphabet) - #TODO: Fix the gamma matrix to account for elecrostatics-- i think this is already done! 
- self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) - # - # collapse over decoys (axis 0 of our indicator arrays) - self.indicators = self.collapse_indicators(model.indicators) - - self.initialize_functions() - - def initialize_functions(self): - len_alphabet=self.alphabet_size - gamma=self.gamma - - # Precompute the mean of the indicators - means = [np.average(indicator_type, axis=-1) for indicator_type in self.indicators] - - def compute_energy(seq_index): - counts = np.zeros(len_alphabet, dtype=np.int64) - for val in seq_index: - counts[val] += 1 - - # Calculate phi_mean - phi_mean = np.zeros(len_alphabet*len_indicators1D + len_alphabet**2*len_indicators2D) - - # 1D indicators - c=0 - for i in range(len_indicators1D): - for j in range(len_alphabet): - phi_mean[c] = indicator_means[i] * counts[j] - c += 1 - - # 2D indicators - for i in range(len_indicators2D): - for j in range(len_alphabet): - for k in range(len_alphabet): - t=1 if j==k else 0 - phi_mean[c] = indicator_means[i+ len_indicators1D] * counts[j] * (counts[k] - t) - c += 1 - - # Calculate energy - energy = 0 - for i in range(phi_len): - energy += gamma[i] * phi_mean[i] - - return energy - - def denergy_mutation(seq_index, pos, aa): - counts = np.zeros(len_alphabet, dtype=np.int64) - for val in seq_index: - counts[val] += 1 - aa_old=seq_index[pos] - if aa_old==aa: - return 0. 
- - # Calculate phi_mean - - dphi_mean = np.zeros(len_alphabet*len_indicators1D + len_alphabet**2*len_indicators2D) - - # 1D indicators - for i in range(len_indicators1D): - dphi_mean[i*len_alphabet + aa_old] -= indicator_means[i] - dphi_mean[i*len_alphabet + aa] += indicator_means[i] - - offset = len_alphabet*len_indicators1D - for i in range(len_indicators2D): - for j in range(len_alphabet): - k=aa_old - if j==k: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] -= 2 * indicator_means[i + len_indicators1D] * (counts[j]-1) - elif j==aa: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i+ len_indicators1D] * (counts[k] - counts[j] -1) - else: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] -= indicator_means[i + len_indicators1D] * counts[j] - dphi_mean[offset + i*len_alphabet**2 + k*len_alphabet + j] -= indicator_means[i + len_indicators1D] * counts[j] - k=aa - if j==k: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += 2 * indicator_means[i + len_indicators1D] * counts[j] - elif j==aa_old: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i+ len_indicators1D] * (counts[j] - counts[k] -1) - else: - dphi_mean[offset + i*len_alphabet**2 + j*len_alphabet + k] += indicator_means[i + len_indicators1D] * counts[j] - dphi_mean[offset + i*len_alphabet**2 + k*len_alphabet + j] += indicator_means[i + len_indicators1D] * counts[j] - - # Calculate energy - denergy = 0 - for i in range(phi_len): - denergy += gamma[i] * dphi_mean[i] - - return denergy - - self.compute_energy = compute_energy - self.compute_denergy_mutation = denergy_mutation - - awsem_energy = AwsemEnergy(use_numba=self.use_numba, model=self.model, alphabet=self.alphabet).energy_function - - def compute_energy_sample(seq_index,n_decoys=100000): - """ Function to compute the variance of the energy of permutations of a sequence using random shuffling. 
- This function is much faster than compute_energy_permutation but is an approximation""" - energies=np.zeros(n_decoys) - shuffled_index=seq_index.copy() - for i in numba.prange(n_decoys): - energies[i]=awsem_energy(shuffled_index[np.random.permutation(len(shuffled_index))]) - return np.mean(energies) - - def compute_energy_permutation(seq_index): - """ Function to compute the variance of the energy of all permutations of a sequence - Caution: This function is very slow for normal sequences """ - from itertools import permutations - decoy_sequences = np.array(list(permutations(seq_index))) - energies=np.zeros(len(decoy_sequences)) - for i in numba.prange(len(decoy_sequences)): - energies[i]=awsem_energy(decoy_sequences[i]) - return np.mean(energies) - - def collapse_indicators(uncollapsed_indicators): - raise NotImplementedError("EnsembleEnergyStatistics is an abstract class. Use a subclass implementing this function.") - - self.compute_energy_sample=self.numbify(compute_energy_sample,parallel=True) - self.compute_energy_permutation=compute_energy_permutation - - def regression_test(self, seq_index): - expected_energy=self.compute_energy_permutation(seq_index) - energy=self.compute_energy(seq_index) - assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" - -class EnsembleEnergyAverage(EnsembleEnergyStatistics): - """ - See documentation for EnsembleEnergyStatistics - """ - def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): - super().__init__(model, use_numba, alphabet) - - def collapse_indicators(uncollapsed_indicators): - self.indicators = [np.average(indicator, axis=0) for indicator in uncollapsed_indicators] - -class EnsembleEnergyStd(EnsembleEnergyStatistics): - """ - See documentation for EnsembleEnergyStatistics - """ - def __init__(self, model:DecoyEnsemble, use_numba=True, alphabet=_AA): - super().__init__(model, use_numba, alphabet) - - def collapse_indicators(uncollapsed_indicators): - 
self.indicators = [np.std(indicator, axis=0) for indicator in uncollapsed_indicators] - class AwsemEnergyVariance(EnergyTerm): def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): From 88834d3c174edc1c040e17b60ab964b3523b6d35 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 10 Jul 2025 17:27:29 -0500 Subject: [PATCH 12/76] updated documentation --- frustratometer/classes/AWSEM.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 00358d0a..c58cb1a1 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -444,11 +444,11 @@ def compute_configurational_energies(self): class AWSEMIndicators(AWSEMBase): def __init__(self, - burial_indicator, - direct_indicator, - protein_indicator, - water_indicator, - electrostatics_indicator, + burial_indicator: np.ndarray, + direct_indicator: np.ndarray, + protein_indicator: np.ndarray, + water_indicator: np.ndarray, + electrostatics_indicator: Union[np.ndarray, None], sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, **parameters)->object: @@ -457,8 +457,17 @@ def __init__(self, Parameters ---------- - indicators : list - List of numpy.ndarray holding the indicator functions, with different decoys stacked along axis 0 + burial_indicator : np.ndarray + Burial indicator array, most likely accessed using the burial_indicator attribute of an AWSEM + direct_indicator : np.ndarray + Direct indicator array, most likely accessed using the direct_indicator attribute of an AWSEM + protein_indicator : np.ndarray + Protein indicator array, most likely accessed using the protein_indicator attribute of an AWSEM + water_indicator : np.ndarray + Water indicator array, most likely accessed using the water_indicator attribute of an AWSEM + electrostatics_indicator : Union[np.ndarray, None] + Electrostatics 
indicator array, most likely accessed using the electrostatics_indicator attribute of an AWSEM. + May be None is electrostatics were turned off (k_electrostatics=0). sequence : str The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. expose_indicator_functions: bool @@ -503,10 +512,7 @@ def __init__(self, Returns ------- - DecoyEnsemble object, which holds a list of 7 numpy arrays, ordered by indicator function type: - [low density (rho), medium density (rho), high density (rho), direct, protein, water, electrostatics]. - The numpy arrays' first axes vary the structure, while the second axis and third axis, if it exists, - hold the (appropriately masked) indicator functions for each residue or pair of residues + DecoyEnsemble object, which holds indicator arrays and gammas computed by the AWSEM class. """ burial_indicators = [] direct_indicators = [] From 2a88e03a9be80aaa4d98bc8f0664df73b02681a2 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 10:08:31 -0500 Subject: [PATCH 13/76] refactored DecoyEnsemble to minimize memory usage and implemented avg/std classes --- frustratometer/classes/AWSEM.py | 168 ++++++++++++++++++++++++++------ 1 file changed, 136 insertions(+), 32 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index c58cb1a1..bf90fa86 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -178,9 +178,6 @@ def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) - - - class AWSEM(AWSEMBase): def __init__(self, @@ -223,7 +220,7 @@ def __init__(self, def setup_structure(self, pdb_structure): # check structure selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') - resid = selection_CB.getResindices() + resid = list(set(selection_CB.getResindices())) # sometimes, it decides to 
split a single residue in 2 N=len(resid) self.resid = resid self.N = N @@ -246,6 +243,7 @@ def pdb_structure(self,pdb_structure): self.setup_structure(pdb_structure) # check that our new structure is compatible with our old one if self.N != len(self.sequence): + import pdb; pdb.set_trace() raise ValueError("The pdb is incomplete. Try setting 'repair_pdb=True' when constructing the Structure object.") self.calculate_indicators() def change_conformation(alternative_pdb_structure): @@ -512,41 +510,147 @@ def __init__(self, Returns ------- - DecoyEnsemble object, which holds indicator arrays and gammas computed by the AWSEM class. + DecoyEnsemble object, which holds indicator arrays (and gammas???) computed by the AWSEM class. """ - burial_indicators = [] - direct_indicators = [] - protein_indicators = [] - water_indicators = [] - electrostatics_indicators = [] # the AWSEM class takes care of the indicator calculation (including masking) for us # AWSEM normally accepts an amino acid sequence argument, but we don't need that here # However, we do need to pass through parameters used to generate the indicator functions - awsem_obj = AWSEM(next(pdb_structures), expose_indicator_functions=True, **parameters) + awsem_obj = AWSEM(next(pdb_structures), expose_indicator_functions=True, repair_pdb=True, **parameters) for pdb_structure in pdb_structures: awsem_obj.pdb_structure = pdb_structure # we can use the pdb_structure setter to update structural # stuff without fully re-initializing the object - burial_indicators.append(awsem_obj.burial_indicator) - direct_indicators.append(awsem_obj.direct_indicator) - protein_indicators.append(awsem_obj.protein_indicator) - water_indicators.append(awsem_obj.water_indicator) - if hasattr(awsem_obj, 'electrostatics_indicator'): - electrostatics_indicators.append(awsem_obj.electrostatics_indicator) + with open('burial_indicators.npy','ab') as f: + np.save(f,awsem_obj.burial_indicator) + with open('direct_indicators.npy','ab') as f: + 
np.save(f,awsem_obj.direct_indicator) + with open('protein_indicators.npy','ab') as f: + np.save(f,awsem_obj.protein_indicator) + with open('water_indicators.npy','ab') as f: + np.save(f,awsem_obj.water_indicator) + with open('electrostatics_indicators.npy','ab') as f: + if hasattr(awsem_obj, 'electrostatics_indicator'): + np.save(f,awsem_obj.electrostatics_indicator) + else: + np.save(f,None) # Stack and average indicators, ensuring correct shape for calculate_energy_and_potts - self.burial_indicator = np.mean(np.stack(burial_indicators, axis=0), axis=0) # (N, 3) - self.direct_indicator = np.mean(np.stack(direct_indicators, axis=0), axis=0) # (N, N, 1, 1) - self.protein_indicator = np.mean(np.stack(protein_indicators, axis=0), axis=0) # (N, N, 1, 1) - self.water_indicator = np.mean(np.stack(water_indicators, axis=0), axis=0) # (N, N, 1, 1) - if electrostatics_indicators: - self.electrostatics_indicator = np.mean(np.stack(electrostatics_indicators, axis=0), axis=0) # (N, N) - else: - self.electrostatics_indicator = None - # Attach gamma parameters from the AWSEM object - self.burial_gamma = awsem_obj.burial_gamma - self.direct_gamma = awsem_obj.direct_gamma - self.protein_gamma = awsem_obj.protein_gamma - self.water_gamma = awsem_obj.water_gamma - self.electrostatics_gamma = getattr(awsem_obj, 'electrostatics_gamma', None) + self.burial_indicators = self.get_indicators("burial_indicators.npy") + self.direct_indicators = self.get_indicators("direct_indicators.npy") + self.protein_indicators = self.get_indicators("protein_indicators.npy") + self.water_indicators = self.get_indicators("water_indicators.npy") + self.electrostatics_indicators = self.get_indicators("electrostatics_indicators.npy") + + # averages are needed to compute standard deviation, so it's useful to have them as attributes + self.avg_burial = None + self.avg_direct = None + self.avg_prot = None + self.avg_wat = None + self.avg_elect = None + + ## Attach gamma parameters from the AWSEM object + ## 
Kind of off-topic from my current use of this class + #self.burial_gamma = awsem_obj.burial_gamma + #self.direct_gamma = awsem_obj.direct_gamma + #self.protein_gamma = awsem_obj.protein_gamma + #self.water_gamma = awsem_obj.water_gamma + #self.electrostatics_gamma = getattr(awsem_obj, 'electrostatics_gamma', None) + + # this might help with memory + del awsem_obj + + def get_indicators(self, filename): + # expecting a numpy file + with open(filename, 'rb') as f: + while True: + try: + yield np.load(f, allow_pickle=True) # needed to load None if not electrostatics + except EOFError: + break def average(self): - return self.burial_indicator, self.direct_indicator, self.protein_indicator, self.water_indicator, self.electrostatics_indicator \ No newline at end of file + # average burial computation from generator + avg_burial = 0 + counter = 0 + for array in self.burial_indicators: + counter += 1 + burial_indicator += array + avg_burial /= counter + self.avg_burial = avg_burial + # average direct computation from generator + avg_direct = 0 + counter = 0 + for array in self.direct_indicators: + counter += 1 + direct_indicator += array + avg_direct /= counter + self.avg_direct = avg_direct + # average prot computation from generator + avg_prot = 0 + counter = 0 + for array in self.protein_indicators: + counter += 1 + protein_indicator += array + avg_prot /= counter + self.avg_prot = avg_prot + # average wat computation from generator + avg_wat = 0 + counter = 0 + for array in self.water_indicators: + counter += 1 + water_indicator += array + avg_wat /= counter + self.avg_wat = avg_wat + # average elec computation from generator + # if not defined, set to zero, which will have no impact + if self.electrostatics_indicators == None: + avg_elec = 0 + else: + avg_elec = 0 + counter = 0 + for array in self.electrostatics_indicators: + counter += 1 + avg_elec += array + avg_elec /= counter + self.avg_elec = avg_elec + return self.avg_burial, self.avg_direct, self.avg_prot, 
self.avg_wat, self.avg_elec + + def std(self): + if self.avg_burial is None or self.avg_direct is None or \ + self.avg_prot is None or self.avg_wat is None or \ + self.avg_elec is None: + self.average() # compute averages if not already done + # std burial computation from generator and previously computed average + std_burial = 0 + counter = 0 + for array in self.burial_indicators: + counter += 1 + std_burial += (array - self.avg_burial) ** 2 + std_burial = np.sqrt(std_burial / counter) + # std direct computation from generator and previously computed average + std_direct = 0 + counter = 0 + for array in self.direct_indicators: + counter += 1 + std_direct += (array - self.avg_direct) ** 2 + std_direct = np.sqrt(std_direct / counter) + # std prot computation from generator and previously computed average + std_prot = 0 + counter = 0 + for array in self.protein_indicators: + counter += 1 + std_prot += (array - self.avg_prot) ** 2 + std_prot = np.sqrt(std_prot / counter) + # std wat computation from generator and previously computed average + std_wat = 0 + counter = 0 + for array in self.water_indicators: + counter += 1 + std_wat += (array - self.avg_wat) ** 2 + std_wat = np.sqrt(std_wat / counter) + # std elec computation from generator and previously computed average + std_elec = 0 + counter = 0 + for array in self.electrostatics_indicators: + counter += 1 + std_elec += (array - self.avg_elec) ** 2 + std_elec = np.sqrt(std_elec / counter) + return std_burial, std_direct, std_prot, std_wat, std_elec \ No newline at end of file From 790adce77afd970a7ef1315032f5283c299d5907 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 11:36:03 -0500 Subject: [PATCH 14/76] got pdb.py from my amyloid_atlas branch, which allows prody for loading the structures. 
this probably wasn't necessary --- frustratometer/pdb/pdb.py | 87 +++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/frustratometer/pdb/pdb.py b/frustratometer/pdb/pdb.py index 853e6e62..2feeb658 100644 --- a/frustratometer/pdb/pdb.py +++ b/frustratometer/pdb/pdb.py @@ -37,7 +37,8 @@ def download(pdbID: str,directory: Union[Path,str]=Path.cwd()) -> Path: return pdb_file def get_sequence(pdb_file: str, - chain: str + chain: str, + return_start_mask: bool=False ) -> str: """ Get a protein sequence from a pdb file @@ -46,8 +47,10 @@ def get_sequence(pdb_file: str, ---------- pdb_file : str, PDB file location. - chain: str, - Chain ID of the selected protein. + chain: str or list, + Chain ID(s) of the selected protein. + return_start_mask: bool, + Return binary mask list indicating whether each position is the start of a chain Returns ------- @@ -58,37 +61,63 @@ def get_sequence(pdb_file: str, Get a protein sequence from a PDB file :param pdb: PDB file location - :param chain: chain name of PDB file to get sequence + :param chain: chain name(s) of PDB file to get sequence :return: protein sequence """ - if ".cif" in str(pdb_file): - parser = MMCIFParser() - else: - parser = PDBParser() - structure = parser.get_structure('name', pdb_file) + if ".cif" in str(pdb_file): # BIOPYTHON + parser = MMCIFParser() # BIOPYTHON + else: # BIOPYTHON + parser = PDBParser() # BIOPYTHON + structure = parser.get_structure('name', pdb_file) #BIOPYTHON + #structure = prody.parsePDB(str(pdb_file)) # PRODY + #hv = structure.getHierView() # PRODY if chain==None: - all_chains=[i.get_id() for i in structure.get_chains()] + all_chains=[i.get_id() for i in structure.get_chains()] # BIOPYTHON + #all_chains = [structure_chain.getChid() for structure_chain in hv] # PRODY else: - all_chains=[chain] + if type(chain) == list: + all_chains = chain + elif type(chain) == str: + all_chains = [id for id in chain if id != " "] # remove spaces if present in 
string + else: + raise TypeError(f"chain must be list or str but was {type(chain)}") sequence = "" - for chain in all_chains: - c = structure[0][chain] + start_mask = [] + for single_chain in all_chains: + c = structure[0][single_chain] # BIOPYTHON + #c = hv[single_chain] # PRODY chain_seq = "" for residue in c: - is_regular_res = residue.has_id('CA') and residue.has_id('O') - res_id = residue.get_id()[0] - if (res_id==' ' or res_id=='H_MSE' or res_id=='H_M3L' or res_id=='H_CAS') and is_regular_res: - residue_name = residue.get_resname() + is_regular_res = residue.has_id('CA') and residue.has_id('O') # BIOPYTHON + #atom_names = [atom.getName() for atom in residue] # PRODY + #is_regular_res = ("CA" in atom_names and "O" in atom_names) # PRODY + res_id = residue.get_id()[0] #BIOPYTHON + if (res_id==' ' or res_id=='H_MSE' or res_id=='H_M3L' or res_id=='H_CAS') and is_regular_res: # BIOPYTHON + # i don't know what H_HSE, H_M3L, and H_CAS are doing + # because they aren't in three_to_one, so those should throw an error + # long story short, I don't think we have to worry about them when switching from biopython to prody + #if is_regular_res: # PRODY + residue_name = residue.get_resname() # BIOPYTHON + #residue_name = residue.getResname() # PRODY chain_seq += three_to_one[residue_name] + if chain_seq == "": # empty chain, like a nucleic acid chain (see 8ZWK) + continue # FYI, currently, a non-empty chain with certain invalid residues will throw an error at the three_to_one[residue_name] above sequence += chain_seq - return sequence + start_mask.append(1) + for _ in range(1,len(chain_seq)): + start_mask.append(0) + if return_start_mask: + return (sequence,start_mask) + else: + return sequence def get_distance_matrix(pdb_file: Union[Path,str], chain: str, - method: str = 'CB' + method: str = 'CB', + return_distance_midpoints: bool = False, ) -> np.array: """ Calculate the distance matrix of the specified atoms in a PDB file. 
@@ -106,6 +135,10 @@ def get_distance_matrix(pdb_file: Union[Path,str], 'CA' for using only the CA atom, 'minimum' for using the minimum distance between all atoms in each residue, 'CB_force' computes a new coordinate for the CB atom based on the CA, C, and N atoms and uses CB distance even for glycine. + return_distance_midpoints: bool + Whether to return a matrix of the same shape as distance_matrix representing the same contacts as distance_matrix + that indicates the absolute coordinates of the midpoint between the pair of atoms. This helps us compute the pair distribution + functions of the different classes of contacts. So this matrix isn't really a matrix because each "element" has 3 channels: x, y, and z Returns: np.array: The distance matrix of the selected atoms. @@ -121,7 +154,7 @@ def get_distance_matrix(pdb_file: Union[Path,str], if method == 'CA': coords = structure.select('protein and name CA' + chain_selection).getCoords() elif method == 'CB': - coords = structure.select('(protein and (name CB) or (resname GLY and name CA))' + chain_selection).getCoords() + coords = structure.select('(protein and (name CB) or (resname GLY IGL and name CA))' + chain_selection).getCoords() elif method == 'minimum': selection = structure.select('protein' + chain_selection) coords = selection.getCoords() @@ -163,8 +196,20 @@ def get_distance_matrix(pdb_file: Union[Path,str], if len(coords) == 0: raise IndexError('Empty selection for distance map') + # coords should be a numpy array of shape (N,3) distance_matrix = sdist.squareform(sdist.pdist(coords)) - return distance_matrix + assert distance_matrix.shape[0] == distance_matrix.shape[1] + if return_distance_midpoints: + midpoint_matrix = np.zeros((distance_matrix.shape[0],distance_matrix.shape[1],3)) + for i in range(distance_matrix.shape[0]): + for j in range(distance_matrix.shape[1]): + midpoint_matrix[i,j,:] = (coords[None,i,:] + coords[None,j,:])/2 + # check that indexing is consistent with distance_matrix + assert 
np.allclose(np.linalg.norm(coords[i,:]-coords[j,:]),distance_matrix[i,j]) + assert np.allclose(midpoint_matrix,midpoint_matrix.transpose(1,0,2)) # check symmetry + return distance_matrix, midpoint_matrix + else: + return distance_matrix def full_to_filtered_aligned_mapping(aligned_sequence: str, From 6068d3c152a2786ffec549c138a6a8c7f54fa8cb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 11:36:46 -0500 Subject: [PATCH 15/76] undid unnecessary modification to structure processing in AWSEM --- frustratometer/classes/AWSEM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index bf90fa86..38176582 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -220,7 +220,7 @@ def __init__(self, def setup_structure(self, pdb_structure): # check structure selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') - resid = list(set(selection_CB.getResindices())) # sometimes, it decides to split a single residue in 2 + resid = selection_CB.getResindices() N=len(resid) self.resid = resid self.N = N From 6d5586b09070472f895d687fe671f985e79e671a Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 11:37:22 -0500 Subject: [PATCH 16/76] changed name of DecoyEnsemble.average to DecoyEnsemble.avg --- frustratometer/classes/AWSEM.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 38176582..5ba8e171 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -566,7 +566,7 @@ def get_indicators(self, filename): except EOFError: break - def average(self): + def avg(self): # average burial computation from generator avg_burial = 0 counter = 0 @@ -617,7 +617,7 @@ def std(self): if self.avg_burial is None or self.avg_direct is None or \ self.avg_prot is None or self.avg_wat is None or \ 
self.avg_elec is None: - self.average() # compute averages if not already done + self.avg() # compute averages if not already done # std burial computation from generator and previously computed average std_burial = 0 counter = 0 From 1c1a5effa9e55eb816eda23be8f75bb2ff76db16 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 12:59:00 -0500 Subject: [PATCH 17/76] fix wrong variable used --- frustratometer/classes/AWSEM.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 5ba8e171..56914986 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -566,13 +566,14 @@ def get_indicators(self, filename): except EOFError: break + # average indicator functions over all decoys def avg(self): # average burial computation from generator avg_burial = 0 counter = 0 for array in self.burial_indicators: counter += 1 - burial_indicator += array + avg_burial += array avg_burial /= counter self.avg_burial = avg_burial # average direct computation from generator @@ -580,7 +581,7 @@ def avg(self): counter = 0 for array in self.direct_indicators: counter += 1 - direct_indicator += array + avg_direct += array avg_direct /= counter self.avg_direct = avg_direct # average prot computation from generator @@ -588,7 +589,7 @@ def avg(self): counter = 0 for array in self.protein_indicators: counter += 1 - protein_indicator += array + avg_prot += array avg_prot /= counter self.avg_prot = avg_prot # average wat computation from generator @@ -596,7 +597,7 @@ def avg(self): counter = 0 for array in self.water_indicators: counter += 1 - water_indicator += array + avg_wat += array avg_wat /= counter self.avg_wat = avg_wat # average elec computation from generator @@ -613,6 +614,7 @@ def avg(self): self.avg_elec = avg_elec return self.avg_burial, self.avg_direct, self.avg_prot, self.avg_wat, self.avg_elec + # standard deviation of indicator functions over all 
decoys def std(self): if self.avg_burial is None or self.avg_direct is None or \ self.avg_prot is None or self.avg_wat is None or \ From 3b64349b2d571c745d0d7ee4f37e570995e5b00b Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 13:01:52 -0500 Subject: [PATCH 18/76] reinitialize indicator function generator each time attribute is accessed --- frustratometer/classes/AWSEM.py | 36 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 56914986..ca4ac5bb 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -532,20 +532,15 @@ def __init__(self, np.save(f,awsem_obj.electrostatics_indicator) else: np.save(f,None) - # Stack and average indicators, ensuring correct shape for calculate_energy_and_potts - self.burial_indicators = self.get_indicators("burial_indicators.npy") - self.direct_indicators = self.get_indicators("direct_indicators.npy") - self.protein_indicators = self.get_indicators("protein_indicators.npy") - self.water_indicators = self.get_indicators("water_indicators.npy") - self.electrostatics_indicators = self.get_indicators("electrostatics_indicators.npy") - - # averages are needed to compute standard deviation, so it's useful to have them as attributes + # averages are needed to compute standard deviation + # having these be attributes allows them to be easily passed + # from the avg method to the std method self.avg_burial = None self.avg_direct = None self.avg_prot = None self.avg_wat = None self.avg_elect = None - + ################################################ ## Attach gamma parameters from the AWSEM object ## Kind of off-topic from my current use of this class #self.burial_gamma = awsem_obj.burial_gamma @@ -553,10 +548,31 @@ def __init__(self, #self.protein_gamma = awsem_obj.protein_gamma #self.water_gamma = awsem_obj.water_gamma #self.electrostatics_gamma = getattr(awsem_obj, 
'electrostatics_gamma', None) - + ################################################ # this might help with memory del awsem_obj + # To manage memory, we need the indicator attributes to be generators (see self.get_indicators). + # But to ensure that we can iterate over them more than once, we need to + # be able to reinitialize the generators. We accomplish this with properties + @property + def burial_indicators(self): + return self.get_indicators("burial_indicators.npy") + @property + def direct_indicators(self): + return self.get_indicators("direct_indicators.npy") + @property + def protein_indicators(self): + return self.get_indicators("protein_indicators.npy") + @property + def water_indicators(self): + return self.get_indicators("water_indicators.npy") + @property + def electrostatics_indicators(self): + return self.get_indicators("electrostatics_indicators.npy") + + # allows us to process indicators without holding them all in memory + # this requires that every method that acts on the indicators iterates over them def get_indicators(self, filename): # expecting a numpy file with open(filename, 'rb') as f: From 3a281d72044b9c2cd85f77c7ec6e337c43696296 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 11 Jul 2025 18:31:43 -0500 Subject: [PATCH 19/76] take absolute value of gamma (sqrt(gamma**2)) for approximate variance calculation --- frustratometer/classes/AWSEM.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index ca4ac5bb..4802135d 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -449,6 +449,7 @@ def __init__(self, electrostatics_indicator: Union[np.ndarray, None], sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, + absolute_value_gamma: bool=False, **parameters)->object: """ A stripped-down version of the AWSEM class that can be initialized 
from a set of indicator functions @@ -470,7 +471,9 @@ def __init__(self, The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. - + absolute_value_gamma: bool + If True, replace gammas with their absolute values. This is helpful for the standard deviation approximation + Returns ------- AWSEMIndicators object @@ -484,6 +487,13 @@ def __init__(self, self.electrostatics_indicator = electrostatics_indicator self.sequence_mask_contact = np.full((self.N,self.N), True) self.mask = np.full((self.N,self.N), True) + if absolute_value_gamma: + self.burial_gamma = np.abs(self.burial_gamma) + self.direct_gamma = np.abs(self.direct_gamma) + self.protein_gamma = np.abs(self.protein_gamma) + self.water_gamma = np.abs(self.water_gamma) + self.electrostatics_gamma = np.abs(self.electrostatics_gamma) + self.absolute_value_gamma = absolute_value_gamma # mask should have been applied when calculating the indicator functions, # so we set it such that no further masking is performed self.setup_model() From 80612e0c5279c6d536e966defb14ea4a5d6d4ec1 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 13 Jul 2025 16:14:58 -0500 Subject: [PATCH 20/76] updated parallel tempering writer to write total energy and other energies from the same frame as the sequence --- frustratometer/classes/AWSEM.py | 2 +- frustratometer/optimization/optimization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 4802135d..72295d1c 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -439,7 +439,7 @@ def compute_configurational_energies(self): return configurational_energies #, pd.DataFrame(decoy_data, columns=decoy_data_columns) -class AWSEMIndicators(AWSEMBase): +class AWSEMIndicators(AWSEMBase): # 
PottsEvaluatorFromIndicators or PottsEnergyEvaluatorFromIndicators? def __init__(self, burial_indicator: np.ndarray, diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 416f195f..89806962 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -988,7 +988,7 @@ def parallel_tempering(self, seq_indices=None, temperatures=np.logspace(0,6,25), # Run the simulation and append data periodically for s, updated_seq_indices, total_energy in self.parallel_tempering_steps(seq_indices, temperatures, n_steps, n_steps_per_cycle): # Prepare data for this chunk - energies={key:energy_term.energies(seq_indices) for key,energy_term in self.evaluation_energies.items()} + energies={key:energy_term.energies(updated_seq_indices) for key,energy_term in self.evaluation_energies.items()} for i, temp in enumerate(temperatures): sequence_str = index_to_sequence(updated_seq_indices[i],alphabet=self.alphabet) # Convert sequence index back to string step_data=({'Step': (s+1) * n_steps_per_cycle, 'Temperature': temp, 'Sequence': sequence_str, 'Total Energy': total_energy[i]}) From 21f1504769f9f677093e1ee92014ec42284ea3cf Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 13 Jul 2025 20:58:26 -0500 Subject: [PATCH 21/76] added covariance matrix method to DecoyEnsemble --- frustratometer/classes/AWSEM.py | 87 +++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 72295d1c..87555a8a 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -526,6 +526,7 @@ def __init__(self, # AWSEM normally accepts an amino acid sequence argument, but we don't need that here # However, we do need to pass through parameters used to generate the indicator functions awsem_obj = AWSEM(next(pdb_structures), expose_indicator_functions=True, repair_pdb=True, 
**parameters) + self.N = awsem_obj.N for pdb_structure in pdb_structures: awsem_obj.pdb_structure = pdb_structure # we can use the pdb_structure setter to update structural # stuff without fully re-initializing the object @@ -549,7 +550,14 @@ def __init__(self, self.avg_direct = None self.avg_prot = None self.avg_wat = None - self.avg_elect = None + self.avg_elec = None + # and standard deviations can help us check our work + # on the covariance matrix calculation + self.std_burial = None + self.std_direct = None + self.std_prot = None + self.std_wat = None + self.std_elec = None ################################################ ## Attach gamma parameters from the AWSEM object ## Kind of off-topic from my current use of this class @@ -593,6 +601,7 @@ def get_indicators(self, filename): break # average indicator functions over all decoys + # these averages can then be averaged to get the average of all indicator functions over all decoys def avg(self): # average burial computation from generator avg_burial = 0 @@ -640,7 +649,10 @@ def avg(self): self.avg_elec = avg_elec return self.avg_burial, self.avg_direct, self.avg_prot, self.avg_wat, self.avg_elec - # standard deviation of indicator functions over all decoys + # standard deviation of each indicator function over all decoys + # ** averaging these averages + # DOES NOT equal + # the variance over all structures of the sum of the indicator functions of each structure ** def std(self): if self.avg_burial is None or self.avg_direct is None or \ self.avg_prot is None or self.avg_wat is None or \ @@ -653,6 +665,7 @@ def std(self): counter += 1 std_burial += (array - self.avg_burial) ** 2 std_burial = np.sqrt(std_burial / counter) + self.std_burial = std_burial # std direct computation from generator and previously computed average std_direct = 0 counter = 0 @@ -660,6 +673,7 @@ def std(self): counter += 1 std_direct += (array - self.avg_direct) ** 2 std_direct = np.sqrt(std_direct / counter) + self.std_direct = 
std_direct # std prot computation from generator and previously computed average std_prot = 0 counter = 0 @@ -667,6 +681,7 @@ def std(self): counter += 1 std_prot += (array - self.avg_prot) ** 2 std_prot = np.sqrt(std_prot / counter) + self.std_prot = std_prot # std wat computation from generator and previously computed average std_wat = 0 counter = 0 @@ -674,6 +689,7 @@ def std(self): counter += 1 std_wat += (array - self.avg_wat) ** 2 std_wat = np.sqrt(std_wat / counter) + self.std_wat = std_wat # std elec computation from generator and previously computed average std_elec = 0 counter = 0 @@ -681,4 +697,69 @@ def std(self): counter += 1 std_elec += (array - self.avg_elec) ** 2 std_elec = np.sqrt(std_elec / counter) - return std_burial, std_direct, std_prot, std_wat, std_elec \ No newline at end of file + self.std_elec = std_elec + return std_burial, std_direct, std_prot, std_wat, std_elec + + def covariance_matrix(self): + # return covariance matrix of the set of all indicator functions + # the indicator functions will be grouped by residue: + # low density burial | <-- Residue 1 + # medium density burial | <-- Residue 1 + # high density burial | <-- Residue 1 + # direct with residue 1 | <-- Residue 1 + # protein-mediated with residue 1 | <-- Residue 1 + # water-mediated with residue 1 | <-- Residue 1 + # electrostatics with residue 1 | <-- Residue 1 + # ... | <-- Residue 1 + # direct with residue N | <-- Residue 1 + # protein-mediated with residue N | <-- Residue 1 + # water-mediated with residue N | <-- Residue 1 + # electrostatics with residue N | <-- Residue 1 + # ... | <-- Residue 2 + # ... | <-- ... + # ... 
| <-- Residue N + # however, the reindexing is performed at the end + # + # compute averages + if self.avg_burial is None or self.avg_direct is None or \ + self.avg_prot is None or self.avg_wat is None or \ + self.avg_elec is None: + self.avg() + all_avg = np.concatenate([indicators.flatten() for indicators in + [self.avg_burial, self.avg_direct, self.avg_prot, self.avg_wat, self.avg_elec]]) + ex_ey = np.outer(all_avg, all_avg) + # compute covariances + number_indicators = 4*self.N**2 + 3*self.N + exy = np.zeros((number_indicators, number_indicators)) + num_decoys = 0 + for b, d, p, w, e in zip(self.burial_indicators, self.direct_indicators, + self.protein_indicators, self.water_indicators, + self.electrostatics_indicators): + all_decoy = np.concatenate([b.flatten(), d.flatten(), p.flatten(), w.flatten(), e.flatten()]) + exy += np.outer(all_decoy, all_decoy) + num_decoys += 1 + exy /= num_decoys + covariance_matrix = exy - ex_ey + # check our work + variances = np.concatenate([self.std_burial.flatten(), self.std_direct.flatten(), + self.std_prot.flatten(), self.std_wat.flatten(), + self.std_elec.flatten()])**2 + assert np.allclose(variances, np.diag(covariance_matrix)) + assert np.all(covariance_matrix==covariance_matrix.T) + # reindex to implement convention expressed in comments above + #n*3:(n+1)*3 burial where n ranges from 0 to 62 + #3*N + n*N:(n+1)*N direct + #3*N + N**2 + n*N:(n+1)*N protein + #3*N + 2*N**2 + n*N:(n+1)*N water + #3*N + 3*N**2 + n*N:(n+1)*N electrostatics + #so we reindex each axis by the same index array + #this index array is + index_array = np.concatenate( + [[3*n, 3*n+1, 3*n+2] \ + + list(range(3*self.N + self.N*n, 3*self.N + self.N*(n+1))) \ + + list(range(3*self.N + self.N**2 + self.N*n, 3*self.N + self.N**2 + self.N*(n+1))) \ + + list(range(3*self.N + 2*self.N**2 + self.N*n, 3*self.N + 2*self.N**2 + self.N*(n+1))) \ + + list(range(3*self.N + 3*self.N**2 + self.N*n, 3*self.N + 3*self.N**2 + self.N*(n+1))) \ + for n in range(self.N)]) + 
covariance_matrix = covariance_matrix[index_array, index_array] + return covariance_matrix \ No newline at end of file From 002c8cf521a2b4740b651aeb8fb20182df5a4842 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 13 Jul 2025 21:05:39 -0500 Subject: [PATCH 22/76] saving reindexing array as attribute --- frustratometer/classes/AWSEM.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 87555a8a..e3b0d8b9 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -762,4 +762,5 @@ def covariance_matrix(self): + list(range(3*self.N + 3*self.N**2 + self.N*n, 3*self.N + 3*self.N**2 + self.N*(n+1))) \ for n in range(self.N)]) covariance_matrix = covariance_matrix[index_array, index_array] + self.residue_order_index_array = index_array return covariance_matrix \ No newline at end of file From 1d2d833271583618bfefd87888eeeb9ad1c310cb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Mon, 14 Jul 2025 18:25:35 -0500 Subject: [PATCH 23/76] cleaned up decoyensemble class slightly --- frustratometer/classes/AWSEM.py | 105 +++++++++++++++++++------------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index e3b0d8b9..45279a51 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -131,6 +131,8 @@ def __init__(self, self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ def setup_model(self): + # some methods that should be called to complete the initialization of subclass instances + # subclasses should (re)define these methods as needed self.calculate_indicators() self.calculate_energy_and_potts() @@ -501,7 +503,49 @@ def __init__(self, def calculate_indicators(self): pass # the function was initialized with indicators, so there's nothing to do +class AWSEMVariancePotts(AWSEMBase): + def __init__(self, + 
covariance_matrix: np.ndarray, + sequence: str, # sequence is optional if we initialize from a Structure but not here + expose_indicator_functions: bool=False, + absolute_value_gamma: bool=False, + **parameters)->object: + """ + A stripped-down version of the AWSEM class that can be initialized from a set of indicator functions + + Parameters + ---------- + covariance_matrix: np.ndarray + Covariance matrix of all __indicator functions___ (not residues) over a decoy set + sequence : str + The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. + expose_indicator_functions: bool + If set to True, indicator functions of the contact and burial energy terms can be accessed by user. + absolute_value_gamma: bool + If True, replace gammas with their absolute values. This is helpful for the standard deviation approximation + + Returns + ------- + AWSEMIndicators object + + """ + super().__init__(sequence, expose_indicator_functions, **parameters) + self.covariance_matrix = covariance_matrix + self.sequence_mask_contact = np.full((self.N,self.N), True) + self.mask = np.full((self.N,self.N), True) + self.setup_model() + def calculate_indicators(self): + # we need to cluster covariance indicators by intra-residue and inter-residue pairs + # there are 7 intra-residue indicator functions per residue + assert len(self.covariance_matrix.shape)==2, self.covariance_matrix.shape + assert self.covariance_matrix.shape[0] == self.covariance_matrix.shape[1] + xy_indices = np.meshgrid(range(self.covariance_matrix.shape[0]), range(self.covariance_matrix.shape[1])) + diagonal_distances = np.abs(xy_indices[0]-xy_indices[1]) + #block_diagonal_mask = np.ma.make_mask(np.flatten(np.array())) + intra_residue = self.covariance_matrix[diagonal_distances<7].reshape((self.N,7)) + inter_residue = self.covariance_matrix[diagonal_distances>=7].reshape((self.N,7*(self.N-1))) + class DecoyEnsemble(): def __init__(self, @@ -701,66 +745,39 @@ def std(self): return 
std_burial, std_direct, std_prot, std_wat, std_elec def covariance_matrix(self): - # return covariance matrix of the set of all indicator functions - # the indicator functions will be grouped by residue: - # low density burial | <-- Residue 1 - # medium density burial | <-- Residue 1 - # high density burial | <-- Residue 1 - # direct with residue 1 | <-- Residue 1 - # protein-mediated with residue 1 | <-- Residue 1 - # water-mediated with residue 1 | <-- Residue 1 - # electrostatics with residue 1 | <-- Residue 1 - # ... | <-- Residue 1 - # direct with residue N | <-- Residue 1 - # protein-mediated with residue N | <-- Residue 1 - # water-mediated with residue N | <-- Residue 1 - # electrostatics with residue N | <-- Residue 1 - # ... | <-- Residue 2 - # ... | <-- ... - # ... | <-- Residue N - # however, the reindexing is performed at the end # # compute averages if self.avg_burial is None or self.avg_direct is None or \ self.avg_prot is None or self.avg_wat is None or \ self.avg_elec is None: self.avg() - all_avg = np.concatenate([indicators.flatten() for indicators in - [self.avg_burial, self.avg_direct, self.avg_prot, self.avg_wat, self.avg_elec]]) + triu_indices = np.triu_indices(self.N, k=1) + all_avg = np.concatenate([self.avg_burial.flatten(), + self.avg_direct[triu_indices].squeeze(), self.avg_prot[triu_indices].squeeze(), + self.avg_wat[triu_indices].squeeze(), self.avg_elec[triu_indices].squeeze()]) ex_ey = np.outer(all_avg, all_avg) # compute covariances - number_indicators = 4*self.N**2 + 3*self.N + number_indicators = 3*self.N + 4*int((self.N**2 - self.N)/2) + assert ex_ey.shape == (number_indicators, number_indicators), f"ex_ey.shape: {ex_ey.shape}, number_indicators: {number_indicators}" exy = np.zeros((number_indicators, number_indicators)) num_decoys = 0 + # we want all the burial indicators, + # but only the unique pairwise indicators (no need to double count) for b, d, p, w, e in zip(self.burial_indicators, self.direct_indicators, - 
self.protein_indicators, self.water_indicators, + self.protein_indicators, self.water_indicators, self.electrostatics_indicators): - all_decoy = np.concatenate([b.flatten(), d.flatten(), p.flatten(), w.flatten(), e.flatten()]) + all_decoy = np.concatenate([b.flatten(), d[triu_indices].squeeze(), + p[triu_indices].squeeze(), w[triu_indices].squeeze(), e[triu_indices].squeeze()]) exy += np.outer(all_decoy, all_decoy) num_decoys += 1 exy /= num_decoys covariance_matrix = exy - ex_ey # check our work - variances = np.concatenate([self.std_burial.flatten(), self.std_direct.flatten(), - self.std_prot.flatten(), self.std_wat.flatten(), - self.std_elec.flatten()])**2 - assert np.allclose(variances, np.diag(covariance_matrix)) + #variances = np.concatenate([np.triu(self.std_burial).flatten(), np.triu(self.std_direct).flatten(), + # np.triu(self.std_prot).flatten(), np.triu(self.std_wat).flatten(), + # np.triu(self.std_elec).flatten()])**2 + #variances = variances[variances!=0] + #assert np.allclose(variances, np.diag(covariance_matrix)) assert np.all(covariance_matrix==covariance_matrix.T) - # reindex to implement convention expressed in comments above - #n*3:(n+1)*3 burial where n ranges from 0 to 62 - #3*N + n*N:(n+1)*N direct - #3*N + N**2 + n*N:(n+1)*N protein - #3*N + 2*N**2 + n*N:(n+1)*N water - #3*N + 3*N**2 + n*N:(n+1)*N electrostatics - #so we reindex each axis by the same index array - #this index array is - index_array = np.concatenate( - [[3*n, 3*n+1, 3*n+2] \ - + list(range(3*self.N + self.N*n, 3*self.N + self.N*(n+1))) \ - + list(range(3*self.N + self.N**2 + self.N*n, 3*self.N + self.N**2 + self.N*(n+1))) \ - + list(range(3*self.N + 2*self.N**2 + self.N*n, 3*self.N + 2*self.N**2 + self.N*(n+1))) \ - + list(range(3*self.N + 3*self.N**2 + self.N*n, 3*self.N + 3*self.N**2 + self.N*(n+1))) \ - for n in range(self.N)]) - covariance_matrix = covariance_matrix[index_array, index_array] - self.residue_order_index_array = index_array + assert covariance_matrix.shape == 
exy.shape == ex_ey.shape return covariance_matrix \ No newline at end of file From 3f0dd8d2973af7f491e978b66d40ef18aa9f8c20 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 27 Jul 2025 12:18:15 -0500 Subject: [PATCH 24/76] partial implementation of covariance matrix potts model; still need to do a lot more work --- frustratometer/classes/AWSEM.py | 245 ++++++++++++++++++-- frustratometer/classes/__init__.py | 2 +- frustratometer/optimization/optimization.py | 149 +++++++++++- 3 files changed, 369 insertions(+), 27 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 45279a51..baafda3a 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -7,7 +7,7 @@ from pydantic.types import Path from typing import List,Optional,Union,Generator -__all__ = ['AWSEM','AWSEMIndicators','DecoyEnsemble'] +__all__ = ['AWSEM','AWSEMIndicators','DecoyEnsemble', 'AWSEMVariancePotts'] class AWSEMParameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) @@ -211,11 +211,11 @@ def __init__(self, with open('my_data.txt','w') as f: f.write(f"self.distance_cutoff: {self.distance_cutoff}\n") f.write(f"self.sequence_cutoff: {self.sequence_cutoff}\n") - np.save('my_distance_matrix.npy',self.distance_matrix) + #np.save('my_distance_matrix.npy',self.distance_matrix) self.mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.distance_cutoff, minimum_sequence_separation = self.sequence_cutoff) - np.save('my_mask_new.npy',self.mask) + #np.save('my_mask_new.npy',self.mask) self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function self.setup_model() @@ -487,17 +487,32 @@ def __init__(self, self.protein_indicator = protein_indicator self.water_indicator = water_indicator self.electrostatics_indicator = electrostatics_indicator - self.sequence_mask_contact = np.full((self.N,self.N), True) - self.mask = 
np.full((self.N,self.N), True) + # we don't have a distance matrix to a apply a minimum sequence separation to-- + # we have to assume that this consideration was already made when computing the indicators. + # So we just set the "distance" matrix to zeros and set no maximum cutoff, so that nothing changes + # however, we can apply a minimum sequence separation-based mask to the matrix + self.sequence_mask_contact = frustration.compute_mask(np.zeros((self.N,self.N)), + maximum_contact_distance=None, + minimum_sequence_separation = self.p.min_sequence_separation_contact) + self.electrostatics_mask = frustration.compute_mask(np.zeros((self.N,self.N)), + maximum_contact_distance=None, + minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) + self.mask = frustration.compute_mask(np.zeros((self.N,self.N)), + maximum_contact_distance=self.distance_cutoff, + minimum_sequence_separation = self.sequence_cutoff) if absolute_value_gamma: self.burial_gamma = np.abs(self.burial_gamma) self.direct_gamma = np.abs(self.direct_gamma) self.protein_gamma = np.abs(self.protein_gamma) self.water_gamma = np.abs(self.water_gamma) self.electrostatics_gamma = np.abs(self.electrostatics_gamma) - self.absolute_value_gamma = absolute_value_gamma - # mask should have been applied when calculating the indicator functions, - # so we set it such that no further masking is performed + self.absolute_value_gamma = absolute_value_gamma + #np.save('absolute_value_gamma_1.npy',absolute_value_gamma) + #np.save('burial_indicator_1.npy',burial_indicator) + #np.save('direct_indicator_1.npy', direct_indicator) + #np.save('protein_indicator_1.npy', protein_indicator) + #np.save('water_indicator_1.npy', water_indicator) + #np.save('electrostatics_indicator_1.npy', electrostatics_indicator) self.setup_model() def calculate_indicators(self): @@ -511,7 +526,6 @@ def __init__(self, absolute_value_gamma: bool=False, **parameters)->object: """ - A stripped-down version of the AWSEM class that 
can be initialized from a set of indicator functions Parameters ---------- @@ -526,26 +540,178 @@ def __init__(self, Returns ------- - AWSEMIndicators object + AWSEMVariancePotts object """ super().__init__(sequence, expose_indicator_functions, **parameters) self.covariance_matrix = covariance_matrix - self.sequence_mask_contact = np.full((self.N,self.N), True) - self.mask = np.full((self.N,self.N), True) + self.num_indicators = 3*self.N + 4*(self.N**2-self.N)/2 # low, med, high burial for each N, 4 classes of pair interactions self.setup_model() + @staticmethod # trying to avoid loading down memory with too many permanent attributes + def pairwise_mask(l): # l for length + # Helps us figure out where a 1D array's elements were in an upper triangular matrix, + # assuming the matrix was flattened row-major style and the main diagonal was excluded. + # Each index i of this list gives us the indices of the 1D array that were in row i of the matrix + #NbyN_matrix_rows = [[range(n*l-int(((n**2)+n)/2),(n+1)*l-int((((n+1)**2)+(n+1))/2)), + # ] for n in range(l)] + mask = np.zeros((l,l)) + for i in range(l): + temp = np.zeros((l,l)) + # set elements involving i equal to 1 + temp[:,i] = 1 + temp[i,:] = 1 + temp = temp[np.triu_indices(l,k=1)] # this flattens the array, keeping only the upper triangle + found = np.where(temp==1)[0] + try: + mask[i, :] = [1 if index in found else 0 for index in range(l)]#[1 if index in NbyN_matrix_rows[i] else 0 for index in range(l)] + except: + import pdb; pdb.set_trace() + return mask + def calculate_indicators(self): - # we need to cluster covariance indicators by intra-residue and inter-residue pairs - # there are 7 intra-residue indicator functions per residue + print("start calculate indicators") assert len(self.covariance_matrix.shape)==2, self.covariance_matrix.shape assert self.covariance_matrix.shape[0] == self.covariance_matrix.shape[1] - xy_indices = np.meshgrid(range(self.covariance_matrix.shape[0]), 
range(self.covariance_matrix.shape[1])) - diagonal_distances = np.abs(xy_indices[0]-xy_indices[1]) - #block_diagonal_mask = np.ma.make_mask(np.flatten(np.array())) - intra_residue = self.covariance_matrix[diagonal_distances<7].reshape((self.N,7)) - inter_residue = self.covariance_matrix[diagonal_distances>=7].reshape((self.N,7*(self.N-1))) - + print("assertions complete") + # each "indicator function" is actually a covariance of two indicator functions. + # There are a few different kinds of pairs of indicator functions. + # The first kind is self-covariances, AKA variances, which we further break down + # into burial (dependent on AA identity at a single position) + # and pairwise (dependent on two positions) + self.burial_variances = np.diag(self.covariance_matrix[:3*self.N,:3*self.N]) # shape (3N,) + self.pairwise_variances = np.diag(self.covariance_matrix[3*self.N:,3*self.N:]) # shape (4N,) + np.save("burial_variances.npy", self.burial_variances) + np.save("pairwise_variances.npy", self.pairwise_variances) + print("variances calculated") + # The other kind of covariance is a covarience between indicator functions. 
+ # We break these down into burial-burial covariances (dependent on two identities), + # burial-pairwise indicator covariances (some dependent on 2, others dependent on 3 identities), + # and pairwise indicator-pairwise indicator covariances (dependent on 3 or 4 identities) + self.burial_burial_covariances = self.covariance_matrix[np.triu_indices(3*self.N,k=1)] # shape (((3N)**2-3N)/2,) + print("first covariances calculated") + num_upper = int((self.N**2-self.N)/2) + burial_pairwise_covariances_2 = np.zeros((3*self.N, 4*num_upper)) # shape (3N, 4((N**2-N)/2)) + #self.burial_pairwise_covariances_2 = np.concatenate([ # can be represented by 2-body term in Potts model + # pairwise_mask tells us which pairwise indicator functions in a given row + # involve the residue whose burial covariances are evaluated in that row + # (the covariance matrix has more elements than there are energy terms-- + # some elements represent relationships between residues that don't interact directly, + # so there are many indicators in each row i involving residues j and k but not i) + # + # We repeat pairwise_mask 3 times because each residue is repeated 3 times (low, med, high density) + # self.covariance_matrix[:3*self.N, 3*self.N+i*num_upper:3*self.N+(i+1)*num_upper]\ + # *self.pairwise_mask(num_upper)[:self.N,:].repeat(3,axis=0)\ + # for i in range(4)], axis=1) # shape (3N,4((N**2-N)/2)) + for counter in range(4): + burial_pairwise_covariances_2[:,counter*num_upper:(counter+1)*num_upper] =\ + (self.covariance_matrix[:3*self.N, 3*self.N+counter*num_upper:3*self.N+(counter+1)*num_upper]+1E-10)\ + *self.pairwise_mask(num_upper)[:self.N,:].repeat(3,axis=0) # tiny shift of 1E-10 ensures that the only contacts at exactly 0 are those that fail the mask + self.burial_pairwise_covariances_2 = burial_pairwise_covariances_2 + print("second covariances calculated") + # these last three components contain many more covariances than the others + # and are likely to be sparse + 
self.burial_pairwise_covariances_3 = None # set to every burial-pairwise covariance not in the previous one + self.pairwise_pairwise_covariances_3 = None # set to every pairwise where one residue is common between the two + self.pairwise_pairwise_covariances_4 = None # everything not in pairwise_pairwise_covariances_3 + + def calculate_energy_and_potts(self): + + J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) + h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) + + # compute burial and contact energies + # the "energy" of our potts model representing the covariance, not a physical energy + # this "burial energy" is the sum of variances of the burial indicators (the one-body part of the model) + self.burial_energy = (0.5*self.p.k_contact*self.burial_gamma[h_index[1]])**2 * self.burial_variances.reshape((self.N,1,3)) + # the "contact energy" is ordinarily the sum of all two-body components of the model + # (direct, protein, water, electrostatics), so we do the analogous thing here + template = np.zeros((self.N,self.N)) + num_upper = int((self.N**2-self.N)/2) + triu_indices = np.triu_indices(self.N,k=1) + template[triu_indices] = self.pairwise_variances[:num_upper] + direct = (template+template.T)[:,:,np.newaxis,np.newaxis] * self.direct_gamma[J_index[2], J_index[3]]**2 + template[triu_indices] = self.pairwise_variances[num_upper:2*num_upper] + protein_mediated = (template+template.T)[:,:,np.newaxis,np.newaxis] * self.protein_gamma[J_index[2], J_index[3]]**2 + template[triu_indices] = self.pairwise_variances[2*num_upper:3*num_upper] + water_mediated = (template+template.T)[:,:,np.newaxis,np.newaxis] * self.water_gamma[J_index[2], J_index[3]]**2 + contact_energy = self.p.k_contact * np.array([direct, protein_mediated, water_mediated]) + if self.p.k_electrostatics!=0: + template[triu_indices] = self.pairwise_variances[3*num_upper:] + electrostatics_energy = 
self.electrostatics_gamma * (template+template.T)[:,:,np.newaxis,np.newaxis]**2 + contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) + # for the variance potts model, there is one more kind of two-body interaction: + # burial-pairwise covariance when the pairwise energy term involves the residue in the burial term + # self.burial_pairwise_covariances_2 has shape (3N, 4(N^2-N)/2) + # we first multiply each row by the appropriate burial energy + temp = self.burial_pairwise_covariances_2 + low = temp[::3,:,np.newaxis]*0.5*self.p.k_contact*self.burial_gamma[h_index[1],0] + med = temp[1::3,:,:]*0.5*self.p.k_contact*self.burial_gamma[h_index[1],1] + high = temp[2::3,:,:]*0.5*self.p.k_contact*self.burial_gamma[h_index[1],2] + # we can now collapse our 3 burial indicator types + temp = np.sum(np.concatenate((low[None,...], med[None,...], high[None,...]), axis=0), axis=0) + assert temp.shape == (self.N, 4*((self.N**2-self.N)/2)), temp.shape + # now we split into our 4 pairwise contact types, keeping only the elements of each row + # that represent a pairwise interaction involving the residue whose burial covariances are found in that row + direct, prot, wat, elec = np.split(temp[temp!=0], 4, axis=1) + # now we need to go from shape (N, (N^2-N)/2) to (N,N) + # (each residue burial indicator covaries with (N^2-N)/2 pairwise indicators, + # but only N of them include the same residue from the burial indicator; + # others have a value of 0, which we can easily eliminate) + # we also need to multiply by our pairwise gammas + direct = direct[direct != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.direct_gamma[J_index[3]]*self.p.k_contact + prot = prot[prot != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.protein_gamma[J_index[3]]*self.p.k_contact + wat = wat[wat != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.water_gamma[J_index[3]]*self.p.k_contact + elec = elec[elec != 
0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.electrostatics_gamma[J_index[3]]*self.p.k_contact + + contact_energy = np.append(contact_energy, direct[np.newaxis,...], axis=0) + contact_energy = np.append(contact_energy, prot[np.newaxis,...], axis=0) + contact_energy = np.append(contact_energy, wat[np.newaxis,...], axis=0) + contact_energy = np.append(contact_energy, elec[np.newaxis,...], axis=0) + + """ + direct = np.zeros((self.N,self.N)) + direct[triu_indices] = + direct + + num_upper = int((self.N**2-self.N)/2) + triu_indices = np.triu_indices(self.N,k=1) + template = np.zeros((self.N,self.N)) + + assert direct.shape==(self.N,self.N), direct.shape + assert prot.shape==(self.N,self.N), prot.shape + assert wat.shape==(self.N,self.N), wat.shape + assert elec.shape==(self.N,self.N), elec.shape + + + for counter,row in enumerate(self.burial_pairwise_covariances_2): + direct_indicators = row[:len(row)//4] + direct_energy = direct_indicators[direct_indicators>0].reshape((-1,1))\ + *0.5*self.p.k_contact*self.burial_gamma[:,counter%3]\ + *self.p.k_contact*self.direct_gamma[] + direct = row[row!=0] * 0.5*self.p.k_contact*self.burial_gamma[] + template[counter] = row[row==1] # where the residue corresponding to the burial row is involved in the pairwise indicator + burial_pairwise_2 = self.burial_pairwise_covariances_2[] + """ + #################################################################### + # the potts model that we're using (AWSEMEnergy) multiplies each of the J terms by 1/2, + # so we should multiply them by 2 to cancel that out + contact_energy[np.diag_indices(contact_energy.shape[0])] *= 1/2 + contact_energy *= 2 + #################################################################### + + self.contact_energy = contact_energy + + # Compute potts model + self.potts_model = {} + self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] + self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, 
self.aa_map_awsem_y] + # Set the gap energy to zero + self.potts_model['h'][:, 0] = 0 + self.potts_model['J'][:, :, 0, :] = 0 + self.potts_model['J'][:, :, :, 0] = 0 + self._native_energy=None # don't know what this does + class DecoyEnsemble(): def __init__(self, @@ -560,7 +726,7 @@ def __init__(self, yields Structure objects representing decoy structures other parameters: masks and cutoffs affecting the AWSEM class's indicator function calculations; - they must be the same for all structures; burial_in_context also available, but use at your own risk + they are applied to all structures in the ensemble; burial_in_context also available, but use at your own risk Returns ------- @@ -570,8 +736,21 @@ def __init__(self, # AWSEM normally accepts an amino acid sequence argument, but we don't need that here # However, we do need to pass through parameters used to generate the indicator functions awsem_obj = AWSEM(next(pdb_structures), expose_indicator_functions=True, repair_pdb=True, **parameters) - self.N = awsem_obj.N - for pdb_structure in pdb_structures: + self.N = awsem_obj.N # number of residues + with open('burial_indicators.npy','ab') as f: + np.save(f,awsem_obj.burial_indicator) + with open('direct_indicators.npy','ab') as f: + np.save(f,awsem_obj.direct_indicator) + with open('protein_indicators.npy','ab') as f: + np.save(f,awsem_obj.protein_indicator) + with open('water_indicators.npy','ab') as f: + np.save(f,awsem_obj.water_indicator) + with open('electrostatics_indicators.npy','ab') as f: + if hasattr(awsem_obj, 'electrostatics_indicator'): + np.save(f,awsem_obj.electrostatics_indicator) + else: + np.save(f,None) + for pdb_structure in pdb_structures: # iterate over the rest of the structures without re-initializing the entire AWSEM class awsem_obj.pdb_structure = pdb_structure # we can use the pdb_structure setter to update structural # stuff without fully re-initializing the object with open('burial_indicators.npy','ab') as f: @@ -780,4 +959,22 @@ def 
covariance_matrix(self): #assert np.allclose(variances, np.diag(covariance_matrix)) assert np.all(covariance_matrix==covariance_matrix.T) assert covariance_matrix.shape == exy.shape == ex_ey.shape - return covariance_matrix \ No newline at end of file + self.covariance_matrix = covariance_matrix + return covariance_matrix + + def all_decoy_indicators(self): + # returns lists of indicator functions for each decoy + # memory scales with the size of the structure and decoy set + all_burial = [] + all_direct = [] + all_prot = [] + all_wat = [] + all_elec = [] + for burial, direct, prot, wat, elec in zip(self.burial_indicators,self.direct_indicators, + self.protein_indicators, self.water_indicators, self.electrostatics_indicators): + all_burial.append(burial) + all_direct.append(direct) + all_prot.append(prot) + all_wat.append(wat) + all_elec.append(elec) + return all_burial, all_direct, all_prot, all_wat, all_elec \ No newline at end of file diff --git a/frustratometer/classes/__init__.py b/frustratometer/classes/__init__.py index fd8b79b8..d469d8c0 100644 --- a/frustratometer/classes/__init__.py +++ b/frustratometer/classes/__init__.py @@ -7,7 +7,7 @@ """ from .DCA import DCA -from .AWSEM import AWSEM, AWSEMIndicators, DecoyEnsemble +from .AWSEM import AWSEM, AWSEMIndicators, DecoyEnsemble, AWSEMVariancePotts from .Structure import Structure from .Map import Map from .Gamma import Gamma diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 89806962..39d70b12 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -7,7 +7,7 @@ from frustratometer.classes import Frustratometer from frustratometer.classes import Structure -from frustratometer.classes import AWSEM, AWSEMIndicators, DecoyEnsemble +from frustratometer.classes import AWSEM, AWSEMIndicators, DecoyEnsemble, AWSEMVariancePotts from frustratometer.optimization.EnergyTerm import EnergyTerm from 
frustratometer.optimization.inner_product import compute_all_region_means from frustratometer.optimization.inner_product import build_mean_inner_product_matrix @@ -365,6 +365,152 @@ def regression_test(self): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" +#@numba.experimental.jitclass([('_use_numba',numba.float32),('std',numba.float32),('total_energies',numba.float32[:])]) +class AwsemStdSlow(EnergyTerm): + """ Computes the standard deviation of the AWSEM energies of a set of decoy structures + by computing the energy of each decoy structure and then computing the std of the energies + """ + def __init__(self, all_burial, all_direct, all_prot, all_wat, all_elec, sequence, + alphabet=_AA, use_numba=True, **parameters): + + self._use_numba=use_numba + self.alphabet=alphabet + + self.models_h = [] + self.models_J = [] + for burial, direct, prot, wat, elec in zip(all_burial, all_direct, all_prot, all_wat, all_elec): + model = AWSEMIndicators(burial, direct, prot, wat, elec, sequence, **parameters) + self.models_h.append(model.potts_model['h']) + self.models_J.append(model.potts_model['J']) + self.mask = model.mask # should be the same for all, so we put this outside the loop + + if alphabet!=_AA: + raise NotImplementedError("Reindex your potts models according to your alphabet") + self.reindex_dca=[_AA.index(aa) for aa in alphabet] + self.model_h=self.model_h[:,self.reindex_dca] + self.model_J=self.model_J[:,:,self.reindex_dca][:,:,:,self.reindex_dca] + + self.stds = [] + self.total_energies = [] + + self.initialize_functions() + + def initialize_functions(self): + mask=self.mask.copy() + models_h=self.models_h.copy() + models_J=self.models_J.copy() + + def compute_energy(seq_index: np.array) -> float: + seq_len = len(seq_index) + to_append = np.zeros(len(models_h)) # a new array for each seq index, with a length equal to the number of decoys + for counter, models in 
enumerate(zip(models_h, models_J)): + model_h = models[0] + model_J = models[1] + energy_h = 0.0 + energy_J = 0.0 + for i in range(seq_len): + energy_h -= model_h[i, seq_index[i]] + for i in range(seq_len): + for j in range(seq_len): + aa_i = seq_index[i] + aa_j = seq_index[j] + energy_J -= model_J[i, j, aa_i, aa_j] * mask[i, j] + decoy_energy = energy_h + energy_J / 2 + to_append[counter] = decoy_energy + + self.total_energies.append(to_append) + + std = to_append.std() + self.stds.append(std) + #std = np.array([1,2]).var()#total_energies.var() # doing variance for now because variances are additive + #self.stds.append(std) + return std + + def compute_denergy_mutation(seq_index: np.ndarray, pos: int, aa_new: int) -> float: + aa_old=seq_index[pos] + + for counter, models in enumerate(zip(models_h, models_J)): + model_h = models[0] + model_J = models[1] + #import pdb; pdb.set_trace() + energy_difference = -model_h[pos,aa_new] + model_h[pos,aa_old] + # Initialize j_correction to 0 + j_correction = 0.0 + # Manually iterate over the sequence indices + for idx in range(len(seq_index)): + aa_idx = seq_index[idx] # The amino acid at the current position + # Accumulate corrections for positions other than the mutated one + j_correction += model_J[idx, pos, aa_idx, aa_old] * mask[idx, pos] + j_correction -= model_J[idx, pos, aa_idx, aa_new] * mask[idx, pos] + # For self-interaction, subtract the old interaction and add the new one + j_correction -= model_J[pos, pos, aa_old, aa_old] * mask[pos, pos] + j_correction += model_J[pos, pos, aa_new, aa_new] * mask[pos, pos] + energy_difference += j_correction + self.total_energies[pos][counter] += energy_difference + #import pdb; pdb.set_trace() + new_std = self.total_energies[pos].std() + delta_std = new_std - self.stds[pos] + self.stds[pos] = new_std + return delta_std + + def compute_denergy_swap(seq_index, pos1, pos2): + aa2 , aa1 = seq_index[pos1],seq_index[pos2] + + for counter, models in enumerate(zip(models_h, models_J)): 
+ model_h = models[0] + model_J = models[1] + #Compute fields + energy_difference = 0 + energy_difference -= (model_h[pos1, aa1] - model_h[pos1, seq_index[pos1]]) # h correction aa1 + energy_difference -= (model_h[pos2, aa2] - model_h[pos2, seq_index[pos2]]) # h correction aa2 + + #Compute couplings + j_correction = 0.0 + for pos in range(len(seq_index)): + aa = seq_index[pos] + # Corrections for interactions with pos1 and pos2 + j_correction += model_J[pos, pos1, aa, seq_index[pos1]] * mask[pos, pos1] + j_correction -= model_J[pos, pos1, aa, aa1] * mask[pos, pos1] + j_correction += model_J[pos, pos2, aa, seq_index[pos2]] * mask[pos, pos2] + j_correction -= model_J[pos, pos2, aa, aa2] * mask[pos, pos2] + + # J correction, interaction with self aminoacids + j_correction -= model_J[pos1, pos2, seq_index[pos1], seq_index[pos2]] * mask[pos1, pos2] # Taken two times + j_correction += model_J[pos1, pos2, aa1, seq_index[pos2]] * mask[pos1, pos2] # Correction for incorrect addition in the for loop + j_correction += model_J[pos1, pos2, seq_index[pos1], aa2] * mask[pos1, pos2] # Correction for incorrect addition in the for loop + j_correction -= model_J[pos1, pos2, aa1, aa2] * mask[pos1, pos2] # Correct combination + energy_difference += j_correction + #import pdb; pdb.set_trace() + #self.total_energies[counter] += energy_difference + self.total_energies[pos][counter] += energy_difference + + new_std = self.total_energies[pos].std() + delta_std = new_std - self.stds[pos] + self.stds[pos] = new_std + return delta_std + + self.compute_energy=compute_energy + self.compute_denergy_mutation=compute_denergy_mutation + self.compute_denergy_swap=compute_denergy_swap + +class FourBodyPottsModel(EnergyTerm): + """ Potts model with 3-body and 4-body terms. + This is mainly the same as the AWSEMEnergy class, but 2 important differences: + 1. 
We don't recognize any mask that may be associated with the "model" input + that's because this class is intended to evaluate changes in a covariance matrix, + which should not be masked + 2. By definition, all coefficients are 1: "Energy" = h + J + K + L + unlike real interactions, the sum over the entire covariance matrix + isn't double counting, so we don't need to multiply by 1/2 + """ + def __init__(self, model:Frustratometer, alphabet=_AA, use_numba=True): + self._use_numba=use_numba + self.model=model + self.alphabet=alphabet + self.model_h = model.potts_model['h'] + self.model_J = model.potts_model['J'] + + class AwsemEnergyAverage(EnergyTerm): def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self._use_numba=use_numba @@ -512,7 +658,6 @@ def regression_test(self, seq_index): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" - class AwsemEnergyVariance(EnergyTerm): def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self._use_numba=use_numba From 3ead94c0ea957e2b2f4287e95697724d04934ed0 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Mon, 28 Jul 2025 18:28:02 -0500 Subject: [PATCH 25/76] temporary md decoys thing for single temperature only --- frustratometer/optimization/optimization.py | 37 +++++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 39d70b12..dcf2651a 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -418,10 +418,17 @@ def compute_energy(seq_index: np.array) -> float: decoy_energy = energy_h + energy_J / 2 to_append[counter] = decoy_energy - self.total_energies.append(to_append) + if len(self.total_energies) == 0: + self.total_energies.append(to_append) # this may result in the total_energies list being repeated a few times because 
compute_energy is called a few times + else: + self.total_energies[0] = to_append + #breakpoint() std = to_append.std() - self.stds.append(std) + if len(self.stds) == 0: + self.stds.append(std) # this may result in the list being too long because compute_energy is called a few times + else: + self.stds[0] = std #std = np.array([1,2]).var()#total_energies.var() # doing variance for now because variances are additive #self.stds.append(std) return std @@ -446,11 +453,14 @@ def compute_denergy_mutation(seq_index: np.ndarray, pos: int, aa_new: int) -> fl j_correction -= model_J[pos, pos, aa_old, aa_old] * mask[pos, pos] j_correction += model_J[pos, pos, aa_new, aa_new] * mask[pos, pos] energy_difference += j_correction - self.total_energies[pos][counter] += energy_difference + try: + self.total_energies[0][counter] += energy_difference + except IndexError: + import pdb; pdb.set_trace() #import pdb; pdb.set_trace() - new_std = self.total_energies[pos].std() - delta_std = new_std - self.stds[pos] - self.stds[pos] = new_std + new_std = self.total_energies[0].std() + delta_std = new_std - self.stds[0] + self.stds[0] = new_std return delta_std def compute_denergy_swap(seq_index, pos1, pos2): @@ -482,11 +492,11 @@ def compute_denergy_swap(seq_index, pos1, pos2): energy_difference += j_correction #import pdb; pdb.set_trace() #self.total_energies[counter] += energy_difference - self.total_energies[pos][counter] += energy_difference + self.total_energies[0][counter] += energy_difference - new_std = self.total_energies[pos].std() - delta_std = new_std - self.stds[pos] - self.stds[pos] = new_std + new_std = self.total_energies[0].std() + delta_std = new_std - self.stds[0] + self.stds[0] = new_std return delta_std self.compute_energy=compute_energy @@ -1054,9 +1064,13 @@ def montecarlo_steps(temperature, seq_index, n_steps = 1000, kb = 0.008314) -> n for _ in range(n_steps): new_sequence, energy_difference = sequence_swap(seq_index) if np.random.random() > 0.5 else 
sequence_mutation(seq_index) exponent= (-energy_difference) / (kb * temperature + 1E-10) + #breakpoint() acceptance_probability = np.exp(min(0, exponent)) + assert ((acceptance_probability == 0) or (acceptance_probability ==1)), acceptance_probability + #print(acceptance_probability) if np.random.random() < acceptance_probability: seq_index = new_sequence + print(f"energy_difference: {energy_difference}") return seq_index montecarlo_steps=self.numbify(montecarlo_steps) @@ -1146,7 +1160,7 @@ def annealing(self, seq_index=None, temperatures=np.arange(500,0,-1), n_steps=in seq_index = self.generate_random_sequences(1)[0] done_steps=0 - total_energy = self.energy.energy(seq_index) + total_energy = self.energy.energy(seq_index, ) #Write data to file step_data={'Step': done_steps, 'Temperature': temperatures[0], 'Sequence': index_to_sequence(seq_index,alphabet=self.alphabet), 'TotalEnergy': total_energy} @@ -1155,6 +1169,7 @@ def annealing(self, seq_index=None, temperatures=np.arange(500,0,-1), n_steps=in for t,temp in enumerate(temperatures): steps=(n_steps-done_steps)//(len(temperatures)-t) + assert steps >= 1, f"steps: {steps}" seq_index= self.montecarlo_steps(temp, seq_index, n_steps=steps) total_energy = self.energy.energy(seq_index) done_steps+=steps From 7f04db53b2d0e4f117aac60e375a27429299e215 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 31 Jul 2025 10:01:11 -0500 Subject: [PATCH 26/76] updated indicator function saving --- frustratometer/classes/AWSEM.py | 53 +++++++++++---------- frustratometer/optimization/optimization.py | 4 +- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index baafda3a..e012078e 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -6,6 +6,7 @@ from pydantic import BaseModel, Field, ConfigDict from pydantic.types import Path from typing import List,Optional,Union,Generator +import os __all__ = 
['AWSEM','AWSEMIndicators','DecoyEnsemble', 'AWSEMVariancePotts'] @@ -737,31 +738,31 @@ def __init__(self, # However, we do need to pass through parameters used to generate the indicator functions awsem_obj = AWSEM(next(pdb_structures), expose_indicator_functions=True, repair_pdb=True, **parameters) self.N = awsem_obj.N # number of residues - with open('burial_indicators.npy','ab') as f: + with open('burial_indicators/burial_indicator_0.npy','ab') as f: np.save(f,awsem_obj.burial_indicator) - with open('direct_indicators.npy','ab') as f: + with open('direct_indicators/direct_indicator_0.npy','ab') as f: np.save(f,awsem_obj.direct_indicator) - with open('protein_indicators.npy','ab') as f: + with open('protein_indicators/protein_indicator_0.npy','ab') as f: np.save(f,awsem_obj.protein_indicator) - with open('water_indicators.npy','ab') as f: + with open('water_indicators/water_indicator_0.npy','ab') as f: np.save(f,awsem_obj.water_indicator) - with open('electrostatics_indicators.npy','ab') as f: + with open('electrostatics_indicators/electrostatics_indicator_0.npy','ab') as f: if hasattr(awsem_obj, 'electrostatics_indicator'): np.save(f,awsem_obj.electrostatics_indicator) else: np.save(f,None) - for pdb_structure in pdb_structures: # iterate over the rest of the structures without re-initializing the entire AWSEM class + for counter, pdb_structure in enumerate(pdb_structures): # iterate over the rest of the structures without re-initializing the entire AWSEM class awsem_obj.pdb_structure = pdb_structure # we can use the pdb_structure setter to update structural # stuff without fully re-initializing the object - with open('burial_indicators.npy','ab') as f: + with open(f'burial_indicators/burial_indicator_{counter+1}.npy','ab') as f: np.save(f,awsem_obj.burial_indicator) - with open('direct_indicators.npy','ab') as f: + with open(f'direct_indicators/direct_indicator_{counter+1}.npy','ab') as f: np.save(f,awsem_obj.direct_indicator) - with 
open('protein_indicators.npy','ab') as f: + with open(f'protein_indicators/protein_indicator_{counter+1}.npy','ab') as f: np.save(f,awsem_obj.protein_indicator) - with open('water_indicators.npy','ab') as f: + with open(f'water_indicators/water_indicator_{counter+1}.npy','ab') as f: np.save(f,awsem_obj.water_indicator) - with open('electrostatics_indicators.npy','ab') as f: + with open(f'electrostatics_indicators/electrostatics_indicator_{counter+1}.npy','ab') as f: if hasattr(awsem_obj, 'electrostatics_indicator'): np.save(f,awsem_obj.electrostatics_indicator) else: @@ -798,30 +799,32 @@ def __init__(self, # be able to reinitialize the generators. We accomplish this with properties @property def burial_indicators(self): - return self.get_indicators("burial_indicators.npy") + return self.get_indicators("burial_indicators") @property def direct_indicators(self): - return self.get_indicators("direct_indicators.npy") + return self.get_indicators("direct_indicators") @property def protein_indicators(self): - return self.get_indicators("protein_indicators.npy") + return self.get_indicators("protein_indicators") @property def water_indicators(self): - return self.get_indicators("water_indicators.npy") + return self.get_indicators("water_indicators") @property def electrostatics_indicators(self): - return self.get_indicators("electrostatics_indicators.npy") + return self.get_indicators("electrostatics_indicators") # allows us to process indicators without holding them all in memory # this requires that every method that acts on the indicators iterates over them - def get_indicators(self, filename): - # expecting a numpy file - with open(filename, 'rb') as f: - while True: - try: - yield np.load(f, allow_pickle=True) # needed to load None if not electrostatics - except EOFError: - break + def get_indicators(self, directory): + # expecting a directory containing numpy files + for filename in sorted(os.listdir(directory)): + yield np.load(f"{directory}/{filename}") + #with 
open(filename, 'rb') as f: + # while True: + # try: + # yield np.load(f, allow_pickle=True) # allow_pickle=True needed to load None if not electrostatics + # except EOFError: + # break # average indicator functions over all decoys # these averages can then be averaged to get the average of all indicator functions over all decoys @@ -977,4 +980,4 @@ def all_decoy_indicators(self): all_prot.append(prot) all_wat.append(wat) all_elec.append(elec) - return all_burial, all_direct, all_prot, all_wat, all_elec \ No newline at end of file + return all_burial, all_direct, all_prot, all_wat, all_elec diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index dcf2651a..663347a3 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -1066,11 +1066,11 @@ def montecarlo_steps(temperature, seq_index, n_steps = 1000, kb = 0.008314) -> n exponent= (-energy_difference) / (kb * temperature + 1E-10) #breakpoint() acceptance_probability = np.exp(min(0, exponent)) - assert ((acceptance_probability == 0) or (acceptance_probability ==1)), acceptance_probability + #assert ((acceptance_probability == 0) or (acceptance_probability ==1)), acceptance_probability #print(acceptance_probability) if np.random.random() < acceptance_probability: seq_index = new_sequence - print(f"energy_difference: {energy_difference}") + #print(f"energy_difference: {energy_difference}") return seq_index montecarlo_steps=self.numbify(montecarlo_steps) From 7965940e1bcf9ba38c0a984ea798448a970503df Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 31 Jul 2025 10:04:50 -0500 Subject: [PATCH 27/76] fixed handling of standard deviation calculation --- frustratometer/optimization/EnergyTerm.py | 5 +++ frustratometer/optimization/optimization.py | 38 +++++++++++++++------ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/frustratometer/optimization/EnergyTerm.py 
b/frustratometer/optimization/EnergyTerm.py index 6cd87bc6..f75ee8b6 100644 --- a/frustratometer/optimization/EnergyTerm.py +++ b/frustratometer/optimization/EnergyTerm.py @@ -216,8 +216,13 @@ def __sub__(self, other): def __truediv__(self, other): new_energy_term = EnergyTerm() + if isinstance(other, EnergyTerm): new_energy_term.use_numba = self.use_numba and other.use_numba + new_energy_term.total_energies = other.total_energies + new_energy_term.consider_total_energies = other.consider_total_energies + new_energy_term.stds = other.stds + new_energy_term.consider_stds = other.consider_stds e1=self.energy_function; e2=other.energy_function m1=self.denergy_mutation_function; m2=other.denergy_mutation_function s1=self.denergy_swap_function; s2=other.denergy_swap_function diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index dcf2651a..e429aa78 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -4,6 +4,7 @@ import csv from functools import wraps from datetime import datetime +import copy from frustratometer.classes import Frustratometer from frustratometer.classes import Structure @@ -392,6 +393,8 @@ def __init__(self, all_burial, all_direct, all_prot, all_wat, all_elec, sequence self.stds = [] self.total_energies = [] + self.consider_stds = [] + self.consider_total_energies = [] self.initialize_functions() @@ -420,15 +423,19 @@ def compute_energy(seq_index: np.array) -> float: if len(self.total_energies) == 0: self.total_energies.append(to_append) # this may result in the total_energies list being repeated a few times because compute_energy is called a few times + self.consider_total_energies.append(copy.deepcopy(to_append)) else: self.total_energies[0] = to_append + self.consider_total_energies[0] = copy.deepcopy(to_append) #breakpoint() std = to_append.std() if len(self.stds) == 0: self.stds.append(std) # this may result in the list being too long 
because compute_energy is called a few times + self.consider_stds.append(std) else: self.stds[0] = std + self.consider_stds[0] = std #std = np.array([1,2]).var()#total_energies.var() # doing variance for now because variances are additive #self.stds.append(std) return std @@ -453,14 +460,15 @@ def compute_denergy_mutation(seq_index: np.ndarray, pos: int, aa_new: int) -> fl j_correction -= model_J[pos, pos, aa_old, aa_old] * mask[pos, pos] j_correction += model_J[pos, pos, aa_new, aa_new] * mask[pos, pos] energy_difference += j_correction - try: - self.total_energies[0][counter] += energy_difference - except IndexError: - import pdb; pdb.set_trace() + # our mutation might be rejected, so we don't want to overwrite self.total_energies + self.consider_total_energies[0][counter] = self.total_energies[0][counter] + energy_difference + #print(f"energy difference: {energy_difference}") + #assert not np.all(np.array(self.total_energies[0])==np.array(self.consider_total_energies[0])) #import pdb; pdb.set_trace() - new_std = self.total_energies[0].std() + new_std = self.consider_total_energies[0].std() + self.consider_stds[0] = new_std # our mutation might be rejected, so we don't want to overwrite self.stds delta_std = new_std - self.stds[0] - self.stds[0] = new_std + #print(f"mutation: {delta_std}") return delta_std def compute_denergy_swap(seq_index, pos1, pos2): @@ -492,11 +500,15 @@ def compute_denergy_swap(seq_index, pos1, pos2): energy_difference += j_correction #import pdb; pdb.set_trace() #self.total_energies[counter] += energy_difference - self.total_energies[0][counter] += energy_difference - - new_std = self.total_energies[0].std() + self.consider_total_energies[0][counter] = self.total_energies[0][counter] + energy_difference + #print(f"energy difference: {energy_difference}") + #breakpoint() + #import pdb; pdb.set_trace() + #assert not np.all(np.array(self.total_energies[0])==np.array(self.consider_total_energies[0])) + new_std = 
self.consider_total_energies[0].std() + self.consider_stds[0] = new_std delta_std = new_std - self.stds[0] - self.stds[0] = new_std + #print(f"swap: {delta_std}") return delta_std self.compute_energy=compute_energy @@ -1070,7 +1082,11 @@ def montecarlo_steps(temperature, seq_index, n_steps = 1000, kb = 0.008314) -> n #print(acceptance_probability) if np.random.random() < acceptance_probability: seq_index = new_sequence - print(f"energy_difference: {energy_difference}") + #print(f"before reassignment: {self.energy.stds}") + self.energy.stds = copy.deepcopy(self.energy.consider_stds) + #print(f"after reassignment: {self.energy.stds}") + self.energy.total_energies = copy.deepcopy(self.energy.consider_total_energies) + #print(f"energy_difference: {energy_difference}") return seq_index montecarlo_steps=self.numbify(montecarlo_steps) From d19638a9c338c4db4d3eefec9dadd51643702834 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 2 Aug 2025 15:24:25 -0500 Subject: [PATCH 28/76] added code to support alternative md decoy variance calculation but need to remove the total_energies, consider_total_energies, stds, and consider_stds tracking --- frustratometer/optimization/optimization.py | 215 +++++++++++++++++++- 1 file changed, 210 insertions(+), 5 deletions(-) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 7f67699a..a666bffd 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -532,7 +532,6 @@ def __init__(self, model:Frustratometer, alphabet=_AA, use_numba=True): self.model_h = model.potts_model['h'] self.model_J = model.potts_model['J'] - class AwsemEnergyAverage(EnergyTerm): def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self._use_numba=use_numba @@ -942,11 +941,29 @@ def compute_energy(seq_index): for i in range(len_indicators2D): for j in range(len_alphabet): for k in range(len_alphabet): - t=1 if j==k else 0 - phi_mean[c] 
= indicator_means[i+ len_indicators1D] * counts[j] * (counts[k] - t) + t=1 if j==k else 0 # I don't know why we do this + phi_mean[c] = indicator_means[i+ len_indicators1D] * counts[j] * (counts[k] - t) c += 1 B = build_mean_inner_product_matrix(counts,indicators1D,indicators2D,region_means) + # B[i,j] - phi_mean[i]*phi_mean[j] is the covariance of some avg-indicator/gamma + # product i with some other avg-indicator/gamma product j + # + # we can think of computing the total variance (summing this matrix) + # as evaluating a potts model (covariances playing the role of "energies") that has + # 3 fields (the burial indicator function for each density bin) and + # 4 couplings (direct, prot, wat, and elec pairwise indicators) + # + # because we averaged over all indicators and they're playing the role of + # couplings and fields in our model, this is a "mean field" approach + # + # more precisely, the above-described strategy is mean-field + # with respect to the indicators, representing sequence shuffling; + # to represent structure shuffling, we want to do the mean-field calculation + # with respect to the gammas, meaning that we average the gammas to get the + # couplings and fields, then multiply by the indicators + # + energy=0 for i in range(phi_len): for j in range(phi_len): @@ -980,6 +997,196 @@ def regression_test(self, seq_index): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" +class AwsemEnergyStdFromCovMatrix(EnergyTerm): + def __init__(self, covariance_matrix: np.ndarray, + burial_gamma: np.ndarray, + direct_gamma: np.ndarray, + protein_gamma: np.ndarray, + water_gamma: np.ndarray, + electrostatics_gamma: np.ndarray, + use_numba = True, alphabet = _AA): + """ + covariance_matrix: np.ndarray + Covariance matrix of all __indicator functions___ (not residues) over a decoy set. 
+ Should have the following structure: + ___________________________________________________________________________________________ + burial pairwise + _____________________________________________ + position 1 low | . | + ... | . | + position N low | . | + position 1 med | burial-burial . burial-pairwise | + burial ... | . | + position N med | . | + position 1 high | covariances . covariances | + ... | . | + position N high | . | + ------------------------------------------------------------------------------------------- + direct interaction 1 | . | + ... | . | + direct interaction (N**2-N)/2 | . | + prot interaction 1 | . | + ... | . | + pairwise prot interaction (N**2-N)/2 | burial-pairwise . pairwise-pairwise | + wat interaction 1 | covariances . covariances | + ... | . | + wat interaction (N**2-N)/2 | . | + elec interaction 1 | . | + ... | . | + elec interaction (N**2-N)/2 | . | + ___________________________________________________________________________________________| + + This matrix should have the same form as the covariance matrix that would be passed into + AwsemVariancePotts, if we were doing things that way. + + gamma arrays: INPUTS MUST BE REINDEXED according to the alphabet used! + + """ + # check input + if not len(covariance_matrix.shape) == 2: + raise ValueError(f"covariance_matrix must have dimension 2 but was {len(covariance_matrix.shape)}") + if not covariance_matrix.shape[0] == covariance_matrix.shape[1]: + raise ValueError(f"covariance_matrix dimensions were not equal. 
covariance_matrix.shape: {covariance_matrix.shape}") + if not burial_gamma.shape[1] == 3: + raise ValueError(f"burial_gamma.shape[1] should be 3 but was {burial_gamma.shape[1]}") + if not direct_gamma.shape[0]==direct_gamma.shape[1]\ + or not protein_gamma.shape[0]==protein_gamma.shape[1]\ + or not water_gamma.shape[0]==water_gamma.shape[1]\ + or not electrostatics_gamma.shape[0]==electrostatics_gamma.shape[1]: + raise ValueError("check gamma shapes") + if not burial_gamma.shape[0] == len(alphabet): + raise ValueError(f"alphabet {alphabet} and burial_gamma shape {burial_gamma.shape} are inconsistent") + if not direct_gamma.shape[0] == len(alphabet): + raise ValueError(f"alphabet {alphabet} and direct_gamma shape {direct_gamma.shape} are inconsistent") + if not protein_gamma.shape[0] == len(alphabet): + raise ValueError(f"alphabet {alphabet} and protein_gamma shape {protein_gamma.shape} are inconsistent") + if not water_gamma.shape[0] == len(alphabet): + raise ValueError(f"alphabet {alphabet} and water_gamma shape {water_gamma.shape} are inconsistent") + if not electrostatics_gamma.shape[0] == len(alphabet): + raise ValueError(f"alphabet {alphabet} and electrostatics_gamma shape {electrostatics_gamma.shape} are inconsistent") + # set attributes + self.covariance_matrix = covariance_matrix + self._use_numba = use_numba + self.alphabet = alphabet + self.alphabet_size = len(alphabet) + N = 0 + while 3*N + 4*((N**2-N)/2) < self.covariance_matrix.shape[0]: + N += 1 + if not 3*N + 4*((N**2-N)/2) == self.covariance_matrix.shape[0]: + raise ValueError(f"the covariance matrix seems to have been constructed incorrectly. 
covariance_matrix.shape: {covariance_matrix.shape}") + self.N = N # number of amino acids + # compute products of gamma parameters for each indicator class + # (burial low density, burial med, burial high, direct, prot, wat, elec) + # for each combination of amino acids (in general, 4 total because we have + # to evaluate the covariance of two pairwise indicators each depending on + # the amino acid identity at 2 different positions in the sequence) + gamma = np.zeros((7,7,len(alphabet),len(alphabet),len(alphabet),len(alphabet))) + # we don't need the third and fourth axes for burial-burial covariances, so we copy the + # 2D outer product along both new axes so that the gamma array is not ragged + # (the third and fourth axes are needed for other terms) + #gamma[0,0] = np.repeat(np.outer(burial_gamma[:,0],burial_gamma[:,0])[:,None,:,None], len(alphabet), axis=1)# low burial- low burial + gamma[0,0] = np.outer(burial_gamma[:,0],burial_gamma[:,0])[:,None,:,None]# low burial- low burial + gamma[0,1] = np.outer(burial_gamma[:,0],burial_gamma[:,1])[:,None,:,None]# low burial- med burial + gamma[0,2] = np.outer(burial_gamma[:,0],burial_gamma[:,2])[:,None,:,None]# low burial- high burial + gamma[0,3] = np.einsum('i,jk->ijk', burial_gamma[:,0], direct_gamma)[:,None,:,:] # low burial- direct + gamma[0,4] = np.einsum('i,jk->ijk', burial_gamma[:,0], protein_gamma)[:,None,:,:] # low burial- prot + gamma[0,5] = np.einsum('i,jk->ijk', burial_gamma[:,0], water_gamma)[:,None,:,:] # low burial- wat + gamma[0,6] = np.einsum('i,jk->ijk', burial_gamma[:,0], electrostatics_gamma)[:,None,:,:] # low burial- elec + gamma[1,1] = np.outer(burial_gamma[:,1],burial_gamma[:,1])[:,None,:,None]# med burial- med burial + gamma[1,2] = np.outer(burial_gamma[:,1],burial_gamma[:,2])[:,None,:,None]# med burial- high burial + gamma[1,3] = np.einsum('i,jk->ijk', burial_gamma[:,1], direct_gamma)[:,None,:,:] # med burial- direct + gamma[1,4] = np.einsum('i,jk->ijk', burial_gamma[:,1], 
protein_gamma)[:,None,:,:] # med burial- prot + gamma[1,5] = np.einsum('i,jk->ijk', burial_gamma[:,1], water_gamma)[:,None,:,:] # med burial- wat + gamma[1,6] = np.einsum('i,jk->ijk', burial_gamma[:,1], electrostatics_gamma)[:,None,:,:] # med burial- elec + gamma[2,2] = np.outer(burial_gamma[:,2],burial_gamma[:,2])[:,None,:,None]# high burial- high burial + gamma[2,3] = np.einsum('i,jk->ijk', burial_gamma[:,2], direct_gamma)[:,None,:,:] # high burial- direct + gamma[2,4] = np.einsum('i,jk->ijk', burial_gamma[:,2], protein_gamma)[:,None,:,:] # high burial- prot + gamma[2,5] = np.einsum('i,jk->ijk', burial_gamma[:,2], water_gamma)[:,None,:,:] # high burial- wat + gamma[2,6] = np.einsum('i,jk->ijk', burial_gamma[:,2], electrostatics_gamma)[:,None,:,:] # high burial- elec + gamma[3,3] = np.einsum('ij,kl->ijkl', direct_gamma, direct_gamma) # direct- direct + gamma[3,4] = np.einsum('ij,kl->ijkl', direct_gamma, protein_gamma) # direct- prot + gamma[3,5] = np.einsum('ij,kl->ijkl', direct_gamma, water_gamma) # direct- wat + gamma[3,6] = np.einsum('ij,kl->ijkl', direct_gamma, electrostatics_gamma) # direct- elec + gamma[4,4] = np.einsum('ij,kl->ijkl', protein_gamma, protein_gamma) # prot- prot + gamma[4,5] = np.einsum('ij,kl->ijkl', protein_gamma, water_gamma) # prot- wat + gamma[4,6] = np.einsum('ij,kl->ijkl', protein_gamma, electrostatics_gamma) # prot- elec + gamma[5,5] = np.einsum('ij,kl->ijkl', water_gamma, water_gamma) # wat- wat + gamma[5,6] = np.einsum('ij,kl->ijkl', water_gamma, electrostatics_gamma) # wat- elec + gamma[6,6] = np.einsum('ij,kl->ijkl', electrostatics_gamma, electrostatics_gamma) # elec- elec + self.gamma = gamma + gamma.transpose((0,1,5,4,3,2)) # keep the indicator class axes the same + # but transpose the gamma values + + @staticmethod + def covariance_type(N, i, j): + # N: total number of residues + # i: first position in the covariance matrix + # j: second position in the covariance matrix + if i < 3*N: + type_i = i//N + else: + type_i = 
(i-3*N)//((N**2-N)/2) + if j < 3*N: + type_j = j//N + else: + type_j = (j-3*N)//((N**2-N)/2) + return (type_i, type_j) + + @staticmethod + def residue_identities(i, j, seq_index, indexing_helper_rowflatten, indexing_helper_columnflatten): + # indexing helper rowflatten looks like [0, ..., 0, 1, ..., 1, ..., N] + # ^ repeated N times + # ^ repeated N-1 times + # ^ repeated once + # indexing helper columnflatten looks like [0, ..., N, 1, ..., N, ..., N] + if i < 3*N: + i_pos = i%3 + i_aa = (seq_index[i_pos], seq_index[i_pos]) + else: + i_pos1 = indexing_helper_rowflatten[(i-3*N)%((N**2-N)/2)] + i_pos2 = indexing_helper_columnflatten[(i-3*N)%((N**2-N)/2)] + i_aa = (seq_index[i_pos1, i_pos2]) + if j < 3*N: + j_pos = j%3 + j_aa = (seq_index[j_pos],seq_index[j_pos]) + else: + j_pos1 = indexing_helper_rowflatten[(j-3*N)%((N**2-N)/2)] + j_pos2 = indexing_helper_columnflatten[(j-3*N)%((N**2-N)/2)] + j_aa = (seq_index[j_pos1, j_pos2]) + return i_pos + j_pos # concatenating tuples + + def initialize_functions(self): + covariance_matrix = self.covariance_matrix + gamma = self.gamma + N = self.N + indexing_helper_rowflatten = np.repeat(np.arange(N).reshape((1,N)),N,axis=1)[np.triu_indices(N)] + indexing_helper_columnflatten = np.transpose(np.repeat(np.arange(N).reshape((1,N)),N,axis=1))[np.triu_indices(N)] + covariance_type = self.covariance_type + residue_identities = self.residue_identities + + def compute_energy(seq_index): + energy = 0 + for i in range(covariance_matrix.shape[0]): + for j in range(i,covariance_matrix.shape[1]): + energy += covariance_matrix[i,j] * gamma[covariance_type(N,i,j)+residue_identities(i,j)] + return energy**0.5 + compute_energy_numba=self.numbify(compute_energy) + + def compute_denergy_mutation(seq_index, pos, aa): + seq_index_new = seq_index.copy() + seq_index_new[pos] = aa + return compute_energy_numba(seq_index_new) - compute_energy_numba(seq_index) + + def compute_denergy_swap(seq_index, pos1, pos2): + seq_index_new = seq_index.copy() + aa2 , 
aa1 = seq_index[pos1],seq_index[pos2] + seq_index_new[pos1] = aa1 + seq_index_new[pos2] = aa2 + return compute_energy_numba(seq_index_new) - compute_energy_numba(seq_index) + + self.compute_energy = compute_energy + self.compute_denergy_mutation = compute_denergy_mutation + self.compute_denergy_swap = compute_denergy_swap + class Similarity(EnergyTerm): """ Computes the energy of a sequence based on the similarity to a target sequence. The similarity is calculated as the number of positions that are the same in the two sequences. @@ -1011,8 +1218,6 @@ def denergy_swap(seq_index, pos1, pos2): self.compute_denergy_mutation = denergy_mutation self.compute_denergy_swap = denergy_swap - - class MonteCarlo: def __init__(self, sequence: str, energy: EnergyTerm, alphabet:str=_AA, use_numba:bool=True, evaluation_energies:dict={}): self.seq_len=len(sequence) From 2d8ff9d710d1db0ce45fffb0e491bee71bab97ca Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 2 Aug 2025 16:59:16 -0500 Subject: [PATCH 29/76] fixed errors --- frustratometer/optimization/optimization.py | 39 ++++++++++++--------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index a666bffd..89eb0a30 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -1115,7 +1115,9 @@ def __init__(self, covariance_matrix: np.ndarray, gamma[6,6] = np.einsum('ij,kl->ijkl', electrostatics_gamma, electrostatics_gamma) # elec- elec self.gamma = gamma + gamma.transpose((0,1,5,4,3,2)) # keep the indicator class axes the same # but transpose the gamma values - + # define energy evaluation functions + self.initialize_functions() + @staticmethod def covariance_type(N, i, j): # N: total number of residues @@ -1124,15 +1126,15 @@ def covariance_type(N, i, j): if i < 3*N: type_i = i//N else: - type_i = (i-3*N)//((N**2-N)/2) + type_i = (i-3*N)//((N**2-N)//2) if j < 3*N: type_j = 
j//N else: - type_j = (j-3*N)//((N**2-N)/2) + type_j = (j-3*N)//((N**2-N)//2) return (type_i, type_j) @staticmethod - def residue_identities(i, j, seq_index, indexing_helper_rowflatten, indexing_helper_columnflatten): + def residue_identities(N, i, j, seq_index, indexing_helper_rowflatten, indexing_helper_columnflatten): # indexing helper rowflatten looks like [0, ..., 0, 1, ..., 1, ..., N] # ^ repeated N times # ^ repeated N-1 times @@ -1142,32 +1144,35 @@ def residue_identities(i, j, seq_index, indexing_helper_rowflatten, indexing_hel i_pos = i%3 i_aa = (seq_index[i_pos], seq_index[i_pos]) else: - i_pos1 = indexing_helper_rowflatten[(i-3*N)%((N**2-N)/2)] - i_pos2 = indexing_helper_columnflatten[(i-3*N)%((N**2-N)/2)] - i_aa = (seq_index[i_pos1, i_pos2]) + i_pos1 = indexing_helper_rowflatten[(i-3*N)%((N**2-N)//2)] + i_pos2 = indexing_helper_columnflatten[(i-3*N)%((N**2-N)//2)] + i_aa = (seq_index[i_pos1], seq_index[i_pos2]) if j < 3*N: j_pos = j%3 - j_aa = (seq_index[j_pos],seq_index[j_pos]) + j_aa = (seq_index[j_pos], seq_index[j_pos]) else: - j_pos1 = indexing_helper_rowflatten[(j-3*N)%((N**2-N)/2)] - j_pos2 = indexing_helper_columnflatten[(j-3*N)%((N**2-N)/2)] - j_aa = (seq_index[j_pos1, j_pos2]) - return i_pos + j_pos # concatenating tuples + j_pos1 = indexing_helper_rowflatten[(j-3*N)%((N**2-N)//2)] + j_pos2 = indexing_helper_columnflatten[(j-3*N)%((N**2-N)//2)] + j_aa = (seq_index[j_pos1], seq_index[j_pos2]) + return i_aa + j_aa # concatenating tuples def initialize_functions(self): covariance_matrix = self.covariance_matrix gamma = self.gamma - N = self.N - indexing_helper_rowflatten = np.repeat(np.arange(N).reshape((1,N)),N,axis=1)[np.triu_indices(N)] - indexing_helper_columnflatten = np.transpose(np.repeat(np.arange(N).reshape((1,N)),N,axis=1))[np.triu_indices(N)] + N = self.N # number of amino acids + indexing_helper_rowflatten = np.repeat(np.arange(N).reshape((N,1)),N,axis=1)[np.triu_indices(N)] + indexing_helper_columnflatten = 
np.transpose(np.repeat(np.arange(N).reshape((N,1)),N,axis=1))[np.triu_indices(N)] covariance_type = self.covariance_type residue_identities = self.residue_identities - + def compute_energy(seq_index): energy = 0 for i in range(covariance_matrix.shape[0]): for j in range(i,covariance_matrix.shape[1]): - energy += covariance_matrix[i,j] * gamma[covariance_type(N,i,j)+residue_identities(i,j)] + try: + energy += covariance_matrix[i,j] * gamma[covariance_type(N,i,j)+residue_identities(N, i,j,seq_index, indexing_helper_rowflatten, indexing_helper_columnflatten )] + except: + breakpoint() return energy**0.5 compute_energy_numba=self.numbify(compute_energy) From bac836199d787aea7283176cc7c821bbb9bfd7ac Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 2 Aug 2025 17:00:44 -0500 Subject: [PATCH 30/76] removed total_energies, total_energies_consider, stds, and stds_consider --- frustratometer/optimization/EnergyTerm.py | 8 ++++---- frustratometer/optimization/optimization.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/frustratometer/optimization/EnergyTerm.py b/frustratometer/optimization/EnergyTerm.py index f75ee8b6..db6f558a 100644 --- a/frustratometer/optimization/EnergyTerm.py +++ b/frustratometer/optimization/EnergyTerm.py @@ -219,10 +219,10 @@ def __truediv__(self, other): if isinstance(other, EnergyTerm): new_energy_term.use_numba = self.use_numba and other.use_numba - new_energy_term.total_energies = other.total_energies - new_energy_term.consider_total_energies = other.consider_total_energies - new_energy_term.stds = other.stds - new_energy_term.consider_stds = other.consider_stds + #new_energy_term.total_energies = other.total_energies + #new_energy_term.consider_total_energies = other.consider_total_energies + #new_energy_term.stds = other.stds + #new_energy_term.consider_stds = other.consider_stds e1=self.energy_function; e2=other.energy_function m1=self.denergy_mutation_function; m2=other.denergy_mutation_function 
s1=self.denergy_swap_function; s2=other.denergy_swap_function diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 89eb0a30..c17fd2b8 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -1293,9 +1293,9 @@ def montecarlo_steps(temperature, seq_index, n_steps = 1000, kb = 0.008314) -> n if np.random.random() < acceptance_probability: seq_index = new_sequence #print(f"before reassignment: {self.energy.stds}") - self.energy.stds = copy.deepcopy(self.energy.consider_stds) + #self.energy.stds = copy.deepcopy(self.energy.consider_stds) #print(f"after reassignment: {self.energy.stds}") - self.energy.total_energies = copy.deepcopy(self.energy.consider_total_energies) + #self.energy.total_energies = copy.deepcopy(self.energy.consider_total_energies) #print(f"energy_difference: {energy_difference}") return seq_index From 3e29dd3824a1b81fa7249656870d543192114be0 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 6 Aug 2025 01:21:02 -0500 Subject: [PATCH 31/76] frustration-inspired calculation for group meeting --- frustratometer/classes/AWSEM.py | 1 + frustratometer/optimization/optimization.py | 245 +++++++++++++++++++- 2 files changed, 237 insertions(+), 9 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index e012078e..fd21f6d6 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -305,6 +305,7 @@ def calculate_indicators(self): self.direct_indicator = direct_indicator # probably could get rid of either this or indicators list self.water_indicator = water_indicator # probably could get rid of either this or indicators list self.protein_indicator = protein_indicator # probably could get rid of either this or indicators list + #breakpoint() if self.p.k_electrostatics != 0: electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / 
self.p.electrostatics_screening_length) * self.electrostatics_mask self.indicators.append(electrostatics_indicator) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index c17fd2b8..9669ac0e 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -6,6 +6,7 @@ from datetime import datetime import copy +from frustratometer import frustration from frustratometer.classes import Frustratometer from frustratometer.classes import Structure from frustratometer.classes import AWSEM, AWSEMIndicators, DecoyEnsemble, AWSEMVariancePotts @@ -679,6 +680,233 @@ def regression_test(self, seq_index): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" +class PairEnergyAverage(EnergyTerm): + """ + Computes the average pairwise energy for a given sequence. + This class is designed to compute the average pairwise energy of a sequence + using the AWSEM model. It calculates the energy contributions from pairwise interactions + between amino acids in the sequence, averaged over all possible pairs. + """ + def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): + self._use_numba=use_numba + self.model=model + self.alphabet=alphabet + self.reindex_dca=[_AA.index(aa) for aa in alphabet] + assert "indicators" in model.__dict__.keys(), "Indicator functions were not exposed. Initialize AWSEM function with `expose_indicator_functions=True` first." 
+ self.indicators = model.indicators + self.alphabet_size=len(alphabet) + self.model_h = model.potts_model['h'][:,self.reindex_dca] + self.model_J = model.potts_model['J'][:,:,self.reindex_dca][:,:,:,self.reindex_dca] + self.mask = model.mask + self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) + self.initialize_functions() + + def initialize_functions(self): + len_alphabet=self.alphabet_size + + distances = np.triu(self.model.distance_matrix) + ########################################################################################### + distances = distances[(distances0)] # USE THIS NORMALLY + #distances = distances[distances>0] # USE THIS FOR TESTING THING WHERE WE NEED ALL PAIRS + ########################################################################################### + len_distances = len(distances) + + rho_b = np.expand_dims(self.model.rho_r, 1) #(n,1) + rho1 = np.expand_dims(self.model.rho_r, 0) #(1,n) + rho2 = np.expand_dims(self.model.rho_r, 1) #(n,1) + + sigma_water = 0.25 * (1 - np.tanh(self.model.eta_sigma * (rho1 - self.model.rho_0))) * (1 - np.tanh(self.model.eta_sigma * (rho2 - self.model.rho_0))) #(n,n) + sigma_protein = 1 - sigma_water #(n,n) + + #Calculate theta and indicators + theta = 0.25 * (1 + np.tanh(self.model.eta * (distances - self.model.r_min))) * (1 + np.tanh(self.model.eta * (self.model.r_max - distances))) # (c,) + thetaII = 0.25 * (1 + np.tanh(self.model.eta * (distances - self.model.r_minII))) * (1 + np.tanh(self.model.eta * (self.model.r_maxII - distances))) #(c,) + burial_indicator = np.tanh(self.model.burial_kappa * (rho_b - self.model.burial_ro_min)) + np.tanh(self.model.burial_kappa * (self.model.burial_ro_max - rho_b)) #(n,3) + # gap has 0 charge + # gap, A,C,D, E, F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y + charges = np.array([0, 0,0,-1,-1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0]) + charges = charges[self.reindex_dca] # remove unused 
gap, C, and P + electrostatics_indicator = np.exp(-distances / self.model.electrostatics_screening_length) / distances + + N = self.model.N + k_contact = self.model.k_contact + burial_gamma = self.model.burial_gamma[self.model.aa_map_awsem_list][self.reindex_dca] + direct_gamma = self.model.direct_gamma[self.model.aa_map_awsem_x, self.model.aa_map_awsem_y][self.reindex_dca][:,self.reindex_dca] + water_gamma = self.model.water_gamma[self.model.aa_map_awsem_x, self.model.aa_map_awsem_y][self.reindex_dca][:,self.reindex_dca] + protein_gamma = self.model.protein_gamma[self.model.aa_map_awsem_x, self.model.aa_map_awsem_y][self.reindex_dca][:,self.reindex_dca] + k_contact = self.model.k_contact + k_electrostatics = self.model.k_electrostatics + mask = self.mask + sequence_mask_contact = self.model.sequence_mask_contact + assert np.all(mask==mask.T), "Mask should be symmetric" + assert mask.shape == (N, N), f"Mask shape {mask.shape} does not match expected shape {(N, N)}" + + n_decoys=4000 + + def compute_energy(seq_index): + # adapted from AWSEM.compute_configurational_decoy_statistics, + # modified to be numba-friendly + + decoy_energies=np.zeros(n_decoys) + for i in range(n_decoys): + c=np.random.randint(0,len_distances) + n1=np.random.randint(0,N) + n2=np.random.randint(0,N) + qi1=np.random.randint(0,N) + qi2=np.random.randint(0,N) + q1=seq_index[qi1] + q2=seq_index[qi2] + + burial_energy1 = (-0.5 * k_contact * burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) + burial_energy2 = (-0.5 * k_contact * burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) + burial_energy = (burial_energy1+burial_energy2)/(N-1) # normalize because double counting carlos thing + + direct = theta[c] * direct_gamma[q1, q2] + water_mediated = sigma_water[n1,n2] * thetaII[c] * water_gamma[q1,q2] + protein_mediated = sigma_protein[n1,n2] * thetaII[c] * protein_gamma[q1,q2] + contact_energy = -k_contact * (direct+water_mediated+protein_mediated) + electrostatics_energy = k_electrostatics * 
electrostatics_indicator[c]*charges[q1]*charges[q2] + + decoy_energies[i]=(burial_energy+contact_energy+electrostatics_energy) + mean_decoy_energy = np.mean(decoy_energies) + #std_decoy_energy = np.std(decoy_energies) + + """ + assert len(distances) == (N**2-N)/2, f"len(distances): {len(distances)} != (N**2-N)/2: {(N**2-N)/2}" + decoy_energies = np.zeros((len(seq_index)**2 - len(seq_index))//2) + index = 0 + #breakpoint() + for i in range(len(seq_index)): + aa1 = seq_index[i] + for j in range(i+1, len(seq_index)): + aa2 = seq_index[j] + + burial_energy1 = (-0.5 * k_contact * burial_gamma[aa1] * burial_indicator[i]).sum(axis=0) + burial_energy2 = (-0.5 * k_contact * burial_gamma[aa2] * burial_indicator[j]).sum(axis=0) + burial_energy = (burial_energy1 + burial_energy2) / ((N - 1)) #/ 2) + + direct = theta[index] * direct_gamma[aa1, aa2] + water_mediated = sigma_water[i, j] * thetaII[index] * water_gamma[aa1, aa2] + protein_mediated = sigma_protein[i, j] * thetaII[index] * protein_gamma[aa1, aa2] + contact_energy = -k_contact * (direct+water_mediated+protein_mediated)*mask[i, j]*sequence_mask_contact[i, j] + electrostatics_energy = k_electrostatics * electrostatics_indicator[index]*charges[aa1]*charges[aa2]*mask[i,j] + decoy_energies[index] = contact_energy+burial_energy+electrostatics_energy#(contact_energy+electrostatics_energy)#(burial_energy + contact_energy + electrostatics_energy) + index += 1 + mean_decoy_energy = np.sum(decoy_energies) # for testing, we return the total energy, not the average + """ + + + return mean_decoy_energy#, std_decoy_energy + + compute_energy_numba = self.numbify(compute_energy) + + def denergy_mutation(seq_index, pos, aa): + seq_index_new = seq_index.copy() + seq_index_new[pos] = aa + return compute_energy_numba(seq_index_new) - compute_energy_numba(seq_index) + + self.compute_energy = compute_energy + self.compute_denergy_mutation = denergy_mutation + + def regression_test(self, seq_index): + raise NotImplementedError("sorry") + 
+class PairEnergyStd(EnergyTerm): + def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): + self._use_numba=use_numba + self.model=model + self.alphabet=alphabet + self.reindex_dca=[_AA.index(aa) for aa in alphabet] + assert "indicators" in model.__dict__.keys(), "Indicator functions were not exposed. Initialize AWSEM function with `expose_indicator_functions=True` first." + self.indicators = model.indicators + self.alphabet_size=len(alphabet) + self.model_h = model.potts_model['h'][:,self.reindex_dca] + self.model_J = model.potts_model['J'][:,:,self.reindex_dca][:,:,:,self.reindex_dca] + self.mask = model.mask + self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) + self.initialize_functions() + + def initialize_functions(self): + len_alphabet=self.alphabet_size + + distances = np.triu(self.model.distance_matrix) + distances = distances[(distances0)] + len_distances = len(distances) + print(f"len_distances: {len_distances}, distances: {distances}") + + rho_b = np.expand_dims(self.model.rho_r, 1) #(n,1) + rho1 = np.expand_dims(self.model.rho_r, 0) #(1,n) + rho2 = np.expand_dims(self.model.rho_r, 1) #(n,1) + + sigma_water = 0.25 * (1 - np.tanh(self.model.eta_sigma * (rho1 - self.model.rho_0))) * (1 - np.tanh(self.model.eta_sigma * (rho2 - self.model.rho_0))) #(n,n) + sigma_protein = 1 - sigma_water #(n,n) + + #Calculate theta and indicators + theta = 0.25 * (1 + np.tanh(self.model.eta * (distances - self.model.r_min))) * (1 + np.tanh(self.model.eta * (self.model.r_max - distances))) # (c,) + thetaII = 0.25 * (1 + np.tanh(self.model.eta * (distances - self.model.r_minII))) * (1 + np.tanh(self.model.eta * (self.model.r_maxII - distances))) #(c,) + burial_indicator = np.tanh(self.model.burial_kappa * (rho_b - self.model.burial_ro_min)) + np.tanh(self.model.burial_kappa * (self.model.burial_ro_max - rho_b)) #(n,3) + + charges = np.array([0, 1, 0, 
-1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) + electrostatics_indicator = np.exp(-distances / self.model.electrostatics_screening_length) / distances + + N = self.model.N + k_contact = self.model.k_contact + burial_gamma = self.model.burial_gamma + direct_gamma = self.model.direct_gamma + water_gamma = self.model.water_gamma + protein_gamma = self.model.protein_gamma + k_contact = self.model.k_contact + k_electrostatics = self.model.k_electrostatics + + mask = self.mask + sequence_mask_contact = self.model.sequence_mask_contact + + n_decoys=4000 + + def compute_energy(seq_index): + # adapted from AWSEM.compute_configurational_decoy_statistics, + # modified to be numba-friendly + + decoy_energies=np.zeros(n_decoys) + for i in range(n_decoys): + c=np.random.randint(0,len_distances) + n1=np.random.randint(0,N) + n2=np.random.randint(0,N) + qi1=np.random.randint(0,N) + qi2=np.random.randint(0,N) + q1=seq_index[qi1] + q2=seq_index[qi2] + + burial_energy1 = (-0.5 * k_contact * burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) + burial_energy2 = (-0.5 * k_contact * burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) + burial_energy = (burial_energy1+burial_energy2)/(N-1) # normalize because double counting carlos thing + + direct = theta[c] * direct_gamma[q1, q2] + water_mediated = sigma_water[n1,n2] * thetaII[c] * water_gamma[q1,q2] + protein_mediated = sigma_protein[n1,n2] * thetaII[c] * protein_gamma[q1,q2] + contact_energy = -k_contact * (direct+water_mediated+protein_mediated) + electrostatics_energy = k_electrostatics * electrostatics_indicator[c]*charges[q1]*charges[q2] + + decoy_energies[i]=(burial_energy+contact_energy+electrostatics_energy) + + #mean_decoy_energy = np.mean(decoy_energies) + std_decoy_energy = np.std(decoy_energies) + return 1#std_decoy_energy + + compute_energy_numba = self.numbify(compute_energy) + + def denergy_mutation(seq_index, pos, aa): + seq_index_new = seq_index.copy() + seq_index_new[pos] = aa + return 
compute_energy_numba(seq_index_new) - compute_energy_numba(seq_index) + + self.compute_energy = compute_energy + self.compute_denergy_mutation = denergy_mutation + + def regression_test(self, seq_index): + raise NotImplementedError("sorry") + + class AwsemEnergyVariance(EnergyTerm): def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self._use_numba=use_numba @@ -864,6 +1092,7 @@ def regression_test(self, seq_index): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" + class AwsemEnergyStd(EnergyTerm): def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA, n_decoys=None): self._use_numba=use_numba @@ -893,7 +1122,7 @@ def initialize_functions(self): len_alphabet=self.alphabet_size phi_len= indicators1D.shape[0]*len_alphabet + indicators2D.shape[0]*len_alphabet**2 gamma=self.gamma - + rng = np.random.default_rng() # Precompute the mean of the indicators indicator_means=np.zeros(len(indicators1D)+len(indicators2D)) c=0 @@ -917,7 +1146,10 @@ def compute_energy(seq_index): """ Function to compute the variance of the energy of permutations of a sequence using random shuffling. 
This function is much faster than compute_energy_permutation but is an approximation""" energies=np.zeros(n_decoys) - shuffled_index=seq_index.copy() + shuffled_index=seq_index.copy() + for _ in numba.prange(20): + to_replace = rng.integers(low=0,high=len(seq_index)) + shuffled_index[to_replace] = rng.integers(low=0,high=len_alphabet) for i in numba.prange(n_decoys): energies[i]=awsem_energy(shuffled_index[np.random.permutation(len(shuffled_index))]) return np.var(energies) @@ -957,13 +1189,7 @@ def compute_energy(seq_index): # because we averaged over all indicators and they're playing the role of # couplings and fields in our model, this is a "mean field" approach # - # more precisely, the above-described strategy is mean-field - # with respect to the indicators, representing sequence shuffling; - # to represent structure shuffling, we want to do the mean-field calculation - # with respect to the gammas, meaning that we average the gammas to get the - # couplings and fields, then multiply by the indicators - # - + energy=0 for i in range(phi_len): for j in range(phi_len): @@ -997,6 +1223,7 @@ def regression_test(self, seq_index): energy=self.compute_energy(seq_index) assert np.isclose(energy,expected_energy), f"Expected energy {expected_energy} but got {energy}" + class AwsemEnergyStdFromCovMatrix(EnergyTerm): def __init__(self, covariance_matrix: np.ndarray, burial_gamma: np.ndarray, From 5405a148190136f403d34df2114df1f16d79bea1 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 22 Aug 2025 19:38:17 -0500 Subject: [PATCH 32/76] saving in the state where we can at least calculate the total energy by the looping method. 
now need to figure out how to get analytical mean and variance from the looping method --- frustratometer/optimization/optimization.py | 83 ++++++++++++++++----- 1 file changed, 63 insertions(+), 20 deletions(-) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 9669ac0e..6d60d08b 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -706,8 +706,8 @@ def initialize_functions(self): distances = np.triu(self.model.distance_matrix) ########################################################################################### - distances = distances[(distances0)] # USE THIS NORMALLY - #distances = distances[distances>0] # USE THIS FOR TESTING THING WHERE WE NEED ALL PAIRS + #distances = distances[(distances0)] # USE THIS NORMALLY + distances = distances[distances>0] # USE THIS FOR TESTING THING WHERE WE NEED ALL PAIRS ########################################################################################### len_distances = len(distances) @@ -741,12 +741,48 @@ def initialize_functions(self): assert np.all(mask==mask.T), "Mask should be symmetric" assert mask.shape == (N, N), f"Mask shape {mask.shape} does not match expected shape {(N, N)}" - n_decoys=4000 + n_decoys=9000 + + #foo = np.triu(self.model.distance_matrix) + #sigma_water = sigma_water[(foo0)] + #sigma_protein = sigma_protein[(foo0)] def compute_energy(seq_index): # adapted from AWSEM.compute_configurational_decoy_statistics, # modified to be numba-friendly + # analytic calculation + """ + aa_freq = np.array([(seq_index == i).sum() for i in range(len_alphabet)]) # frequency of each amino acid in the sequence + + temp = burial_gamma * aa_freq[:, np.newaxis] + scaled_burial_gamma = np.zeros((3,)) # burial gamma is a 3D vector, one for each burial indicator (low, med, high) + for counter in range(temp.shape[0]): + scaled_burial_gamma += temp[counter] + scaled_burial_gamma /= temp.shape[0] + 
#burial_energy = np.average(-1 * k_contact * scaled_burial_gamma * burial_indicator) + avg_burial_indicator = np.zeros((3,)) + for counter in range(burial_indicator.shape[0]): + avg_burial_indicator += burial_indicator[counter] + avg_burial_indicator /= burial_indicator.shape[0] + burial_energy = -1* k_contact * avg_burial_indicator * scaled_burial_gamma + burial_energy = (burial_energy[0] + burial_energy[1] + burial_energy[2])/(N-1) # sum over the three burial indicators + #breakpoint() + #assert type(burial_energy) == float + # direct, water-mediated, and protein-mediated contact energies + direct = np.average(theta) * np.average(direct_gamma * np.outer(aa_freq, aa_freq)) + #assert type(direct) == float + water_mediated = np.average(thetaII*sigma_water) * np.average(water_gamma * np.outer(aa_freq, aa_freq)) + #assert type(water_mediated) == float + protein_mediated = np.average(thetaII*sigma_protein) * np.average(protein_gamma * np.outer(aa_freq, aa_freq)) + #assert type(protein_mediated) == float + contact_energy = -k_contact * (direct + water_mediated + protein_mediated) + electrostatics_energy = k_electrostatics * np.average(electrostatics_indicator) * np.average(np.outer(aa_freq, aa_freq)*charges[:, np.newaxis]*charges[np.newaxis, :]) + #assert type(electrostatics_energy) == float + mean_decoy_energy = burial_energy + contact_energy + electrostatics_energy + #import pdb; pdb.set_trace() + """ + """# constructing the distribution by sampling, then computing the average decoy_energies=np.zeros(n_decoys) for i in range(n_decoys): c=np.random.randint(0,len_distances) @@ -770,8 +806,8 @@ def compute_energy(seq_index): decoy_energies[i]=(burial_energy+contact_energy+electrostatics_energy) mean_decoy_energy = np.mean(decoy_energies) #std_decoy_energy = np.std(decoy_energies) - """ + # checking that these energy functions are able to compute the total energy of the sequence assert len(distances) == (N**2-N)/2, f"len(distances): {len(distances)} != (N**2-N)/2: 
{(N**2-N)/2}" decoy_energies = np.zeros((len(seq_index)**2 - len(seq_index))//2) index = 0 @@ -793,9 +829,9 @@ def compute_energy(seq_index): decoy_energies[index] = contact_energy+burial_energy+electrostatics_energy#(contact_energy+electrostatics_energy)#(burial_energy + contact_energy + electrostatics_energy) index += 1 mean_decoy_energy = np.sum(decoy_energies) # for testing, we return the total energy, not the average - """ - + #""" + #import pdb; pdb.set_trace() return mean_decoy_energy#, std_decoy_energy compute_energy_numba = self.numbify(compute_energy) @@ -830,10 +866,12 @@ def initialize_functions(self): len_alphabet=self.alphabet_size distances = np.triu(self.model.distance_matrix) - distances = distances[(distances0)] + ########################################################################################### + distances = distances[(distances0)] # USE THIS NORMALLY + #distances = distances[distances>0] # USE THIS FOR TESTING THING WHERE WE NEED ALL PAIRS + ########################################################################################### len_distances = len(distances) - print(f"len_distances: {len_distances}, distances: {distances}") - + rho_b = np.expand_dims(self.model.rho_r, 1) #(n,1) rho1 = np.expand_dims(self.model.rho_r, 0) #(1,n) rho2 = np.expand_dims(self.model.rho_r, 1) #(n,1) @@ -845,28 +883,33 @@ def initialize_functions(self): theta = 0.25 * (1 + np.tanh(self.model.eta * (distances - self.model.r_min))) * (1 + np.tanh(self.model.eta * (self.model.r_max - distances))) # (c,) thetaII = 0.25 * (1 + np.tanh(self.model.eta * (distances - self.model.r_minII))) * (1 + np.tanh(self.model.eta * (self.model.r_maxII - distances))) #(c,) burial_indicator = np.tanh(self.model.burial_kappa * (rho_b - self.model.burial_ro_min)) + np.tanh(self.model.burial_kappa * (self.model.burial_ro_max - rho_b)) #(n,3) - - charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) + # gap has 0 charge + # gap, A,C,D, E, 
F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y + charges = np.array([0, 0,0,-1,-1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0]) + charges = charges[self.reindex_dca] # remove unused gap, C, and P electrostatics_indicator = np.exp(-distances / self.model.electrostatics_screening_length) / distances N = self.model.N k_contact = self.model.k_contact - burial_gamma = self.model.burial_gamma - direct_gamma = self.model.direct_gamma - water_gamma = self.model.water_gamma - protein_gamma = self.model.protein_gamma + burial_gamma = self.model.burial_gamma[self.model.aa_map_awsem_list][self.reindex_dca] + direct_gamma = self.model.direct_gamma[self.model.aa_map_awsem_x, self.model.aa_map_awsem_y][self.reindex_dca][:,self.reindex_dca] + water_gamma = self.model.water_gamma[self.model.aa_map_awsem_x, self.model.aa_map_awsem_y][self.reindex_dca][:,self.reindex_dca] + protein_gamma = self.model.protein_gamma[self.model.aa_map_awsem_x, self.model.aa_map_awsem_y][self.reindex_dca][:,self.reindex_dca] k_contact = self.model.k_contact k_electrostatics = self.model.k_electrostatics - mask = self.mask sequence_mask_contact = self.model.sequence_mask_contact + assert np.all(mask==mask.T), "Mask should be symmetric" + assert mask.shape == (N, N), f"Mask shape {mask.shape} does not match expected shape {(N, N)}" - n_decoys=4000 + n_decoys=9000 + + #foo = np.triu(self.model.distance_matrix) + #sigma_water = sigma_water[(foo0)] + #sigma_protein = sigma_protein[(foo0)] def compute_energy(seq_index): - # adapted from AWSEM.compute_configurational_decoy_statistics, - # modified to be numba-friendly - + # constructing the distribution by sampling, then computing the average decoy_energies=np.zeros(n_decoys) for i in range(n_decoys): c=np.random.randint(0,len_distances) From 7454c0941402de37a8a6d9c44cf8c727230319f1 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 16 Sep 2025 21:25:41 -0500 Subject: [PATCH 33/76] brough over Structure.py from amyloid_atlas branch --- frustratometer/classes/Structure.py | 43 
+++++++++++++++++++---------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/frustratometer/classes/Structure.py b/frustratometer/classes/Structure.py index cf71c2fe..1021e4ea 100644 --- a/frustratometer/classes/Structure.py +++ b/frustratometer/classes/Structure.py @@ -14,7 +14,7 @@ class Structure: def __init__(self, pdb_file: Union[Path,str], chain: Union[str,None]=None, seq_selection: str = None, aligned_sequence: str = None, filtered_aligned_sequence: str = None, - distance_matrix_method:str = 'CB', pdb_directory: Path = Path.cwd(), repair_pdb:bool = True)->object: + distance_matrix_method:str = 'CB', pdb_directory: Path = Path.cwd(), repair_pdb:bool = True, return_distance_midpoints:bool = False)->object: """ Generates structure object. Both PDB and CIF format files are accepted as input. @@ -55,6 +55,11 @@ def __init__(self, pdb_file: Union[Path,str], chain: Union[str,None]=None, seq_s If True, provided pdb file will be repaired with missing residues inserted and heteroatoms removed. Note that a pdb file will be produced, regardless of input file format. + return_distance_midpoints: bool + Whether to return a matrix of the same shape as distance_matrix representing the same contacts as distance_matrix + that indicates the absolute coordinates of the midpoint between the pair of atoms. This helps us compute the pair distribution + functions of the different classes of contacts. 
So this matrix isn't really a matrix because each "element" has 3 channels: x, y, and z + Returns ------- Structure object @@ -77,7 +82,7 @@ def __init__(self, pdb_file: Union[Path,str], chain: Union[str,None]=None, seq_s self.pdbID=pdb_file.stem self.pdb_file=pdb_file - self.chain=chain + self.chain=chain # will be None if no chain supplied self.distance_matrix_method=distance_matrix_method self.filtered_aligned_sequence=filtered_aligned_sequence self.aligned_sequence=aligned_sequence @@ -88,18 +93,18 @@ def __init__(self, pdb_file: Union[Path,str], chain: Union[str,None]=None, seq_s self.init_index_shift=0 if repair_pdb: - fixer=pdb.repair_pdb(pdb_file, chain, pdb_directory) + fixer=pdb.repair_pdb(pdb_file, chain, pdb_directory) # for this function, chain can be str or list (or None) self.pdb_file=str(pdb_directory/f"{self.pdbID}_cleaned.pdb") if ".pdb" in str(pdb_file) or repair_pdb==True: - self.structure = prody.parsePDB(str(self.pdb_file), chain=self.chain).select(f"protein") + self.structure = prody.parsePDB(str(self.pdb_file), chain=self.chain).select(f"protein") # for this function, chain should be a string containing the chain ids like "AB" or "A B" else: - self.structure=prody.parseMMCIF(str(self.pdb_file),chain=self.chain).select(f"protein") + self.structure=prody.parseMMCIF(str(self.pdb_file),chain=self.chain).select(f"protein") # for this function, chain should be a string containing the chain ids like "AB" or "A B" else: assert len(self.seq_selection.replace("to"," to ").replace(":"," : ").split())>=4, "Please correctly input your residue selection" if self.chain==None: - raise ValueError("Please provide a chain name") + raise ValueError("self.chain==None. 
Please provide chain name(s)") self.init_index=int(self.seq_selection.replace("to"," to ").replace(":"," : ").split()[1].replace("`","")) self.fin_index=int(self.seq_selection.replace("to"," to ").replace(":"," : ").split()[3].replace("`","")) @@ -116,7 +121,7 @@ def __init__(self, pdb_file: Union[Path,str], chain: Union[str,None]=None, seq_s with open(pdb_file,"r") as f: for line in f: - if line.split()[0]=="ATOM" and line.split()[4+shift]==self.chain: + if line.split()[0]=="ATOM" and (line.split()[4+shift] in self.chain): try: res_index=''.join(i for i in line.split()[5+index_shift] if i.isdigit()) next_res_index=''.join(i for i in next(f).split()[5+index_shift] if i.isdigit()) @@ -133,27 +138,35 @@ def __init__(self, pdb_file: Union[Path,str], chain: Union[str,None]=None, seq_s self.init_index_shift=self.init_index-self.pdb_init_index self.fin_index_shift=self.fin_index-self.pdb_init_index+1 if repair_pdb: - fixer=pdb.repair_pdb(pdb_file, chain, pdb_directory) + fixer=pdb.repair_pdb(pdb_file, chain, pdb_directory) # for this function, chain can be str or list (or None) self.pdb_file=f"{pdb_directory}/{self.pdbID}_cleaned.pdb" self.select_gap_indices=[i for i in gap_indices if self.init_index<=i<=self.fin_index] self.fin_index_shift-=len(self.select_gap_indices) - self.seq_selection=f"resnum `{self.init_index_shift+1}to{self.fin_index_shift}`" + #self.seq_selection=f"resnum `{self.init_index_shift+1}to{self.fin_index_shift}`" # WE'RE KEEPING IDs NOW SO DON'T WANT TO DO THIS!!!! 
elif "resindex" in self.seq_selection: self.init_index_shift=self.init_index self.fin_index_shift=self.fin_index+1 if repair_pdb: - fixer=pdb.repair_pdb(pdb_file, chain, pdb_directory) + fixer=pdb.repair_pdb(pdb_file, chain, pdb_directory) # for this function, chain can be str or list (or None) self.pdb_file=f"{pdb_directory}/{self.pdbID}_cleaned.pdb" - self.chain="A" + # self.chain="A" I don't know why we would want to change the chain ID here if ".pdb" in str(pdb_file) or repair_pdb==True: self.structure = prody.parsePDB(str(self.pdb_file), chain=self.chain).select(f"protein and {self.seq_selection}") else: self.structure=prody.parseMMCIF(str(self.pdb_file),chain=self.chain).select(f"protein and {self.seq_selection}") - self.sequence=pdb.get_sequence(self.pdb_file,self.chain) - self.distance_matrix=pdb.get_distance_matrix(pdb_file=self.pdb_file,chain=self.chain, - method=self.distance_matrix_method) + self.sequence, self.start_mask = pdb.get_sequence(self.pdb_file,self.chain,return_start_mask=True) # this function can now accept chain as list or string + if return_distance_midpoints: + self.distance_matrix, self.midpoint_matrix = pdb.get_distance_matrix(pdb_file=self.pdb_file,chain=self.chain, # for this function, chain should be a string containing the chain ids + method=self.distance_matrix_method, # separated by a space, like "A B" + return_distance_midpoints=True) + else: + self.distance_matrix=pdb.get_distance_matrix(pdb_file=self.pdb_file,chain=self.chain, # for this function, chain should be a string containing the chain ids + method=self.distance_matrix_method, # separated by a space, like "A B" + return_distance_midpoints=False) + self.midpoint_matrix = None + self.full_pdb_distance_matrix=self.distance_matrix self.z_coordinates=self.structure.select('((name CB) or (resname GLY and name CA))').getCoords() @@ -180,7 +193,7 @@ def __init__(self, pdb_file: Union[Path,str], chain: Union[str,None]=None, seq_s else: 
self.full_to_aligned_index_dict=dict(zip(range(self.init_index_shift,self.fin_index_shift+1), range(len(self.sequence)))) self.mapped_distance_matrix=self.distance_matrix - + @classmethod def full_pdb(cls,pdb_file: Union[Path,str], chain: Union[str,None]=None, aligned_sequence: str = None, filtered_aligned_sequence: str = None, distance_matrix_method:str = 'CB', pdb_directory: Path = Path.cwd(), repair_pdb:bool = True): From e92c350d53ba790f37335fa8015d82dae47b0b04 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 16 Sep 2025 21:26:19 -0500 Subject: [PATCH 34/76] made biopython residue name list more inclusive --- frustratometer/pdb/pdb.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/frustratometer/pdb/pdb.py b/frustratometer/pdb/pdb.py index 2feeb658..b5d5f0bb 100644 --- a/frustratometer/pdb/pdb.py +++ b/frustratometer/pdb/pdb.py @@ -93,7 +93,10 @@ def get_sequence(pdb_file: str, #atom_names = [atom.getName() for atom in residue] # PRODY #is_regular_res = ("CA" in atom_names and "O" in atom_names) # PRODY res_id = residue.get_id()[0] #BIOPYTHON - if (res_id==' ' or res_id=='H_MSE' or res_id=='H_M3L' or res_id=='H_CAS') and is_regular_res: # BIOPYTHON + okay_resids = [' ', 'H_MSE', 'H_M3L', 'H_CAS', 'H_ALA', 'H_CYS', 'H_ASP', + 'H_GLU', 'H_PHE', 'H_GLY', 'H_HIS', 'H_ILE', 'H_LYS', 'H_LEU', 'H_MET', + 'H_ASN', 'H_PRO', 'H_GLN', 'H_ARG', 'H_SER', 'H_THR', 'H_VAL', 'H_TRP', 'H_TYR'] + if res_id in okay_resids and is_regular_res: # BIOPYTHON # i don't know what H_HSE, H_M3L, and H_CAS are doing # because they aren't in three_to_one, so those should throw an error # long story short, I don't think we have to worry about them when switching from biopython to prody From 60c55cac9ceba8ecaf57fa22ae594178e8b5bbe5 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 16 Sep 2025 21:28:43 -0500 Subject: [PATCH 35/76] got fix.py from amyloid_atlas branch, which improves multi chain handling --- frustratometer/pdb/fix.py | 18 ++++++++++++++++-- 1 
file changed, 16 insertions(+), 2 deletions(-) diff --git a/frustratometer/pdb/fix.py b/frustratometer/pdb/fix.py index 142656cb..e8c0185f 100644 --- a/frustratometer/pdb/fix.py +++ b/frustratometer/pdb/fix.py @@ -13,7 +13,7 @@ def repair_pdb(pdb_file: str, chain: str, pdb_directory: Path= Path.cwd()) -> PD pdb_file: str, PDB file location. chain: str, - Chain ID + Chain ID -- can be formatted as str or list (or None) pdb_directory: str, PDB file location @@ -51,5 +51,19 @@ def repair_pdb(pdb_file: str, chain: str, pdb_directory: Path= Path.cwd()) -> PD print("Unable to add missing atoms") fixer.addMissingHydrogens(7.0) - PDBFile.writeFile(fixer.topology, fixer.positions, open(f"{pdb_directory}/{pdbID}_cleaned.pdb", 'w')) + + # renumber residues so that each chain starts at 1 + new_top = type(fixer.topology)() # an openmm.app.Topology accesed without needing to import it separately + for old_chain in fixer.topology.chains(): + new_chain = new_top.addChain(id=old_chain.id) + for old_residue in old_chain.residues(): + new_residue = new_top.addResidue(old_residue.name,new_chain,id=None) # allow the class to choose residue id + for old_atom in old_residue.atoms(): + new_atom = new_top.addAtom(old_atom.name,old_atom.element,new_residue,id=old_atom.id) + + # use keepIds=True when writing to preserve chain IDs + # changing it to False causes test_multichain_density to fail for structure_file1-density_file1 + # because chains a, b, c, ... get renamed to A, B, C, ... and end up getting confused + # with the real chains A, B, C, ... 
+ PDBFile.writeFile(new_top, fixer.positions, open(f"{pdb_directory}/{pdbID}_cleaned.pdb", 'w'),keepIds=True) return fixer \ No newline at end of file From 535d370ae2293cd2efdcae60e493855ccb8b35f2 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 21 Oct 2025 14:14:00 -0500 Subject: [PATCH 36/76] added option to not call vmd after writing tcl script --- frustratometer/classes/Frustratometer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/frustratometer/classes/Frustratometer.py b/frustratometer/classes/Frustratometer.py index d8cceec1..36a79542 100644 --- a/frustratometer/classes/Frustratometer.py +++ b/frustratometer/classes/Frustratometer.py @@ -292,6 +292,7 @@ def auc(self): return frustration.compute_auc(self.roc()) def vmd(self, sequence: str = None, single:Union[str,np.array] = 'singleresidue', pair:Union[str,np.array] = 'mutational', + tcl_script:str = 'frustration.tcl', call_vmd:bool=True, aa_freq:np.array = None, correction:int = 0, max_connections:Union[int,None] = None, movie_name=None, still_image_name=None): """ Calculates frustration indices and superimposes frustration patterns onto PDB structure using the VMD software. @@ -317,12 +318,14 @@ def vmd(self, sequence: str = None, single:Union[str,np.array] = 'singleresidue' from the sequence that was passed to this vmd function. 
Proceeding further may not\n\ perform the computation that you intend to perform.") - + #breakpoint() tcl_script = frustration.write_tcl_script(self.pdb_file, self.chain, self.mask, self.distance_matrix, self.distance_cutoff, -self.frustration(kind=single, sequence=sequence, aa_freq=aa_freq), -self.frustration(kind=pair, sequence=sequence, aa_freq=aa_freq), - max_connections=max_connections, movie_name=movie_name, still_image_name=still_image_name) - frustration.call_vmd(self.pdb_file, tcl_script) + max_connections=max_connections, movie_name=movie_name, still_image_name=still_image_name, + tcl_script=tcl_script,) + if call_vmd: + frustration.call_vmd(self.pdb_file, tcl_script) def view_pair_frustration(self, sequence:str = None, pair:str = 'mutational', aa_freq:np.array = None): """ From 35bc7dc227107bf4dc83bac340c7328d60b9b9d2 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 21 Oct 2025 14:25:09 -0500 Subject: [PATCH 37/76] multi chain support --- frustratometer/frustration/frustration.py | 36 ++++++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/frustratometer/frustration/frustration.py b/frustratometer/frustration/frustration.py index 6f2b9baa..c5345be5 100644 --- a/frustratometer/frustration/frustration.py +++ b/frustratometer/frustration/frustration.py @@ -890,20 +890,28 @@ def write_tcl_script(pdb_file: Union[Path,str], chain: str, mask: np.array, dist tcl_script : Path or str tcl script file """ + fo = open(tcl_script, 'w+') single_frustration = np.nan_to_num(single_frustration,nan=0,posinf=0,neginf=0) pair_frustration = np.nan_to_num(pair_frustration,nan=0,posinf=0,neginf=0) structure = prody.parsePDB(str(pdb_file)) - selection = structure.select('protein', chain=chain) + if chain is not None: + selection = structure.select('protein', chain=chain) + else: + selection = structure.select('protein', chain='_') # select all chains residues = np.unique(selection.getResnums()) fo.write(f'[atomselect top all] set beta 0\n') # 
Single residue frustration for r, f in zip(residues, single_frustration): # print(f) - fo.write(f'[atomselect top "chain {chain} and residue {int(r)}"] set beta {f}\n') + if chain is not None: + fo.write(f'[atomselect top "chain {chain} and residue {int(r)}"] set beta {f}\n') + else: + fo.write(f'[atomselect top "residue {int(r)}"] set beta {f}\n') # 'residue' corresponds to unique residue id in vmd, + # so this is okay if there are multiple chains # Mutational frustration: r1, r2 = np.meshgrid(residues, residues, indexing='ij') @@ -929,13 +937,21 @@ def write_tcl_script(pdb_file: Union[Path,str], chain: str, mask: np.array, dist r2=int(r2) if abs(r1-r2) == 1: # don't draw interactions between residues adjacent in sequence continue - pos1 = selection.select(f'resid {r1} and chain {chain} and (name CB or (resname GLY and name CA))').getCoords()[0] - pos2 = selection.select(f'resid {r2} and chain {chain} and (name CB or (resname GLY and name CA))').getCoords()[0] + if chain is not None: + pos1 = selection.select(f'resid {r1} and chain {chain} and (name CB or (resname GLY and name CA))').getCoords()[0] + pos2 = selection.select(f'resid {r2} and chain {chain} and (name CB or (resname GLY and name CA))').getCoords()[0] + else: + pos1 = selection.select(f'resid {r1} and (name CB or (resname GLY and name CA))').getCoords()[0] + pos2 = selection.select(f'resid {r2} and (name CB or (resname GLY and name CA))').getCoords()[0] distance = np.linalg.norm(pos1 - pos2) if d > 9.5 or d < 3.5: continue - fo.write(f'lassign [[atomselect top "resid {r1} and name CA and chain {chain}"] get {{x y z}}] pos1\n') - fo.write(f'lassign [[atomselect top "resid {r2} and name CA and chain {chain}"] get {{x y z}}] pos2\n') + if chain is not None: + fo.write(f'lassign [[atomselect top "resid {r1} and name CA and chain {chain}"] get {{x y z}}] pos1\n') + fo.write(f'lassign [[atomselect top "resid {r2} and name CA and chain {chain}"] get {{x y z}}] pos2\n') + else: + fo.write(f'lassign 
[[atomselect top "resid {r1} and name CA"] get {{x y z}}] pos1\n') + fo.write(f'lassign [[atomselect top "resid {r2} and name CA"] get {{x y z}}] pos2\n') if 3.5 <= distance <= 6.5: fo.write(f'draw line $pos1 $pos2 style solid width 2\n') else: @@ -953,8 +969,12 @@ def write_tcl_script(pdb_file: Union[Path,str], chain: str, mask: np.array, dist r2=int(r2) if d > 9.5 or d < 3.5: continue - fo.write(f'lassign [[atomselect top "resid {r1} and name CA and chain {chain}"] get {{x y z}}] pos1\n') - fo.write(f'lassign [[atomselect top "resid {r2} and name CA and chain {chain}"] get {{x y z}}] pos2\n') + if chain is not None: + fo.write(f'lassign [[atomselect top "resid {r1} and name CA and chain {chain}"] get {{x y z}}] pos1\n') + fo.write(f'lassign [[atomselect top "resid {r2} and name CA and chain {chain}"] get {{x y z}}] pos2\n') + else: + fo.write(f'lassign [[atomselect top "resid {r1} and name CA"] get {{x y z}}] pos1\n') + fo.write(f'lassign [[atomselect top "resid {r2} and name CA"] get {{x y z}}] pos2\n') if 3.5 <= d <= 6.5: fo.write(f'draw line $pos1 $pos2 style solid width 2\n') else: From f0d2b47e1faf6bee974a618986663af812cefe59 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 21 Oct 2025 14:25:57 -0500 Subject: [PATCH 38/76] Added comments and edited for testing --- frustratometer/optimization/inner_product.py | 79 ++++++++++++++------ 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/frustratometer/optimization/inner_product.py b/frustratometer/optimization/inner_product.py index f8421890..17934177 100644 --- a/frustratometer/optimization/inner_product.py +++ b/frustratometer/optimization/inner_product.py @@ -156,19 +156,31 @@ def compute_region_means_1_by_2(indicator_0, indicator_1): @jit(types.Array(types.float64, 1, 'C')(types.Array(types.float64, 1, 'A', readonly=True), types.Array(types.float64, 1, 'A', readonly=True)), nopython=True, cache=True) def compute_region_means_1_by_1(indicator_0, indicator_1): + # indicator_0: an element of 
an indicator1D + # indicator_1: also an element of an indicator1D (may be the same or different) + # in other words, these are 1D numpy arrays with axis length equal to the + # number of residues in the protein + # + # the calculation of region_mean n = indicator_0.shape[0] region_sum = np.zeros(2, dtype=np.float64) region_count = np.zeros(2, dtype=np.int64) - (ij, ii) = range(2) + (ij, ii) = range(2) # so ij=0, ii=1 + # ii: correlation between burial indicators (varying the density well) + # for a single residue + # ij: correlation between burial indicators (varying the density well) + # for a pair of residues for i in range(n): region_sum[ii] += indicator_0[i] * indicator_1[i] region_count[ii]=n region_mean = np.zeros(2, dtype=np.float64) if region_count[ii] > 0: - region_mean[ii] = region_sum[ii] / region_count[ii] + region_mean[ii] = region_sum[ii] / region_count[ii] # inner product of indicators / number of residues if n>1: + # it looks like region_sum[0] is always zero, so region_sum.sum()==region_sum[1]==region_sum[ii] region_mean[ij]=indicator_0.mean()*indicator_1.mean()*(n/(n - 1))-region_sum.sum()/(n*(n-1)) - return region_mean + return region_mean # (product of means - normalized dot product of indicators, + # normalized dot product of indicators) @jit(types.Array(types.float64, 2, 'C')(types.Array(types.int64, 1, 'A', readonly=True), types.Array(types.float64, 1, 'A', readonly=True)),nopython=True, cache=True) def mean_inner_product_2_by_2(repetitions,region_mean): @@ -295,10 +307,15 @@ def mean_inner_product_1_by_2(repetitions,region_mean): @jit(types.Array(types.float64, 2, 'C')(types.Array(types.int64, 1, 'A', readonly=True), types.Array(types.float64, 1, 'A', readonly=True)),nopython=True, cache=True) def mean_inner_product_1_by_1(repetitions,region_mean): - ij, ii = range(2) + # repetitions: number of amino acids of each type (so probably shape (20,)) + # this is a parameter of build_mean_inner_product_matrix + # and is passed through without 
modification + # region_mean: see return value of compute_region_means functions + + ij, ii = range(2) # so ij=0, ii=1 n=repetitions - n_elements= len(repetitions) + n_elements= len(repetitions) # number of amino acid types mean_inner_product = np.zeros(n_elements**2) @@ -308,33 +325,40 @@ def mean_inner_product_1_by_1(repetitions,region_mean): if i==j: #ii mean_inner_product[id]=n[i]*region_mean[ii]+n[i]*(n[i]-1)*region_mean[ij] else: #ij + # multiply count of each amino acid type by mean_inner_product[id]=n[i]*n[j]*region_mean[ij] + # this return value has to be the outer product of the indicator function vector + # (weighted by number of contacts of each type), averaged element-wise return mean_inner_product.reshape(n_elements, n_elements) -@jit(types.Array(types.float64, 2, 'C')( - types.Array(types.int64, 1, 'A', readonly=True), - types.Array(types.float64, 2, 'A', readonly=True), - types.Array(types.float64, 3, 'A', readonly=True), - types.Array(types.float64, 3, 'A', readonly=True)), - nopython=True, parallel=False, cache=True) +#@jit(types.Array(types.float64, 2, 'C')( +# types.Array(types.int64, 1, 'A', readonly=True), +# types.Array(types.float64, 2, 'A', readonly=True), +# types.Array(types.float64, 3, 'A', readonly=True), +# types.Array(types.float64, 3, 'A', readonly=True)), +# nopython=True, parallel=False, cache=True) def build_mean_inner_product_matrix(repetitions, indicators1d, indicators2d, region_means): + # repetitions: number of amino acids of each type (so probably shape (20,)) + # indicators1D: list of 3 elements (low density, medium density, high density) + # indicators2D: list of 3 or 4 elements (dir, prot, wat, possibly elec) + num_matrices1d = len(indicators1d) num_matrices2d = len(indicators2d) n_elements = len(repetitions) - num_matrices = num_matrices1d + num_matrices2d + num_matrices = num_matrices1d + num_matrices2d # probably equal to 6 or 7 # Compute the size of each block and the total size - block_sizes = np.empty(num_matrices, 
dtype=np.int64) + block_sizes = np.empty(num_matrices, dtype=np.int64) # creates an array without setting elements block_sizes[:num_matrices1d] = n_elements block_sizes[num_matrices1d:] = n_elements**2 + # at this point, block_sizes looks something like [20,20,20,400,400,400] total_size = np.sum(block_sizes) - # Create the resulting matrix filled with zeros + # Create the resulting matrix (which is returned by this function) filled with zeros R = np.zeros((total_size, total_size)) # Compute the starting indices for each matrix - #start_indices = np.cumsum([0] + block_sizes[:-1]) start_indices=np.zeros(len(block_sizes),dtype=np.int64) start=0 for i in range(1,len(block_sizes)): @@ -365,21 +389,30 @@ def build_mean_inner_product_matrix(repetitions, indicators1d, indicators2d, reg if i != j: R[sj:ej, si:ei] = R[si:ei, sj:ej].T + # if we have i==j, then the transposed region is the original region and there's nothing to fill in - return R + return R # The average (over sequence shuffles) of the outer product + # of the vector formed from the set of all indicator types. 
+ # The shuffling average was performed by multiplying by the + # proportion of amino acids in the sequence by each indicator type -@jit(types.Array(types.float64, 3, 'C')( - types.Array(types.float64, 2, 'A', readonly=True), - types.Array(types.float64, 3, 'A', readonly=True)), - nopython=True, cache=True) +#@jit(types.Array(types.float64, 3, 'C')( +# types.Array(types.float64, 2, 'A', readonly=True), +# types.Array(types.float64, 3, 'A', readonly=True)), +# nopython=True, cache=True) def compute_all_region_means(indicators1d, indicators2d): - num_matrices1d = len(indicators1d) - num_matrices2d = len(indicators2d) - num_matrices = num_matrices1d + num_matrices2d + num_matrices1d = len(indicators1d) # 3 (low density, med density, high density) + num_matrices2d = len(indicators2d) # 3 or 4 (dir, prot, wat, possibly elec) + num_matrices = num_matrices1d + num_matrices2d # Create the resulting matrix filled with zeros R = np.zeros((num_matrices,num_matrices,15),dtype=np.float64) + # 15 deep because we need to unpack 15 return values in the cases + # where i and j correspond to 2d matrices; in the cases that + # i and j represent 1 or 0 2d matrices, we won't need to unpack + # as many return values, so we'll just fill in the first few elements + # of the third axis and the rest will remain as 0 for ij in prange(num_matrices**2): i=ij//num_matrices From 493eb8f32d6fc2020ef3f99372f1ebbb584ce511 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 21 Oct 2025 14:27:06 -0500 Subject: [PATCH 39/76] added comments and edits for testing --- frustratometer/optimization/optimization.py | 43 ++++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 6d60d08b..6a551923 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -706,8 +706,8 @@ def initialize_functions(self): distances = 
np.triu(self.model.distance_matrix) ########################################################################################### - #distances = distances[(distances0)] # USE THIS NORMALLY - distances = distances[distances>0] # USE THIS FOR TESTING THING WHERE WE NEED ALL PAIRS + distances = distances[(distances0)] # USE THIS NORMALLY + #distances = distances[distances>0] # USE THIS FOR TESTING THING WHERE WE NEED ALL PAIRS ########################################################################################### len_distances = len(distances) @@ -743,30 +743,30 @@ def initialize_functions(self): n_decoys=9000 - #foo = np.triu(self.model.distance_matrix) - #sigma_water = sigma_water[(foo0)] - #sigma_protein = sigma_protein[(foo0)] + # these lines used for the random sampling and analytic calculation + foo = np.triu(self.model.distance_matrix) + sigma_water = sigma_water[(foo0)] + sigma_protein = sigma_protein[(foo0)] def compute_energy(seq_index): # adapted from AWSEM.compute_configurational_decoy_statistics, # modified to be numba-friendly # analytic calculation - """ aa_freq = np.array([(seq_index == i).sum() for i in range(len_alphabet)]) # frequency of each amino acid in the sequence - temp = burial_gamma * aa_freq[:, np.newaxis] + temp = burial_gamma * aa_freq[:, np.newaxis] # (20,3) * (20,1) -> (20,3) scaled_burial_gamma = np.zeros((3,)) # burial gamma is a 3D vector, one for each burial indicator (low, med, high) for counter in range(temp.shape[0]): scaled_burial_gamma += temp[counter] - scaled_burial_gamma /= temp.shape[0] + scaled_burial_gamma /= temp.shape[0] # average burial gammas, weighted by amino acid frequencies #burial_energy = np.average(-1 * k_contact * scaled_burial_gamma * burial_indicator) avg_burial_indicator = np.zeros((3,)) for counter in range(burial_indicator.shape[0]): avg_burial_indicator += burial_indicator[counter] - avg_burial_indicator /= burial_indicator.shape[0] + avg_burial_indicator /= burial_indicator.shape[0] # average burial 
indicator burial_energy = -1* k_contact * avg_burial_indicator * scaled_burial_gamma - burial_energy = (burial_energy[0] + burial_energy[1] + burial_energy[2])/(N-1) # sum over the three burial indicators + burial_energy = (burial_energy[0] + burial_energy[1] + burial_energy[2])/(N-1)#*N #/(N-1) # sum over the three burial indicators #breakpoint() #assert type(burial_energy) == float # direct, water-mediated, and protein-mediated contact energies @@ -776,12 +776,14 @@ def compute_energy(seq_index): #assert type(water_mediated) == float protein_mediated = np.average(thetaII*sigma_protein) * np.average(protein_gamma * np.outer(aa_freq, aa_freq)) #assert type(protein_mediated) == float - contact_energy = -k_contact * (direct + water_mediated + protein_mediated) + #contact_energy = -k_contact * (direct*len(theta) + (water_mediated+protein_mediated)*len(thetaII)) # multiply by number of contacts + contact_energy = -k_contact * (direct + (water_mediated+protein_mediated)) # multiply by number of contacts + #electrostatics_energy = k_electrostatics * np.average(electrostatics_indicator) * np.average(np.outer(aa_freq, aa_freq)*charges[:, np.newaxis]*charges[np.newaxis, :]) * len(electrostatics_indicator) # multiply by number of contacts electrostatics_energy = k_electrostatics * np.average(electrostatics_indicator) * np.average(np.outer(aa_freq, aa_freq)*charges[:, np.newaxis]*charges[np.newaxis, :]) #assert type(electrostatics_energy) == float mean_decoy_energy = burial_energy + contact_energy + electrostatics_energy #import pdb; pdb.set_trace() - """ + """# constructing the distribution by sampling, then computing the average decoy_energies=np.zeros(n_decoys) for i in range(n_decoys): @@ -807,7 +809,7 @@ def compute_energy(seq_index): mean_decoy_energy = np.mean(decoy_energies) #std_decoy_energy = np.std(decoy_energies) """ - # checking that these energy functions are able to compute the total energy of the sequence + """# checking that these energy functions are able to 
compute the total energy of the sequence assert len(distances) == (N**2-N)/2, f"len(distances): {len(distances)} != (N**2-N)/2: {(N**2-N)/2}" decoy_energies = np.zeros((len(seq_index)**2 - len(seq_index))//2) index = 0 @@ -829,7 +831,7 @@ def compute_energy(seq_index): decoy_energies[index] = contact_energy+burial_energy+electrostatics_energy#(contact_energy+electrostatics_energy)#(burial_energy + contact_energy + electrostatics_energy) index += 1 mean_decoy_energy = np.sum(decoy_energies) # for testing, we return the total energy, not the average - #""" + """ #import pdb; pdb.set_trace() return mean_decoy_energy#, std_decoy_energy @@ -956,7 +958,7 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self.model=model self.alphabet=alphabet self.reindex_dca=[_AA.index(aa) for aa in alphabet] - + assert "indicators" in model.__dict__.keys(), "Indicator functions were not exposed. Initialize AWSEM function with `expose_indicator_functions=True` first." self.indicators = model.indicators self.alphabet_size=len(alphabet) @@ -1766,6 +1768,17 @@ def find_optimal_replicas(self, max_replicas=32, n_repeats=5, n_steps=10000): if __name__ == '__main__': + reduced_alphabet = 'ADEFGHIKLMNQRSTVWY' + pdb = "tests/data/1r69.pdb" + s = Structure(pdb, chain=None) + model = AWSEM(s, expose_indicator_functions=True, + distance_cutoff_contact=10, min_sequence_separation_contact=2,) + variance = AwsemEnergyVariance(model, alphabet=reduced_alphabet) + monte_carlo = MonteCarlo(sequence="SISSRVKSKRIQLGLNQAELAQKVGTTQQSIEQLENGKTKRPRFLPELASALGVSVDWLLNGT", + energy=variance, alphabet=reduced_alphabet) + monte_carlo.annealing(n_steps=10) + exit() + pdb_list = ["tests/data/1r69.pdb","tests/data/1r69.pdb","tests/data/1r69.pdb"] pdb_structures = (Structure(pdb, chain=None) for pdb in pdb_list) ensemble = DecoyEnsemble(pdb_structures, distance_cutoff_contact=10, min_sequence_separation_contact=10) From 0038075498aeebfb0778b0093eeed36c02d30fcf Mon Sep 17 00:00:00 2001 From: 
Finley Clark Date: Sun, 26 Oct 2025 20:59:12 -0500 Subject: [PATCH 40/76] improved comments --- frustratometer/optimization/inner_product.py | 49 ++++++++++++++++---- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/frustratometer/optimization/inner_product.py b/frustratometer/optimization/inner_product.py index 17934177..a619bd23 100644 --- a/frustratometer/optimization/inner_product.py +++ b/frustratometer/optimization/inner_product.py @@ -156,8 +156,11 @@ def compute_region_means_1_by_2(indicator_0, indicator_1): @jit(types.Array(types.float64, 1, 'C')(types.Array(types.float64, 1, 'A', readonly=True), types.Array(types.float64, 1, 'A', readonly=True)), nopython=True, cache=True) def compute_region_means_1_by_1(indicator_0, indicator_1): - # indicator_0: an element of an indicator1D - # indicator_1: also an element of an indicator1D (may be the same or different) + # indicator_0: an element of an indicator1D list + # (so either low, med, or high density burial, + # with length equal to the number of residues in the protein) + # indicator_1: also an element of an indicator1D list + # (may be identical to indicator_0) # in other words, these are 1D numpy arrays with axis length equal to the # number of residues in the protein # @@ -166,9 +169,9 @@ def compute_region_means_1_by_1(indicator_0, indicator_1): region_sum = np.zeros(2, dtype=np.float64) region_count = np.zeros(2, dtype=np.int64) (ij, ii) = range(2) # so ij=0, ii=1 - # ii: correlation between burial indicators (varying the density well) + # ii: covariance between burial indicators (varying the density well) # for a single residue - # ij: correlation between burial indicators (varying the density well) + # ij: covariance between burial indicators (varying the density well) # for a pair of residues for i in range(n): region_sum[ii] += indicator_0[i] * indicator_1[i] @@ -307,10 +310,37 @@ def mean_inner_product_1_by_2(repetitions,region_mean): @jit(types.Array(types.float64, 2, 
'C')(types.Array(types.int64, 1, 'A', readonly=True), types.Array(types.float64, 1, 'A', readonly=True)),nopython=True, cache=True) def mean_inner_product_1_by_1(repetitions,region_mean): + # This function computes a 20x20 block of the matrix , + # which represents the covariances between different classes + # of burial indicator functions (the 1-body terms). + # The 20x20 block may or may not be centered on the main diagonal of the full matrix. + # In the case that the 20x20 block is centered on the main diagonal, + # it represents the variances of each amino acid types within + # a burial indicator class (low, medium, or high), + # AND the covariances between each combination of amino acid types + # within this same indicator class + # In the case that the 20x20 block is not centered on the main diagonal, + # it represents the covariances of each amino acid type across + # two burial indicator classes (low-med, low-high, or med-high), + # AND the covariances between all combinations of amino acid types + # between those two indicator classes. # repetitions: number of amino acids of each type (so probably shape (20,)) - # this is a parameter of build_mean_inner_product_matrix - # and is passed through without modification - # region_mean: see return value of compute_region_means functions + # This is a parameter of build_mean_inner_product_matrix, + # which is the only important function that calls this function. + # The repetitions argument is not modified by build_mean_inner_product_matrix + # before or after this function is called; in other words, it is passed straight through + # region_mean: list[] + # The build_mean_inner_product_matrix function has a parameter called region_means + # that is indexed during the call to this function. So it must be that region_mean + # is specific to this particular combination of burial indicator classes + # (low-low, low-med, low-high, med-med, med-high, or high-high). 
+ # Digging deeper, we find that the region_means passed to build_mean_inner_product_matrix + # comes from the output of compute_all_region_means, which repeatedly + # calls compute_region_means_1_by_1, compute_region_means_1_by_2, and + # compute_region_means_2_by_2 to populate a 2D array. So, to understand + # the region_mean parameter of this function, we should look at + # compute_region_means_1_by_1 for different burial indicator class arguments + # (low-low, low-med, low-high, med-med, med-high, or high-high) ij, ii = range(2) # so ij=0, ii=1 @@ -325,7 +355,7 @@ def mean_inner_product_1_by_1(repetitions,region_mean): if i==j: #ii mean_inner_product[id]=n[i]*region_mean[ii]+n[i]*(n[i]-1)*region_mean[ij] else: #ij - # multiply count of each amino acid type by + # for different amino acid types, we scale the average value by the number of each type mean_inner_product[id]=n[i]*n[j]*region_mean[ij] # this return value has to be the outer product of the indicator function vector @@ -402,6 +432,9 @@ def build_mean_inner_product_matrix(repetitions, indicators1d, indicators2d, reg # types.Array(types.float64, 3, 'A', readonly=True)), # nopython=True, cache=True) def compute_all_region_means(indicators1d, indicators2d): + """indicators1d: burial indicators, in the order of low, medium, high + indicators2d: contact indicators, in the order of direct, protein, water + Each array has axis length(s) equal to the number of residues in the protein""" num_matrices1d = len(indicators1d) # 3 (low density, med density, high density) num_matrices2d = len(indicators2d) # 3 or 4 (dir, prot, wat, possibly elec) num_matrices = num_matrices1d + num_matrices2d From 257537f41332e38edbb9a79b5ce673ccec6a3706 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 7 Nov 2025 13:25:22 -0600 Subject: [PATCH 41/76] merging fixed Gamma.py from carlos_main --- frustratometer/classes/Gamma.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git 
a/frustratometer/classes/Gamma.py b/frustratometer/classes/Gamma.py index 821bffea..6f9ab658 100644 --- a/frustratometer/classes/Gamma.py +++ b/frustratometer/classes/Gamma.py @@ -408,7 +408,8 @@ def plot_gamma(self, new_order=None): # Plot setup f, axes = plt.subplots(2, 2, figsize=(18, 16)) - titles = ['Burial Gammas', 'Direct Gammas', 'Water Gammas', 'Protein Gammas'] + f.subplots_adjust(hspace=50) # fix overlap between axis ticks of upper subplots and titles of lower subplots + titles = ['Burial Gammas', 'Direct Gammas', 'Protein Gammas', 'Water Gammas'] for i, (title, name) in enumerate(zip(titles, segments)): ax = axes[i // 2, i % 2] @@ -416,8 +417,12 @@ def plot_gamma(self, new_order=None): ax.set_title(title) ax.set_xticks(np.arange(len(self.alphabet)) + 0.5) ax.set_xticklabels(self.alphabet) - ax.set_yticks(np.arange(segments[name].shape[0] // 20) + 0.5) - ax.set_yticklabels(range(segments[name].shape[0] // 20)) + if i==0: # burial + ax.set_yticks([0.5,1.5,2.5]) + ax.set_yticklabels(['low','medium','high'], rotation=45, size=12) + else: # direct, prot, or wat + ax.set_yticks(np.arange(len(self.alphabet)) + 0.5) + ax.set_yticklabels(self.alphabet, rotation=0) plt.tight_layout() plt.show() @@ -648,4 +653,4 @@ class O(): self.gamma1 = Gamma(np.arange(0,1260,1)) self.gamma2 = Gamma(np.arange(0,1260,1)*5+10) - self.gamma3 = Gamma(np.arange(1260,0,-1)*2-4) \ No newline at end of file + self.gamma3 = Gamma(np.arange(1260,0,-1)*2-4) From 47123354522fe0c50e24a45e1f13bc75d8272f64 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 8 Nov 2025 14:59:07 -0600 Subject: [PATCH 42/76] added alt_sigma_wat option --- frustratometer/classes/AWSEM.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index fd21f6d6..294e91c7 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -187,7 +187,9 @@ def __init__(self, pdb_structure: object, sequence: str =None, 
expose_indicator_functions: bool=False, + alt_sigma_wat: bool=False, **parameters)->object: + self.alt_sigma_wat = alt_sigma_wat # assume the user wanted the sequence from the pdb structure if not given if not sequence: sequence = pdb_structure.sequence @@ -272,6 +274,8 @@ def calculate_indicators(self): rho1 = np.expand_dims(rho_r, 0) rho2 = np.expand_dims(rho_r, 1) sigma_water = 0.25 * (1 - np.tanh(self.p.eta_sigma * (rho1 - self.p.rho_0))) * (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0))) + if self.alt_sigma_wat: + sigma_water = -sigma_water + 0.5*( (1 - np.tanh(self.p.eta_sigma * (rho1 - self.p.rho_0))) + (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0)))) sigma_protein = 1 - sigma_water #Calculate theta and indicators theta = 0.25 * (1 + np.tanh(self.p.eta * (self.distance_matrix - self.p.r_min))) * (1 + np.tanh(self.p.eta * (self.p.r_max - self.distance_matrix))) From c7d286c86896e919af35e34bc044a31a1b72e9dd Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 8 Nov 2025 16:21:47 -0600 Subject: [PATCH 43/76] rearranged AWSEMBase and subclass method calls; fixed bug where masks were not being recalculated following AWSEM/Structure distance matrix modification --- frustratometer/classes/AWSEM.py | 108 +++++++++++++++++++++----------- 1 file changed, 72 insertions(+), 36 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 294e91c7..b2483fa8 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -74,7 +74,6 @@ def __init__(self, ------- AWSEM object """ - # set sequence based on argument self.N = len(sequence) self.sequence = sequence @@ -131,14 +130,74 @@ def __init__(self, self._decoy_fluctuation = {} # don't know what this does self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ - def setup_model(self): - # some methods that should be called to complete the initialization of subclass instances - # subclasses should 
(re)define these methods as needed - self.calculate_indicators() - self.calculate_energy_and_potts() + def subclass_setup_helper(self, potts): + """ + This method calls methods to calculate native indicator functions, + masks (based on the native distance matrix), and native energy, + then optionally sets up the potts model. + + This method is intended to be called as the last step of __init__ + in each subclass of AWSEMBase. The subclasses may differ in how + they load in the structural information (the part of __init__ + preceding the call to this method) and how they implement + the calculate_indicators and calculate_masks methods called + by subclass_setup_helper + Parameters + ---------- + potts: bool=True + Whether to build the potts model from the freshly evaluated indicators and masks. + For frustration calculations, this must always be done. However, if this class is + being used only to extract indicator functions from a structure, then building + the potts model may be a waste of RAM and time. 
+ """ + self.calculate_masks() # subclasses should (re)define this method as needed + self.calculate_indicators() # subclasses should (re)define this method as needed + if potts: + self.calculate_energy_and_potts() + else: + if 'potts_model' in dir(self) or 'burial_energy' in dir(self)\ + or 'contact_energy' in dir(self) or '_native_energy' in dir(self): + # if one has been defined, they should all have been defined + assert 'potts_model' in dir(self), dir(self) + assert 'burial_energy' in dir(self), dir(self) + assert 'contact_energy' in dir(self), dir(self) + assert '_native_energy' in dir(self), dir(self) + # potts model and energies will be inaccurate once indicators are modified; + # if we don't care about the potts model, then we should delete the old + # data so it can't be accidentally misused in the future + del self.potts_model + del self.burial_energy + del self.contact_energy + del self._native_energy + def calculate_indicators(self): - raise NotImplementedError("Subclasses must this method") + raise NotImplementedError("Subclasses must implement this method") + + def calculate_masks(self): + # calculate masks + if self.burial_in_context==True: + selected_matrix=self.full_pdb_distance_matrix + else: + selected_matrix=self.distance_matrix + self.sequence_mask_rho = frustration.compute_mask(selected_matrix, + maximum_contact_distance=None, + minimum_sequence_separation = self.p.min_sequence_separation_rho) + self.sequence_mask_contact = frustration.compute_mask(self.distance_matrix, + maximum_contact_distance=self.p.distance_cutoff_contact, + minimum_sequence_separation = self.p.min_sequence_separation_contact) + self.electrostatics_mask = frustration.compute_mask(self.distance_matrix, + maximum_contact_distance=None, + minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) + #with open('my_data.txt','w') as f: + # f.write(f"self.distance_cutoff: {self.distance_cutoff}\n") + # f.write(f"self.sequence_cutoff: {self.sequence_cutoff}\n") 
+ #np.save('my_distance_matrix.npy',self.distance_matrix) + self.mask = frustration.compute_mask(self.distance_matrix, + maximum_contact_distance=self.distance_cutoff, + minimum_sequence_separation = self.sequence_cutoff) + #np.save('my_mask_new.npy',self.mask) + self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function def calculate_energy_and_potts(self): @@ -197,30 +256,7 @@ def __init__(self, super().__init__(sequence, expose_indicator_functions, **parameters) # set up strucure self.setup_structure(pdb_structure) - # calculate masks - if self.burial_in_context==True: - selected_matrix=self.full_pdb_distance_matrix - else: - selected_matrix=self.distance_matrix - self.sequence_mask_rho = frustration.compute_mask(selected_matrix, - maximum_contact_distance=None, - minimum_sequence_separation = self.p.min_sequence_separation_rho) - self.sequence_mask_contact = frustration.compute_mask(self.distance_matrix, - maximum_contact_distance=self.p.distance_cutoff_contact, - minimum_sequence_separation = self.p.min_sequence_separation_contact) - self.electrostatics_mask = frustration.compute_mask(self.distance_matrix, - maximum_contact_distance=None, - minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) - with open('my_data.txt','w') as f: - f.write(f"self.distance_cutoff: {self.distance_cutoff}\n") - f.write(f"self.sequence_cutoff: {self.sequence_cutoff}\n") - #np.save('my_distance_matrix.npy',self.distance_matrix) - self.mask = frustration.compute_mask(self.distance_matrix, - maximum_contact_distance=self.distance_cutoff, - minimum_sequence_separation = self.sequence_cutoff) - #np.save('my_mask_new.npy',self.mask) - self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function - self.setup_model() + self.subclass_setup_helper(potts=True) def setup_structure(self, pdb_structure): # check structure @@ -229,7 +265,7 @@ def setup_structure(self, pdb_structure): N=len(resid) self.resid 
= resid self.N = N - # set structure-dependent proterties + # set structure-dependent properties self._pdb_structure = pdb_structure self.structure=pdb_structure.structure self.chain=pdb_structure.chain @@ -248,9 +284,9 @@ def pdb_structure(self,pdb_structure): self.setup_structure(pdb_structure) # check that our new structure is compatible with our old one if self.N != len(self.sequence): - import pdb; pdb.set_trace() + breakpoint() raise ValueError("The pdb is incomplete. Try setting 'repair_pdb=True' when constructing the Structure object.") - self.calculate_indicators() + self.subclass_setup_helper(potts=True) def change_conformation(alternative_pdb_structure): # this function is an alias for the pdb_structure setter self.pdb_structure = alternative_pdb_structure @@ -519,7 +555,7 @@ def __init__(self, #np.save('protein_indicator_1.npy', protein_indicator) #np.save('water_indicator_1.npy', water_indicator) #np.save('electrostatics_indicator_1.npy', electrostatics_indicator) - self.setup_model() + self.subclass_setup_helper(potts=True) def calculate_indicators(self): pass # the function was initialized with indicators, so there's nothing to do @@ -552,7 +588,7 @@ def __init__(self, super().__init__(sequence, expose_indicator_functions, **parameters) self.covariance_matrix = covariance_matrix self.num_indicators = 3*self.N + 4*(self.N**2-self.N)/2 # low, med, high burial for each N, 4 classes of pair interactions - self.setup_model() + self.subclass_setup_helper(potts=True) @staticmethod # trying to avoid loading down memory with too many permanent attributes def pairwise_mask(l): # l for length From 38e5d27427e6a8a2ebd5299b9d8ac1e1d6938c15 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 8 Nov 2025 17:12:47 -0600 Subject: [PATCH 44/76] updating handling of boolean used to determine whether potts model should be calculated --- frustratometer/classes/AWSEM.py | 41 +++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git 
a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index b2483fa8..4736f6e2 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -59,6 +59,7 @@ class AWSEMBase(Frustratometer): def __init__(self, sequence: str, expose_indicator_functions: bool=False, + potts: bool=True, **parameters)->object: """ Generate AWSEM object @@ -69,11 +70,15 @@ def __init__(self, The amino acid sequence expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. + potts: bool + Whether to set up the potts model (can be RAM-intensive and time-intensive), + which is unnecessary if all you want to get is the indicator functions. Returns ------- AWSEM object """ + # set sequence based on argument self.N = len(sequence) self.sequence = sequence @@ -82,6 +87,9 @@ def __init__(self, # i guess not exposing indicator functions saves memory? self.expose_indicator_functions = expose_indicator_functions + # whether to compute potts model + self.potts = potts + # parse other arguments p = AWSEMParameters(**parameters) if p.min_sequence_separation_contact is None: @@ -130,7 +138,7 @@ def __init__(self, self._decoy_fluctuation = {} # don't know what this does self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ - def subclass_setup_helper(self, potts): + def subclass_setup_helper(self): """ This method calls methods to calculate native indicator functions, masks (based on the native distance matrix), and native energy, @@ -142,18 +150,10 @@ def subclass_setup_helper(self, potts): preceding the call to this method) and how they implement the calculate_indicators and calculate_masks methods called by subclass_setup_helper - - Parameters - ---------- - potts: bool=True - Whether to build the potts model from the freshly evaluated indicators and masks. - For frustration calculations, this must always be done. 
However, if this class is - being used only to extract indicator functions from a structure, then building - the potts model may be a waste of RAM and time. """ self.calculate_masks() # subclasses should (re)define this method as needed self.calculate_indicators() # subclasses should (re)define this method as needed - if potts: + if self.potts: self.calculate_energy_and_potts() else: if 'potts_model' in dir(self) or 'burial_energy' in dir(self)\ @@ -246,6 +246,7 @@ def __init__(self, pdb_structure: object, sequence: str =None, expose_indicator_functions: bool=False, + potts: bool=True, alt_sigma_wat: bool=False, **parameters)->object: self.alt_sigma_wat = alt_sigma_wat @@ -253,10 +254,10 @@ def __init__(self, if not sequence: sequence = pdb_structure.sequence # load structure-independent parameters and methods - super().__init__(sequence, expose_indicator_functions, **parameters) + super().__init__(sequence, expose_indicator_functions, potts, **parameters) # set up strucure self.setup_structure(pdb_structure) - self.subclass_setup_helper(potts=True) + self.subclass_setup_helper() def setup_structure(self, pdb_structure): # check structure @@ -286,7 +287,7 @@ def pdb_structure(self,pdb_structure): if self.N != len(self.sequence): breakpoint() raise ValueError("The pdb is incomplete. 
Try setting 'repair_pdb=True' when constructing the Structure object.") - self.subclass_setup_helper(potts=True) + self.subclass_setup_helper() def change_conformation(alternative_pdb_structure): # this function is an alias for the pdb_structure setter self.pdb_structure = alternative_pdb_structure @@ -523,7 +524,10 @@ def __init__(self, AWSEMIndicators object """ - super().__init__(sequence, expose_indicator_functions, **parameters) + # if we already have our indicator functions, + # our goal is probably to compute the potts model, + # so we'll just hard code a value of True for that argument VVVV + super().__init__(sequence, expose_indicator_functions, potts=True, **parameters) self.burial_indicator = burial_indicator self.direct_indicator = direct_indicator self.protein_indicator = protein_indicator @@ -555,7 +559,7 @@ def __init__(self, #np.save('protein_indicator_1.npy', protein_indicator) #np.save('water_indicator_1.npy', water_indicator) #np.save('electrostatics_indicator_1.npy', electrostatics_indicator) - self.subclass_setup_helper(potts=True) + self.subclass_setup_helper() def calculate_indicators(self): pass # the function was initialized with indicators, so there's nothing to do @@ -585,10 +589,13 @@ def __init__(self, AWSEMVariancePotts object """ - super().__init__(sequence, expose_indicator_functions, **parameters) + # if we already have our indicator functions, + # our goal is probably to compute the potts model, + # so we'll just hard code a value of True for that argument VVVV + super().__init__(sequence, expose_indicator_functions, potts=True, **parameters) self.covariance_matrix = covariance_matrix self.num_indicators = 3*self.N + 4*(self.N**2-self.N)/2 # low, med, high burial for each N, 4 classes of pair interactions - self.subclass_setup_helper(potts=True) + self.subclass_setup_helper() @staticmethod # trying to avoid loading down memory with too many permanent attributes def pairwise_mask(l): # l for length From 
0e80f46ac93e773e6e2d153a9786b7eecafd14c1 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 8 Nov 2025 20:30:32 -0600 Subject: [PATCH 45/76] add distance matrix-based conformer update capability to AWSEM class --- frustratometer/classes/AWSEM.py | 80 ++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 4736f6e2..119c9993 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -243,38 +243,76 @@ def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): class AWSEM(AWSEMBase): def __init__(self, - pdb_structure: object, + pdb_structure: object | tuple, # tuple is an object, but this clarifies what we expect sequence: str =None, expose_indicator_functions: bool=False, potts: bool=True, alt_sigma_wat: bool=False, **parameters)->object: - self.alt_sigma_wat = alt_sigma_wat # assume the user wanted the sequence from the pdb structure if not given if not sequence: - sequence = pdb_structure.sequence + try: + sequence = pdb_structure.sequence + except: + if isinstance(pdb_structure,tuple): + raise ValueError("""It seems that you are trying to use + the tuple pdb_structure format, which + specifies a conformation but not a sequence. 
+ In this case, you must provide the sequence + as a separate argument to this class.""") + else: + raise # load structure-independent parameters and methods super().__init__(sequence, expose_indicator_functions, potts, **parameters) + self.alt_sigma_wat = alt_sigma_wat # set up strucure self.setup_structure(pdb_structure) self.subclass_setup_helper() def setup_structure(self, pdb_structure): - # check structure - selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') - resid = selection_CB.getResindices() - N=len(resid) - self.resid = resid - self.N = N - # set structure-dependent properties - self._pdb_structure = pdb_structure - self.structure=pdb_structure.structure - self.chain=pdb_structure.chain - self.pdb_file=pdb_structure.pdb_file - self.init_index_shift=pdb_structure.init_index_shift - self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict - self.distance_matrix=pdb_structure.distance_matrix - self.full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix + if not isinstance(pdb_structure, tuple): # alt_conf should our custom Structure object + # maybe our type check here should be more restrictive, + # but the __init__ only requires pdb_structure to be an object, + # so I'll take my cue from that + # check structure + selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') + resid = selection_CB.getResindices() + N=len(resid) + self.resid = resid + self.N = N + # set structure-dependent properties + self._pdb_structure = pdb_structure + self.structure=pdb_structure.structure + self.chain=pdb_structure.chain + self.pdb_file=pdb_structure.pdb_file + self.init_index_shift=pdb_structure.init_index_shift + self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict + self.distance_matrix=pdb_structure.distance_matrix + self.full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix + self.midpoint_matrix = pdb_structure.midpoint_matrix + # midpoint 
matrix is used to map interacting pairs to a single point in space + elif isinstance(pdb_structure, tuple): # pdb_structure is defined by a few distance matrices + if len(pdb_structure)==3\ + and isinstance(pdb_structure[0],np.ndarray)\ + and isinstance(pdb_structure[1],np.ndarray)\ + and isinstance(pdb_structure[2],np.ndarray) or pdb_structure[2] is None: + # pdb_structure is a full_pdb_distance_matrix + # followed by a distance_matrix + # followed by a midpoint matrix (or None) + self._pdb_structure = None # we're getting our conformer from within python, not a pdb file + self.structure = None # we're getting our conformer from within python, not a pdb file + self.full_pdb_distance_matrix = pdb_structure[0] + self.distance_matrix = pdb_structure[1] + self.midpoint_matrix = pdb_structure[2] + # midpoint matrix is used to map interacting pairs to a single point in space; + # usually not necessary, so it will usually be None + # + # the rest of the attributes that are set in the case that pdb_structure is a Structure + # either remain the same (if this method has been previously called with a Structure) + # or go undefined (if we are passing a list of arrays the first time that we are calling + # this method) + else: + raise AssertionError("unexpected else block") @property def pdb_structure(self): @@ -288,9 +326,9 @@ def pdb_structure(self,pdb_structure): breakpoint() raise ValueError("The pdb is incomplete. 
Try setting 'repair_pdb=True' when constructing the Structure object.") self.subclass_setup_helper() - def change_conformation(alternative_pdb_structure): - # this function is an alias for the pdb_structure setter - self.pdb_structure = alternative_pdb_structure + def change_conformation(alt_conf): + # this method is an alias for the setter + self.pdb_structure = alt_conf def calculate_indicators(self): # Calculate rho From c1c89cca49462779bec153c296decf20f34da87f Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 9 Nov 2025 17:05:44 -0600 Subject: [PATCH 46/76] fixed missing 'self' in method definition --- frustratometer/classes/AWSEM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 119c9993..d570a2ed 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -326,7 +326,7 @@ def pdb_structure(self,pdb_structure): breakpoint() raise ValueError("The pdb is incomplete. 
Try setting 'repair_pdb=True' when constructing the Structure object.") self.subclass_setup_helper() - def change_conformation(alt_conf): + def change_conformation(self,alt_conf): # this method is an alias for the setter self.pdb_structure = alt_conf From f64cbce069a12f93c89f1532ca13b4a8dc261f4b Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 16 Nov 2025 15:22:11 -0600 Subject: [PATCH 47/76] small optimization edits; should come back to this soon --- frustratometer/optimization/optimization.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 663347a3..1f24dcaf 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -303,6 +303,8 @@ def compute_energy(seq_index: np.array) -> float: energy_J -= model_J[i, j, aa_i, aa_j] * mask[i, j] total_energy = energy_h + energy_J / 2 + #with open('energies.txt','a') as f: + # f.write(f"{total_energy}\n") return total_energy def compute_denergy_mutation(seq_index: np.ndarray, pos: int, aa_new: int) -> float: @@ -873,7 +875,7 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA, n_decoys= self.indicators2D=np.array([ind for ind in self.indicators if len(ind.shape)==2]) #TODO: Fix the gamma matrix to account for elecrostatics self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) - + self.initialize_functions() def initialize_functions(self): @@ -882,6 +884,7 @@ def initialize_functions(self): len_alphabet=self.alphabet_size phi_len= indicators1D.shape[0]*len_alphabet + indicators2D.shape[0]*len_alphabet**2 gamma=self.gamma + rng = np.random.default_rng() # Precompute the mean of the indicators indicator_means=np.zeros(len(indicators1D)+len(indicators2D)) @@ -907,9 +910,15 @@ def compute_energy(seq_index): This function 
is much faster than compute_energy_permutation but is an approximation""" energies=np.zeros(n_decoys) shuffled_index=seq_index.copy() + # randomize the amino acid identities at 20 positions + for _ in numba.prange(1): + #to_replace = rng.integers(low=0,high=len(seq_index)) + #shuffled_index[to_replace] = rng.integers(0,high=len_alphabet) + to_replace = np.random.randint(0,high=len(seq_index)) + shuffled_index[to_replace] = np.random.randint(0,high=len_alphabet) for i in numba.prange(n_decoys): energies[i]=awsem_energy(shuffled_index[np.random.permutation(len(shuffled_index))]) - return np.var(energies) + return np.std(energies) else: def compute_energy(seq_index): counts = np.zeros(len_alphabet, dtype=np.int64) @@ -947,6 +956,9 @@ def denergy_mutation(seq_index, pos, aa): seq_index_new = seq_index.copy() seq_index_new[pos] = aa return compute_energy_numba(seq_index_new) - compute_energy_numba(seq_index) + + def denergy_swap(seq_index, pos1, pos2): + return 0 self.compute_energy = compute_energy self.compute_denergy_mutation = denergy_mutation From 1aa9d86f5274da61ed1c2811388e4f2ed89c4b50 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 16 Nov 2025 15:39:03 -0600 Subject: [PATCH 48/76] cleaning up tiny unmerged diff (just a newline) --- frustratometer/optimization/optimization.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 7ac9f999..0560bbdc 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -1170,10 +1170,7 @@ def initialize_functions(self): phi_len= indicators1D.shape[0]*len_alphabet + indicators2D.shape[0]*len_alphabet**2 gamma=self.gamma rng = np.random.default_rng() -<<<<<<< HEAD - -======= ->>>>>>> origin/temp_decoy + # Precompute the mean of the indicators indicator_means=np.zeros(len(indicators1D)+len(indicators2D)) c=0 From 000c41b05ab3c7d4c0a4c24f758a34de2f0415d1 Mon Sep 
17 00:00:00 2001 From: Finley Clark Date: Wed, 26 Nov 2025 11:20:59 -0600 Subject: [PATCH 49/76] added colorbar scale and increased label fontsize --- frustratometer/classes/Gamma.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frustratometer/classes/Gamma.py b/frustratometer/classes/Gamma.py index 6f9ab658..a31c361b 100644 --- a/frustratometer/classes/Gamma.py +++ b/frustratometer/classes/Gamma.py @@ -399,7 +399,7 @@ def correlate_segments(self, other): return correlations # Plotting - def plot_gamma(self, new_order=None): + def plot_gamma(self, new_order=None, scale=[-5,5]): import matplotlib.pyplot as plt import seaborn as sns if new_order: @@ -410,19 +410,19 @@ def plot_gamma(self, new_order=None): f, axes = plt.subplots(2, 2, figsize=(18, 16)) f.subplots_adjust(hspace=50) # fix overlap between axis ticks of upper subplots and titles of lower subplots titles = ['Burial Gammas', 'Direct Gammas', 'Protein Gammas', 'Water Gammas'] - for i, (title, name) in enumerate(zip(titles, segments)): ax = axes[i // 2, i % 2] - sns.heatmap(segments[name].reshape(-1, 20), ax=ax, cmap='RdBu_r', center=0) + foo = sns.heatmap(segments[name].reshape(-1, 20), ax=ax, cmap='RdBu_r', center=0, vmin=scale[0], vmax=scale[1]) + foo.collections[0].colorbar.ax.tick_params(labelsize=16) ax.set_title(title) ax.set_xticks(np.arange(len(self.alphabet)) + 0.5) - ax.set_xticklabels(self.alphabet) + ax.set_xticklabels(self.alphabet, size=16) if i==0: # burial ax.set_yticks([0.5,1.5,2.5]) - ax.set_yticklabels(['low','medium','high'], rotation=45, size=12) + ax.set_yticklabels(['low','medium','high'], rotation=45, size=16) else: # direct, prot, or wat ax.set_yticks(np.arange(len(self.alphabet)) + 0.5) - ax.set_yticklabels(self.alphabet, rotation=0) + ax.set_yticklabels(self.alphabet, rotation=0, fontsize=16) plt.tight_layout() plt.show() From d81b644a8dc3e3a53e7e93036d31cdce3e0dc6ab Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 5 Dec 2025 16:29:32 -0600 
Subject: [PATCH 50/76] changed name of AWSEMParameters class to Parameters --- frustratometer/classes/AWSEM.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index d570a2ed..ede2141a 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -10,7 +10,7 @@ __all__ = ['AWSEM','AWSEMIndicators','DecoyEnsemble', 'AWSEMVariancePotts'] -class AWSEMParameters(BaseModel): +class Parameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) """Default parameters for AWSEM energy calculations.""" k_contact: float = Field(4.184, description="Coefficient for contact potential. (kJ/mol)") @@ -91,7 +91,7 @@ def __init__(self, self.potts = potts # parse other arguments - p = AWSEMParameters(**parameters) + p = Parameters(**parameters) if p.min_sequence_separation_contact is None: p.min_sequence_separation_contact = 1 if p.min_sequence_separation_rho is None: From f970469d7e5db17cc83048a4f02147d575cb7b86 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 5 Dec 2025 19:06:19 -0600 Subject: [PATCH 51/76] currently passing tests --- frustratometer/classes/AWSEM.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index ede2141a..a42be402 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -13,7 +13,13 @@ class Parameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) """Default parameters for AWSEM energy calculations.""" - k_contact: float = Field(4.184, description="Coefficient for contact potential. (kJ/mol)") + k_contact: float = Field(4.184, description=""" + Scale factor for contact potential. + Many parameters used to be given in kcal/mol, + but we want our results in kJ/mol, so this is + set to the appropriate conversion factor by default. 
+ Note that the electrostatic parameter is not scaled + by k_contact.""") #Density eta: float = Field(5.0, description="Sharpness of the distance-based switching function (Angstrom^-1).") @@ -53,7 +59,19 @@ class AWSEMBase(Frustratometer): #Mapping to DCA q = 20 - aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] #A gap has no energy + ref_alphabet = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V'] + # ref_alphabet orders the amino acids alphabetically based on the 3-letter code; + # it was used historically, e.g. Tables 3-6 of "Water in protein structure prediction" + # (https://www.pnas.org/doi/10.1073/pnas.0307851100) + aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] + # when used to index ref_alphabet, aa_map_awsem_list gives a list of all amino acids alphabetized + # by one-letter code, with an extra A at the beginning + new_alphabet = [] + for aa in aa_map_awsem_list: + new_alphabet.append(ref_alphabet[aa]) + assert new_alphabet == ['A','A','C','D','E','F','G','H','I','K','L', + 'M','N','P','Q','R','S','T','V','W','Y'], new_alphabet + # we are trying to phase out aa_map_awsem_list aa_map_awsem_x, aa_map_awsem_y = np.meshgrid(aa_map_awsem_list, aa_map_awsem_list, indexing='ij') def __init__(self, @@ -109,11 +127,12 @@ def __init__(self, gamma = Gamma(self.p.gamma) else: raise ValueError("Gamma parameter must be a path or a Gamma object.") + #gamma = gamma.reorder(alphabet=) self.gamma=gamma - self.burial_gamma = gamma['Burial'].T - self.direct_gamma = gamma['Direct'][0] - self.protein_gamma = gamma['Protein'][0] - self.water_gamma = gamma['Water'][0] + self.burial_gamma = gamma['Burial'].T # (3,20) -> (20,3) + self.direct_gamma = gamma['Direct'][0] # (1,20,20) -> (20,20) + self.protein_gamma = gamma['Protein'][0] # (1,20,20) -> (20,20) + self.water_gamma = gamma['Water'][0] # (1,20,20) -> (20,20) # set other attributes 
self.burial_in_context = self.p.burial_in_context From c7f3ffdd506adcd6d5c5db1e90c7c304cd16f642 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 6 Dec 2025 14:30:28 -0600 Subject: [PATCH 52/76] _AA and gamma refactoring, but burial energy calculation broken (contact probably OK) --- frustratometer/classes/AWSEM.py | 114 ++++++++++++---------- frustratometer/classes/Frustratometer.py | 20 ++-- frustratometer/frustration/frustration.py | 52 +++++----- tests/test_awsem_frustratometer.py | 10 +- 4 files changed, 110 insertions(+), 86 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index a42be402..c6805290 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -55,25 +55,11 @@ class Parameters(BaseModel): charges: np.array = Field(np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]), description="Charge on each residue type") # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] + #charges: np.array = Field(np.array([0, 0, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]), description="Charge on each residue type") + #['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] + class AWSEMBase(Frustratometer): - #Mapping to DCA - q = 20 - ref_alphabet = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V'] - # ref_alphabet orders the amino acids alphabetically based on the 3-letter code; - # it was used historically, e.g. 
Tables 3-6 of "Water in protein structure prediction" - # (https://www.pnas.org/doi/10.1073/pnas.0307851100) - aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] - # when used to index ref_alphabet, aa_map_awsem_list gives a list of all amino acids alphabetized - # by one-letter code, with an extra A at the beginning - new_alphabet = [] - for aa in aa_map_awsem_list: - new_alphabet.append(ref_alphabet[aa]) - assert new_alphabet == ['A','A','C','D','E','F','G','H','I','K','L', - 'M','N','P','Q','R','S','T','V','W','Y'], new_alphabet - # we are trying to phase out aa_map_awsem_list - aa_map_awsem_x, aa_map_awsem_y = np.meshgrid(aa_map_awsem_list, aa_map_awsem_list, indexing='ij') - def __init__(self, sequence: str, expose_indicator_functions: bool=False, @@ -125,14 +111,45 @@ def __init__(self, gamma = self.p.gamma elif isinstance(self.p.gamma, Path): gamma = Gamma(self.p.gamma) + self.p.gamma = gamma else: raise ValueError("Gamma parameter must be a path or a Gamma object.") - #gamma = gamma.reorder(alphabet=) - self.gamma=gamma - self.burial_gamma = gamma['Burial'].T # (3,20) -> (20,3) - self.direct_gamma = gamma['Direct'][0] # (1,20,20) -> (20,20) - self.protein_gamma = gamma['Protein'][0] # (1,20,20) -> (20,20) - self.water_gamma = gamma['Water'][0] # (1,20,20) -> (20,20) + """ + # CARLOS: if you really want to reorder, we can do something like this, + but it shouldn't be necessary--we always have access to the + order in self.gamma.alphabet + ordered_alphabet = ['A','C','D','E','F','G','H','I','K','L', + 'M','N','P','Q','R','S','T','V','W','Y'] + for aa in ordered_alphabet: + assert aa in gamma.alphabet, f'{aa} missing from gamma.alphabet!' 
+ if len(gamma.alphabet) == 20: # alphabet is exactly the canonical AAs + gamma = gamma.reorder(ordered_alphabet) + elif len(gamma.alphabet) > 20: # includes noncanonical AA(s) (or a "gap") + ncAA = [] + for aa in gamma.alphabet: + if aa not in ordered_alphabet: + ncAA.append(AA) + ordered_alphabet = ncAA + ordered_alphabet # insert at the beginning + gamma = gamma.reorder(ordered_alphabet) + else: + raise ValueError(f"gamma file alphabet {gamma.alphabet} was too short") + """ + # burial gamma + self.q = len(gamma.alphabet) # most likely 20, but could be different + gb = gamma['Burial'] + if gb.shape == (3,self.q): + self.burial_gamma = gb.T + elif gb.shape == (self.q,3): + self.burial_gamma = gb + else: + raise ValueError(f"""Don't know how to parse burial gamma with shape {gb.shape}. + Expected ({self.q},3) or (3,{self.q}).""") + # pairwise gamma: squeeze to remove extra axis that is commonly present + self.direct_gamma = np.squeeze(gamma['Direct']) + self.protein_gamma = np.squeeze(gamma['Protein']) + self.water_gamma = np.squeeze(gamma['Water']) + assert self.direct_gamma.shape == self.protein_gamma.shape == self.water_gamma.shape == (self.q,self.q) + self.gamma = self.p.gamma # set other attributes self.burial_in_context = self.p.burial_in_context @@ -153,9 +170,6 @@ def __init__(self, # it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix self.charges2 = charges2 - # ?????? 
- self._decoy_fluctuation = {} # don't know what this does - self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ def subclass_setup_helper(self): """ @@ -239,12 +253,14 @@ def calculate_energy_and_potts(self): # Compute potts model self.potts_model = {} - self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] - self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] + self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, :]#self.aa_map_awsem_list] + assert self.potts_model['h'].shape == (self.N, self.q), self.potts_model['h'].shape + self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, :, :]#self.aa_map_awsem_x, self.aa_map_awsem_y] + assert self.potts_model['J'].shape == (self.N, self.N, self.q, self.q), self.potts_model['J'].shape # Set the gap energy to zero - self.potts_model['h'][:, 0] = 0 - self.potts_model['J'][:, :, 0, :] = 0 - self.potts_model['J'][:, :, :, 0] = 0 + #self.potts_model['h'][:, 0] = 0 + #self.potts_model['J'][:, :, 0, :] = 0 + #self.potts_model['J'][:, :, :, 0] = 0 self._native_energy=None # don't know what this does @@ -289,7 +305,7 @@ def __init__(self, self.subclass_setup_helper() def setup_structure(self, pdb_structure): - if not isinstance(pdb_structure, tuple): # alt_conf should our custom Structure object + if not isinstance(pdb_structure, tuple): # alt_conf should be our custom Structure object # maybe our type check here should be more restrictive, # but the __init__ only requires pdb_structure to be an object, # so I'll take my cue from that @@ -387,16 +403,16 @@ def calculate_indicators(self): self.indicators.append(protein_indicator[:,:,0,0]*self.sequence_mask_contact) self.indicators.append(water_indicator[:,:,0,0]*self.sequence_mask_contact) self.gamma_array=[] - temp_burial_gamma=self.burial_gamma[self.aa_map_awsem_list] - temp_burial_gamma[0]=0 + 
temp_burial_gamma=self.burial_gamma[:]#self.aa_map_awsem_list] + #temp_burial_gamma[0]=0 temp_burial_gamma *= -0.5 * self.p.k_contact self.gamma_array.append(temp_burial_gamma[:,0]) self.gamma_array.append(temp_burial_gamma[:,1]) self.gamma_array.append(temp_burial_gamma[:,2]) for contact_gamma in [self.direct_gamma, self.protein_gamma, self.water_gamma]: - temp_gamma = contact_gamma[self.aa_map_awsem_x, self.aa_map_awsem_y].copy() - temp_gamma[0, :] = 0 - temp_gamma[:, 0] = 0 + temp_gamma = contact_gamma[:,:].copy()#self.aa_map_awsem_x, self.aa_map_awsem_y].copy() + #temp_gamma[0, :] = 0 + #temp_gamma[:, 0] = 0 temp_gamma *= -0.5 * self.k_contact self.gamma_array.append(temp_gamma) self.burial_indicator = burial_indicator # probably could get rid of either this or indicators list @@ -408,9 +424,9 @@ def calculate_indicators(self): electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask self.indicators.append(electrostatics_indicator) self.electrostatics_indicator = electrostatics_indicator # probably could get rid of either this or indicators list - temp_gamma = 0.5 * self.p.k_electrostatics * self.charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] - temp_gamma[0,:]=0 - temp_gamma[:,0]=0 + temp_gamma = 0.5 * self.p.k_electrostatics * self.charges2[:,:]#self.aa_map_awsem_x, self.aa_map_awsem_y] + #temp_gamma[0,:]=0 + #temp_gamma[:,0]=0 self.gamma_array.append(temp_gamma) def calculate_energy_and_potts(self): @@ -427,9 +443,9 @@ def calculate_energy_and_potts(self): def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] - _AA='ARNDCQEGHILKMFPSTWYV' + _AA = self.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' if aa_freq is None: - seq_index = np.array([_AA.find(aa) for aa in self.sequence]) + seq_index = np.array([_AA.index(aa) for aa in 
self.sequence]) N=self.N else: N=self.N*10 @@ -485,8 +501,8 @@ def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): return mean_decoy_energy, std_decoy_energy def compute_configurational_energies(self): - _AA='ARNDCQEGHILKMFPSTWYV' - seq_index = np.array([_AA.find(aa) for aa in self.sequence]) + _AA= self.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' + seq_index = np.array([_AA.index(aa) for aa in self.sequence]) distances = np.triu(self.distance_matrix) distances = distances[(distances0)] n_contacts=len(distances) @@ -810,12 +826,12 @@ def calculate_energy_and_potts(self): # Compute potts model self.potts_model = {} - self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] - self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] + self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, :]#self.aa_map_awsem_list] + self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, :, :]#self.aa_map_awsem_x, self.aa_map_awsem_y] # Set the gap energy to zero - self.potts_model['h'][:, 0] = 0 - self.potts_model['J'][:, :, 0, :] = 0 - self.potts_model['J'][:, :, :, 0] = 0 + #self.potts_model['h'][:, 0] = 0 + #self.potts_model['J'][:, :, 0, :] = 0 + #self.potts_model['J'][:, :, :, 0] = 0 self._native_energy=None # don't know what this does class DecoyEnsemble(): diff --git a/frustratometer/classes/Frustratometer.py b/frustratometer/classes/Frustratometer.py index 36a79542..929f6a38 100644 --- a/frustratometer/classes/Frustratometer.py +++ b/frustratometer/classes/Frustratometer.py @@ -61,9 +61,9 @@ def native_energy(self,sequence:str = None,ignore_couplings_of_gaps:bool=False,i if sequence is None: sequence=self.sequence else: - return frustration.compute_native_energy(sequence, self.potts_model, self.mask,ignore_couplings_of_gaps,ignore_fields_of_gaps) + return frustration.compute_native_energy(sequence, self.potts_model, 
self.mask,ignore_couplings_of_gaps,ignore_fields_of_gaps,self.gamma.alphabet) if not self._native_energy: - self._native_energy=frustration.compute_native_energy(sequence, self.potts_model, self.mask,ignore_couplings_of_gaps,ignore_fields_of_gaps) + self._native_energy=frustration.compute_native_energy(sequence, self.potts_model, self.mask,ignore_couplings_of_gaps,ignore_fields_of_gaps,self.gamma.alphabet) energy_value=self._native_energy return energy_value @@ -89,7 +89,7 @@ def sequences_energies(self, sequences:np.array, split_couplings_and_fields:bool output (if split_couplings_and_fields==True): np.array Array containing computed fields and couplings energies of the protein sequences. """ - output=frustration.compute_sequences_energy(sequences, self.potts_model, self.mask, split_couplings_and_fields) + output=frustration.compute_sequences_energy(sequences, self.potts_model, self.mask, split_couplings_and_fields,self.gamma.alphabet) return output def fields_energy(self, sequence:str = None, ignore_fields_of_gaps:bool = False) -> float: @@ -114,7 +114,7 @@ def fields_energy(self, sequence:str = None, ignore_fields_of_gaps:bool = False) """ if sequence is None: sequence=self.sequence - fields_energy=frustration.compute_fields_energy(sequence, self.potts_model,ignore_fields_of_gaps) + fields_energy=frustration.compute_fields_energy(sequence, self.potts_model,ignore_fields_of_gaps,self.gamma.alphabet) return fields_energy def couplings_energy(self, sequence:str = None,ignore_couplings_of_gaps:bool = False) -> float: @@ -139,7 +139,7 @@ def couplings_energy(self, sequence:str = None,ignore_couplings_of_gaps:bool = F """ if sequence is None: sequence=self.sequence - couplings_energy=frustration.compute_couplings_energy(sequence, self.potts_model, self.mask,ignore_couplings_of_gaps) + couplings_energy=frustration.compute_couplings_energy(sequence, self.potts_model, self.mask,ignore_couplings_of_gaps,self.gamma.alphabet) return couplings_energy def 
decoy_fluctuation(self, sequence:str = None,kind:str = 'singleresidue',mask:np.array = None) -> np.array: @@ -167,13 +167,13 @@ def decoy_fluctuation(self, sequence:str = None,kind:str = 'singleresidue',mask: if not isinstance(mask, np.ndarray): mask=self.mask if kind == 'singleresidue': - fluctuation = frustration.compute_singleresidue_decoy_energy_fluctuation(sequence, self.potts_model, mask) + fluctuation = frustration.compute_singleresidue_decoy_energy_fluctuation(sequence, self.potts_model, mask,self.gamma.alphabet) elif kind == 'mutational': - fluctuation = frustration.compute_mutational_decoy_energy_fluctuation(sequence, self.potts_model, mask) + fluctuation = frustration.compute_mutational_decoy_energy_fluctuation(sequence, self.potts_model, mask,self.gamma.alphabet) elif kind == 'configurational': - fluctuation = frustration.compute_configurational_decoy_energy_fluctuation(sequence, self.potts_model, mask) + fluctuation = frustration.compute_configurational_decoy_energy_fluctuation(sequence, self.potts_model, mask,self.gamma.alphabet) elif kind == 'contact': - fluctuation = frustration.compute_contact_decoy_energy_fluctuation(sequence, self.potts_model, mask) + fluctuation = frustration.compute_contact_decoy_energy_fluctuation(sequence, self.potts_model, mask,self.gamma.alphabet) else: raise Exception("Wrong kind of decoy generation selected") self._decoy_fluctuation[kind] = fluctuation @@ -268,7 +268,7 @@ def plot_decoy_energy(self, sequence:str = None, kind:str = 'singleresidue', met native_energy = self.native_energy(sequence=sequence) decoy_energy = self.decoy_energy(kind=kind,sequence=sequence) if kind == 'singleresidue': - g = frustration.plot_singleresidue_decoy_energy(decoy_energy, native_energy, method) + g = frustration.plot_singleresidue_decoy_energy(decoy_energy, native_energy, method,self.gamma.alphabet) return g def roc(self): diff --git a/frustratometer/frustration/frustration.py b/frustratometer/frustration/frustration.py index 
c5345be5..240a2368 100644 --- a/frustratometer/frustration/frustration.py +++ b/frustratometer/frustration/frustration.py @@ -59,7 +59,8 @@ def compute_native_energy(seq: str, potts_model: dict, mask: np.array, ignore_gap_couplings: bool = False, - ignore_gap_fields: bool = False) -> float: + ignore_gap_fields: bool = False, + AA : str = _AA) -> float: """ Computes the native energy of a protein sequence based on a given Potts model and an interaction mask. @@ -107,7 +108,7 @@ def compute_native_energy(seq: str, .. todo:: Optimize the computation. """ - seq_index = np.array([_AA.find(aa) for aa in seq]) + seq_index = np.array([AA.index(aa) for aa in seq]) seq_len = len(seq_index) pos1, pos2 = np.meshgrid(np.arange(seq_len), np.arange(seq_len), indexing='ij', sparse=True) @@ -133,7 +134,8 @@ def compute_native_energy(seq: str, def compute_fields_energy(seq: str, potts_model: dict, - ignore_fields_of_gaps: bool = False) -> float: + ignore_fields_of_gaps: bool = False, + AA : str = _AA) -> float: """ Computes the fields energy of a protein sequence based on a given Potts model. @@ -165,7 +167,7 @@ def compute_fields_energy(seq: str, >>> fields_energy = compute_fields_energy(seq, potts_model) >>> print(f"Computed fields energy: {fields_energy:.2f}") """ - seq_index = np.array([_AA.find(aa) for aa in seq]) + seq_index = np.array([AA.index(aa) for aa in seq]) seq_len = len(seq_index) h = -potts_model['h'][range(seq_len), seq_index] @@ -180,7 +182,8 @@ def compute_fields_energy(seq: str, def compute_couplings_energy(seq: str, potts_model: dict, mask: np.array, - ignore_couplings_of_gaps: bool = False) -> float: + ignore_couplings_of_gaps: bool = False, + AA : str = _AA) -> float: """ Computes the couplings energy of a protein sequence based on a given Potts model and an interaction mask. @@ -223,7 +226,7 @@ def compute_couplings_energy(seq: str, .. todo:: Optimize the computation. 
""" - seq_index = np.array([_AA.find(aa) for aa in seq]) + seq_index = np.array([AA.index(aa) for aa in seq]) seq_len = len(seq_index) pos1, pos2 = np.meshgrid(np.arange(seq_len), np.arange(seq_len), indexing='ij', sparse=True) aa1, aa2 = np.meshgrid(seq_index, seq_index, indexing='ij', sparse=True) @@ -241,7 +244,8 @@ def compute_couplings_energy(seq: str, def compute_sequences_energy(seqs: list, potts_model: dict, mask: np.array, - split_couplings_and_fields = False) -> np.array: + split_couplings_and_fields = False, + AA : str = _AA) -> np.array: """ Computes the energy of multiple protein sequences based on a given Potts model and an interaction mask. @@ -288,7 +292,7 @@ def compute_sequences_energy(seqs: list, .. todo:: Optimize the computation. """ - seq_index = np.array([[_AA.find(aa) for aa in seq] for seq in seqs]) + seq_index = np.array([[AA.index(aa) for aa in seq] for seq in seqs]) N_seqs, seq_len = seq_index.shape pos_index=np.repeat([np.arange(seq_len)], N_seqs,axis=0) @@ -312,7 +316,8 @@ def compute_sequences_energy(seqs: list, def compute_singleresidue_decoy_energy_fluctuation(seq: str, potts_model: dict, - mask: np.array) -> np.array: + mask: np.array, + AA : str = _AA) -> np.array: """ Computes a (Lx21) matrix for a sequence of length L. Row i contains all possible changes in energy upon mutating residue i. @@ -354,7 +359,7 @@ def compute_singleresidue_decoy_energy_fluctuation(seq: str, .. todo:: Optimize the computation. """ - seq_index = np.array([_AA.find(aa) for aa in seq]) + seq_index = np.array([AA.index(aa) for aa in seq]) seq_len = len(seq_index) # Create decoys @@ -377,7 +382,8 @@ def compute_singleresidue_decoy_energy_fluctuation(seq: str, def compute_mutational_decoy_energy_fluctuation(seq: str, potts_model: dict, - mask: np.array, ) -> np.array: + mask: np.array, + AA : str = _AA) -> np.array: """ Computes a (LxLx21x21) matrix for a sequence of length L. 
Matrix[i,j] describes all possible changes in energy upon mutating residue i and j simultaneously. @@ -418,7 +424,7 @@ def compute_mutational_decoy_energy_fluctuation(seq: str, .. todo:: Optimize the computation. """ - seq_index = np.array([_AA.find(aa) for aa in seq]) + seq_index = np.array([AA.index(aa) for aa in seq]) seq_len = len(seq_index) # Create masked decoys @@ -456,7 +462,8 @@ def compute_mutational_decoy_energy_fluctuation(seq: str, def compute_configurational_decoy_energy_fluctuation(seq: str, potts_model: dict, - mask: np.array, ) -> np.array: + mask: np.array, + AA : str = _AA) -> np.array: """ Computes a (LxLx21x21) matrix for a sequence of length L. Matrix[i,j] describes all possible changes in energy upon mutating and altering the local densities of residue i and j simultaneously. @@ -498,7 +505,7 @@ def compute_configurational_decoy_energy_fluctuation(seq: str, .. todo:: Optimize the computation. """ - seq_index = np.array([_AA.find(aa) for aa in seq]) + seq_index = np.array([AA.index(aa) for aa in seq]) seq_len = len(seq_index) # Create masked decoys @@ -536,7 +543,8 @@ def compute_configurational_decoy_energy_fluctuation(seq: str, def compute_contact_decoy_energy_fluctuation(seq: str, potts_model: dict, - mask: np.array) -> np.array: + mask: np.array, + AA : str = _AA) -> np.array: r""" $$ \Delta DCA_{ij} = \Delta j_{ij} $$ :param seq: @@ -545,7 +553,7 @@ def compute_contact_decoy_energy_fluctuation(seq: str, :return: """ - seq_index = np.array([_AA.find(aa) for aa in seq]) + seq_index = np.array([AA.index(aa) for aa in seq]) seq_len = len(seq_index) # Create decoys @@ -611,7 +619,7 @@ def compute_decoy_energy(seq: str, potts_model: dict, mask: np.array, kind='sing decoy_energy=native_energy + compute_contact_decoy_energy_fluctuation(seq, potts_model, mask) return decoy_energy -def compute_aa_freq(seq, include_gaps=True): +def compute_aa_freq(seq, include_gaps=True, AA = _AA): """ Calculates amino acid frequencies in given sequence @@ -629,14 
+637,14 @@ def compute_aa_freq(seq, include_gaps=True): aa_freq: np.array Array of frequencies of all 21 possible amino acids within sequence """ - seq_index = np.array([_AA.find(aa) for aa in seq]) + seq_index = np.array([AA.index(aa) for aa in seq]) aa_freq = np.array([(seq_index == i).sum() for i in range(21)]) if not include_gaps: aa_freq[0] = 0 return aa_freq -def compute_contact_freq(seq): +def compute_contact_freq(seq, AA = _AA): """ Calculates contact frequencies in given sequence @@ -650,7 +658,7 @@ def compute_contact_freq(seq): contact_freq: np.array 21x21 array of frequencies of all possible contacts within sequence. """ - seq_index = np.array([_AA.find(aa) for aa in seq]) + seq_index = np.array([AA.index(aa) for aa in seq]) aa_freq = np.array([(seq_index == i).sum() for i in range(21)], dtype=np.float64) aa_freq /= aa_freq.sum() contact_freq = (aa_freq[:, np.newaxis] * aa_freq[np.newaxis, :]) @@ -820,7 +828,7 @@ def plot_roc(roc_score): plt.plot([0, 1], [0, 1], '--') -def plot_singleresidue_decoy_energy(decoy_energy, native_energy, method='clustermap'): +def plot_singleresidue_decoy_energy(decoy_energy, native_energy, method='clustermap', AA = _AA): """ Plot comparison of single residue decoy energies, relative to the native energy @@ -841,7 +849,7 @@ def plot_singleresidue_decoy_energy(decoy_energy, native_energy, method='cluster g = f(decoy_energy, cmap='RdBu_r', vmin=native_energy - decoy_energy.std() * 3, vmax=native_energy + decoy_energy.std() * 3) - AA_dict = {str(i): _AA[i] for i in range(len(_AA))} + AA_dict = {str(i): AA[i] for i in range(len(AA))} new_ticklabels = [] if method == 'clustermap': ax_heatmap = g.ax_heatmap diff --git a/tests/test_awsem_frustratometer.py b/tests/test_awsem_frustratometer.py index b1ff257e..538ea635 100644 --- a/tests/test_awsem_frustratometer.py +++ b/tests/test_awsem_frustratometer.py @@ -172,7 +172,7 @@ def test_fields_couplings_AWSEM_energy(): assert model.fields_energy() + model.couplings_energy() - 
model.native_energy() < 1E-6 def test_single_residue_AWSEM_energy(): - _AA = '-ACDEFGHIKLMNPQRSTVWY' + _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' #Import Lammps AWSEM Frustratometer single residue frustration values lammps_single_frustration_dataframe=pd.read_csv(test_data_path/f"6U5E_A_tertiary_frustration_singleresidue_1E8decoys_AWSEM_Frustratometer_LAMMPS_Carlos.dat",header=0,sep="\s+") ### @@ -197,7 +197,7 @@ def test_single_residue_AWSEM_energy(): assert (abs(np.array(lammps_single_frustration_dataframe["native_energy"])-test_residue_total_energy) < 1E-1).all() def test_contact_pair_AWSEM_energy(): - _AA = '-ACDEFGHIKLMNPQRSTVWY' + _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' #Import Lammps AWSEM Frustratometer mutational frustration values lammps_mutational_frustration_dataframe=pd.read_csv(test_data_path/f"6U5E_A_tertiary_frustration_mutational_1E6decoys_AWSEM_Frustratometer_LAMMPS_Carlos.dat",header=0,sep="\s+") lammps_mutational_frustration_dataframe["i"]=lammps_mutational_frustration_dataframe["i"]-1 @@ -295,7 +295,7 @@ def test_selected_subsequence_AWSEM_contact_energy_without_protein_context(): assert np.round(selected_region_contact, 2) == -148.92 def test_single_residue_decoy_AWSEM_energy_statistics(): - _AA = '-ACDEFGHIKLMNPQRSTVWY' + _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' #Import Lammps AWSEM Frustratometer single residue frustration values lammps_single_frustration_dataframe=pd.read_csv(test_data_path/f"6U5E_A_tertiary_frustration_singleresidue_1E8decoys_AWSEM_Frustratometer_LAMMPS_Carlos.dat",header=0,sep="\s+") ### @@ -324,7 +324,7 @@ def test_single_residue_decoy_AWSEM_energy_statistics(): assert (abs(np.array(lammps_single_frustration_dataframe["std(decoy_energies)"])-(expected_std_decoy_energy)) < 1.2E-1).all() def test_contact_pair_decoy_AWSEM_energy_statistics(): - _AA = '-ACDEFGHIKLMNPQRSTVWY' + _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' #Import Lammps AWSEM Frustratometer mutational frustration 
values lammps_mutational_frustration_dataframe=pd.read_csv(test_data_path/f"6U5E_A_tertiary_frustration_mutational_1E6decoys_AWSEM_Frustratometer_LAMMPS_Carlos.dat",header=0,sep="\s+") lammps_mutational_frustration_dataframe["i"]=lammps_mutational_frustration_dataframe["i"]-1 @@ -372,7 +372,7 @@ def structure(): @pytest.mark.parametrize("distance_cutoff_contact", [None, 10]) def test_expose_indicators(structure, k_electrostatics, min_sequence_separation_contact, distance_cutoff_contact): """ Check that the AWSEM indicators exposed can reproduce the native energy, where E_native = -sum_{i} h_i - sum_{i,j} J_ij = sum_{i} gamma_i * I_i """ - _AA = '-ACDEFGHIKLMNPQRSTVWY' + _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' model=frustratometer.AWSEM(structure,k_electrostatics=k_electrostatics, min_sequence_separation_contact = min_sequence_separation_contact, distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True) model_seq_index=np.array([_AA.find(aa) for aa in model.sequence]) indicators1D=np.array(model.indicators[0:3]) From ce5c6c70ba352b324429f4b41ed54bd677af9031 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 6 Dec 2025 16:32:06 -0600 Subject: [PATCH 53/76] now passing energy tests --- frustratometer/classes/AWSEM.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index c6805290..5563b5ea 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -238,7 +238,7 @@ def calculate_energy_and_potts(self): h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) # compute burial and contact energies - self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * self.burial_indicator[:, np.newaxis, :] + self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * self.burial_indicator[:, np.newaxis, :] direct = self.direct_indicator * 
self.direct_gamma[J_index[2], J_index[3]] water_mediated = self.water_indicator * self.water_gamma[J_index[2], J_index[3]] protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] @@ -403,7 +403,7 @@ def calculate_indicators(self): self.indicators.append(protein_indicator[:,:,0,0]*self.sequence_mask_contact) self.indicators.append(water_indicator[:,:,0,0]*self.sequence_mask_contact) self.gamma_array=[] - temp_burial_gamma=self.burial_gamma[:]#self.aa_map_awsem_list] + temp_burial_gamma=self.burial_gamma[:].copy()#self.aa_map_awsem_list] #temp_burial_gamma[0]=0 temp_burial_gamma *= -0.5 * self.p.k_contact self.gamma_array.append(temp_burial_gamma[:,0]) From 87d1f8089ef5be33e9242c9fe056bdc0778ead39 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 6 Dec 2025 18:13:37 -0600 Subject: [PATCH 54/76] passing awsem energy and frustration tests; need to check expose indicators and dca --- frustratometer/classes/AWSEM.py | 6 +- frustratometer/frustration/frustration.py | 68 +++++++++++++---------- tests/test_awsem_frustratometer.py | 11 ++-- 3 files changed, 50 insertions(+), 35 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 5563b5ea..b1a27847 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -153,8 +153,8 @@ def __init__(self, # set other attributes self.burial_in_context = self.p.burial_in_context - self.aa_freq = frustration.compute_aa_freq(self.sequence) - self.contact_freq = frustration.compute_contact_freq(self.sequence) + self.aa_freq = frustration.compute_aa_freq(self.sequence, AA=self.gamma.alphabet) + self.contact_freq = frustration.compute_contact_freq(self.sequence, AA=self.gamma.alphabet) charges2 = self.p.charges[:,np.newaxis] * self.p.charges[np.newaxis,:] if self.p.k_electrostatics != 0: self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) @@ -170,6 +170,8 @@ def __init__(self, # 
it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix self.charges2 = charges2 + self._decoy_fluctuation = {} # used for mutational calculation, possibly others + self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ def subclass_setup_helper(self): """ diff --git a/frustratometer/frustration/frustration.py b/frustratometer/frustration/frustration.py index 240a2368..045fac3d 100644 --- a/frustratometer/frustration/frustration.py +++ b/frustratometer/frustration/frustration.py @@ -330,14 +330,14 @@ def compute_singleresidue_decoy_energy_fluctuation(seq: str, seq : str The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model. potts_model : dict - A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy. + A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, q), where L is the length of the sequence and q is the number of amino acids. The couplings are a 4D array of shape (L, L, q, q). The fields and couplings are assumed to be in units of energy. mask : np.array A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence. Returns ------- decoy_energy: np.array - (Lx21) matrix describing the energetic changes upon mutating a single residue. + (Lxq) matrix describing the energetic changes upon mutating a single residue. 
Examples -------- @@ -359,16 +359,17 @@ def compute_singleresidue_decoy_energy_fluctuation(seq: str, .. todo:: Optimize the computation. """ + q = len(AA) seq_index = np.array([AA.index(aa) for aa in seq]) seq_len = len(seq_index) # Create decoys - pos1, aa1 = np.meshgrid(np.arange(seq_len), np.arange(21), indexing='ij', sparse=True) + pos1, aa1 = np.meshgrid(np.arange(seq_len), np.arange(q), indexing='ij', sparse=True) - decoy_energy = np.zeros([seq_len, 21]) + decoy_energy = np.zeros([seq_len, q]) decoy_energy -= (potts_model['h'][pos1, aa1] - potts_model['h'][pos1, seq_index[pos1]]) # h correction aa1 - j_correction = np.zeros([seq_len, seq_len, 21]) + j_correction = np.zeros([seq_len, seq_len, q]) # J correction interactions with other aminoacids reduced_j = potts_model['J'][range(seq_len), :, seq_index, :].astype(np.float32) j_correction += reduced_j[:, pos1, seq_index[pos1]] * mask[:, pos1] @@ -385,7 +386,8 @@ def compute_mutational_decoy_energy_fluctuation(seq: str, mask: np.array, AA : str = _AA) -> np.array: """ - Computes a (LxLx21x21) matrix for a sequence of length L. Matrix[i,j] describes all possible changes in energy upon mutating residue i and j simultaneously. + Computes a (LxLxqxq) matrix for a sequence of length L and AA of length q. + Matrix[i,j] describes all possible changes in energy upon mutating residue i and j simultaneously. .. math:: \Delta H_{ij} = H_i - H_{i'} + H_{j}-H_{j'} + J_{ij} -J_{ij'} + J_{i'j'} - J_{i'j} + \\sum_k {J_{ik} - J_{i'k} + J_{jk} -J_{j'k}} @@ -395,14 +397,14 @@ def compute_mutational_decoy_energy_fluctuation(seq: str, seq : str The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model. potts_model : dict - A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). 
The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy. + A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, q), where L is the length of the sequence and q is the number of amino acids. The couplings are a 4D array of shape (L, L, q, q). The fields and couplings are assumed to be in units of energy. mask : np.array A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence. Returns ------- decoy_energy2: np.array - (LxLx21x21) matrix describing the energetic changes upon mutating two residues simultaneously. + (LxLxqxq) matrix describing the energetic changes upon mutating two residues simultaneously. Examples -------- @@ -424,23 +426,23 @@ def compute_mutational_decoy_energy_fluctuation(seq: str, .. todo:: Optimize the computation. 
""" + q = len(AA) seq_index = np.array([AA.index(aa) for aa in seq]) seq_len = len(seq_index) - # Create masked decoys - pos1,pos2=np.where(mask>0) + # get indices and amino acid types for just the unmasked contacts + pos1,pos2=np.where(mask>0) contacts_len=len(pos1) - - pos1,aa1,aa2=np.meshgrid(pos1, np.arange(21), np.arange(21), indexing='ij', sparse=True) - pos2,aa1,aa2=np.meshgrid(pos2, np.arange(21), np.arange(21), indexing='ij', sparse=True) + pos1,aa1,aa2=np.meshgrid(pos1, np.arange(q), np.arange(q), indexing='ij', sparse=True) + pos2,aa1,aa2=np.meshgrid(pos2, np.arange(q), np.arange(q), indexing='ij', sparse=True) #Compute fields - decoy_energy = np.zeros([contacts_len, 21, 21]) + decoy_energy = np.zeros([contacts_len, q, q]) decoy_energy -= (potts_model['h'][pos1, aa1] - potts_model['h'][pos1, seq_index[pos1]]) # h correction aa1 decoy_energy -= (potts_model['h'][pos2, aa2] - potts_model['h'][pos2, seq_index[pos2]]) # h correction aa2 #Compute couplings - j_correction = np.zeros([contacts_len, 21, 21]) + j_correction = np.zeros([contacts_len, q, q]) for pos, aa in enumerate(seq_index): # J correction interactions with other aminoacids reduced_j = potts_model['J'][pos, :, aa, :].astype(np.float32) @@ -455,7 +457,7 @@ def compute_mutational_decoy_energy_fluctuation(seq: str, j_correction -= potts_model['J'][pos1, pos2, aa1, aa2] * mask[pos1, pos2] # Correct combination decoy_energy += j_correction - decoy_energy2=np.zeros([seq_len,seq_len,21,21]) + decoy_energy2=np.zeros([seq_len,seq_len,q,q]) decoy_energy2[mask]=decoy_energy return decoy_energy2 @@ -635,10 +637,11 @@ def compute_aa_freq(seq, include_gaps=True, AA = _AA): Returns ------- aa_freq: np.array - Array of frequencies of all 21 possible amino acids within sequence + Array of frequencies of all q possible amino acids within sequence """ + q = len(AA) seq_index = np.array([AA.index(aa) for aa in seq]) - aa_freq = np.array([(seq_index == i).sum() for i in range(21)]) + aa_freq = np.array([(seq_index 
== i).sum() for i in range(q)]) if not include_gaps: aa_freq[0] = 0 return aa_freq @@ -656,10 +659,11 @@ def compute_contact_freq(seq, AA = _AA): Returns ------- contact_freq: np.array - 21x21 array of frequencies of all possible contacts within sequence. + qxq array of frequencies of all possible contacts within sequence. """ + q = len(AA) seq_index = np.array([AA.index(aa) for aa in seq]) - aa_freq = np.array([(seq_index == i).sum() for i in range(21)], dtype=np.float64) + aa_freq = np.array([(seq_index == i).sum() for i in range(q)], dtype=np.float64) aa_freq /= aa_freq.sum() contact_freq = (aa_freq[:, np.newaxis] * aa_freq[np.newaxis, :]) return contact_freq @@ -674,17 +678,18 @@ def compute_single_frustration(decoy_fluctuation, Parameters ---------- decoy_fluctuation: np.array - (Lx21) matrix for a sequence of length L, describing the energetic changes upon mutating a single residue. + (Lxq) matrix for a sequence of length L, describing the energetic changes upon mutating a single residue. aa_freq: np.array - Array of frequencies of all 21 possible amino acids within sequence + Array of frequencies of all q possible amino acids within sequence Returns ------- frustration: np.array Array of length L featuring single residue frustration indices. """ + q = decoy_fluctuation.shape[1] if aa_freq is None: - aa_freq = np.ones(21) + aa_freq = np.ones(q) mean_energy = (aa_freq * decoy_fluctuation).sum(axis=1) / aa_freq.sum() std_energy = np.sqrt( ((aa_freq * (decoy_fluctuation - mean_energy[:, np.newaxis]) ** 2) / aa_freq.sum()).sum(axis=1)) @@ -702,9 +707,9 @@ def compute_pair_frustration(decoy_fluctuation, Parameters ---------- decoy_fluctuation: np.array - (LxLx21x21) matrix for a sequence of length L, describing the energetic changes upon mutating two residues simultaneously. + (LxLxqxq) matrix for a sequence of length L, describing the energetic changes upon mutating two residues simultaneously. 
contact_freq: np.array - 21x21 array of frequencies of all possible contacts within sequence. + qxq array of frequencies of all possible contacts within sequence. Returns ------- @@ -712,12 +717,17 @@ def compute_pair_frustration(decoy_fluctuation, LxL array featuring pair frustration indices (mutational or configurational frustration, depending on decoy_fluctuation matrix provided) """ + q = decoy_fluctuation.shape[2] # also could have chosen decoy_fluctuation.shape[3] if contact_freq is None: - contact_freq = np.ones([21, 21]) + contact_freq = np.ones([q, q]) decoy_energy = decoy_fluctuation seq_len = decoy_fluctuation.shape[0] - average = np.average(decoy_energy.reshape(seq_len * seq_len, 21 * 21), weights=contact_freq.flatten(), axis=-1) - variance = np.average((decoy_energy.reshape(seq_len * seq_len, 21 * 21) - average[:, np.newaxis]) ** 2, + try: + average = np.average(decoy_energy.reshape(seq_len * seq_len, q * q), weights=contact_freq.flatten(), axis=-1) + except: + raise Exception(f'contact_freq.shape: {contact_freq.shape}, decoy_flucuation.shape: {decoy_fluctuation.shape}') + + variance = np.average((decoy_energy.reshape(seq_len * seq_len, q * q) - average[:, np.newaxis]) ** 2, weights=contact_freq.flatten(), axis=-1) mean_energy = average.reshape(seq_len, seq_len) std_energy = np.sqrt(variance).reshape(seq_len, seq_len) @@ -835,7 +845,7 @@ def plot_singleresidue_decoy_energy(decoy_energy, native_energy, method='cluster Parameters ---------- decoy_energy : np.array - Lx21 array of decoy energies + Lxq array of decoy energies native_energy : float Native energy value method : str diff --git a/tests/test_awsem_frustratometer.py b/tests/test_awsem_frustratometer.py index 538ea635..579ab049 100644 --- a/tests/test_awsem_frustratometer.py +++ b/tests/test_awsem_frustratometer.py @@ -227,12 +227,14 @@ def test_contact_pair_AWSEM_energy(): def test_selected_subsequence_AWSEM_contact_energy_matrix(): 
structure=frustratometer.Structure(test_data_path/f'4wnc.pdb',"A",seq_selection="resnum 3to26") model=frustratometer.AWSEM(structure) - assert model.potts_model['h'].shape==(24,21) + q = len(model.gamma.alphabet) + assert model.potts_model['h'].shape==(24,q) def test_selected_subsequence_AWSEM_burial_energy_matrix(): structure=frustratometer.Structure(test_data_path/f'4wnc.pdb',"A",seq_selection="resnum 150to315") model=frustratometer.AWSEM(structure) - assert model.potts_model['J'].shape==(166,166,21,21) + q = len(model.gamma.alphabet) + assert model.potts_model['J'].shape==(166,166,q,q) ##### #Test Protein Segment Native AWSEM Energy Calculation @@ -325,6 +327,7 @@ def test_single_residue_decoy_AWSEM_energy_statistics(): def test_contact_pair_decoy_AWSEM_energy_statistics(): _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' + q = len(_AA) #Import Lammps AWSEM Frustratometer mutational frustration values lammps_mutational_frustration_dataframe=pd.read_csv(test_data_path/f"6U5E_A_tertiary_frustration_mutational_1E6decoys_AWSEM_Frustratometer_LAMMPS_Carlos.dat",header=0,sep="\s+") lammps_mutational_frustration_dataframe["i"]=lammps_mutational_frustration_dataframe["i"]-1 @@ -351,10 +354,10 @@ def test_contact_pair_decoy_AWSEM_energy_statistics(): calculated_mutational_frustration_dataframe["j"]=j.ravel() ### decoy_fluctuations=(model.decoy_fluctuation(kind='mutational'))/4.184 - weighted_decoy_fluctations=np.average(decoy_fluctuations.reshape(seq_len * seq_len, 21 * 21), weights=model.contact_freq.flatten(), axis=-1) + weighted_decoy_fluctations=np.average(decoy_fluctuations.reshape(seq_len * seq_len, q * q), weights=model.contact_freq.flatten(), axis=-1) calculated_mutational_frustration_dataframe["Weighted_Decoy_Fluctuations"]=weighted_decoy_fluctations.ravel() 
calculated_mutational_frustration_dataframe["Test_Mean_Decoy_Energy"]=calculated_mutational_frustration_dataframe["Test_Native_Energy"]+calculated_mutational_frustration_dataframe["Weighted_Decoy_Fluctuations"] - calculated_mutational_frustration_dataframe["STD_Decoy_Energy"]=np.average((decoy_fluctuations.reshape(seq_len * seq_len, 21 * 21)-calculated_mutational_frustration_dataframe["Weighted_Decoy_Fluctuations"].astype(float).values[:,np.newaxis]) ** 2,weights=model.contact_freq.flatten(), axis=-1) + calculated_mutational_frustration_dataframe["STD_Decoy_Energy"]=np.average((decoy_fluctuations.reshape(seq_len * seq_len, q * q)-calculated_mutational_frustration_dataframe["Weighted_Decoy_Fluctuations"].astype(float).values[:,np.newaxis]) ** 2,weights=model.contact_freq.flatten(), axis=-1) calculated_mutational_frustration_dataframe["STD_Decoy_Energy"]=np.sqrt(calculated_mutational_frustration_dataframe["STD_Decoy_Energy"]) merged_dataframe=calculated_mutational_frustration_dataframe.merge(lammps_mutational_frustration_dataframe,on=["i","j"]) From 214f02b5bcdc6b4d809f1b4dc3fad17a9498f6bb Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 6 Dec 2025 20:34:45 -0600 Subject: [PATCH 55/76] passing awsem energy and frustration tests; need to check expose indicators and dca --- frustratometer/classes/AWSEM.py | 46 ++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index b1a27847..064a47ba 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -173,6 +173,27 @@ def __init__(self, self._decoy_fluctuation = {} # used for mutational calculation, possibly others self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ + # carlos wanted to have gamma_array with gammas multiplied by lambda and coefficients + @property + def coefficient_lambda_gamma_array(self): + 
_coefficient_lambda_gamma_array = [] + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.burial_gamma[:,0]) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.burial_gamma[:,1]) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.burial_gamma[:,2]) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.direct_gamma) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.protein_gamma) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.water_gamma) + _coefficient_lambda_gamma_array.append(0.5 * self.p.k_electrostatics * self.charges2) + # not a typo, supposed to be positive ^^^ + # charges2 is our electrostatic "gamma" + return _coefficient_lambda_gamma_array + @coefficient_lambda_gamma_array.setter + def coefficient_lambda_gamma_array(self): + raise AttributeError("""Setting AWSEM.coefficient_lambda_gamma_array + directly is not allowed. Modify AWSEM.k_contact, + AWSEM.burial_gamma, AWSEM.direct_gamma, + AWSEM.protein_gamma, or AWSEM.water_gamma instead.""") + def subclass_setup_helper(self): """ This method calls methods to calculate native indicator functions, @@ -404,32 +425,15 @@ def calculate_indicators(self): self.indicators.append(direct_indicator[:,:,0,0]*self.sequence_mask_contact) self.indicators.append(protein_indicator[:,:,0,0]*self.sequence_mask_contact) self.indicators.append(water_indicator[:,:,0,0]*self.sequence_mask_contact) - self.gamma_array=[] - temp_burial_gamma=self.burial_gamma[:].copy()#self.aa_map_awsem_list] - #temp_burial_gamma[0]=0 - temp_burial_gamma *= -0.5 * self.p.k_contact - self.gamma_array.append(temp_burial_gamma[:,0]) - self.gamma_array.append(temp_burial_gamma[:,1]) - self.gamma_array.append(temp_burial_gamma[:,2]) - for contact_gamma in [self.direct_gamma, self.protein_gamma, self.water_gamma]: - temp_gamma = contact_gamma[:,:].copy()#self.aa_map_awsem_x, self.aa_map_awsem_y].copy() - #temp_gamma[0, :] = 0 - 
#temp_gamma[:, 0] = 0 - temp_gamma *= -0.5 * self.k_contact - self.gamma_array.append(temp_gamma) - self.burial_indicator = burial_indicator # probably could get rid of either this or indicators list - self.direct_indicator = direct_indicator # probably could get rid of either this or indicators list - self.water_indicator = water_indicator # probably could get rid of either this or indicators list - self.protein_indicator = protein_indicator # probably could get rid of either this or indicators list + self.burial_indicator = burial_indicator + self.direct_indicator = direct_indicator + self.water_indicator = water_indicator + self.protein_indicator = protein_indicator #breakpoint() if self.p.k_electrostatics != 0: electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask self.indicators.append(electrostatics_indicator) self.electrostatics_indicator = electrostatics_indicator # probably could get rid of either this or indicators list - temp_gamma = 0.5 * self.p.k_electrostatics * self.charges2[:,:]#self.aa_map_awsem_x, self.aa_map_awsem_y] - #temp_gamma[0,:]=0 - #temp_gamma[:,0]=0 - self.gamma_array.append(temp_gamma) def calculate_energy_and_potts(self): super().calculate_energy_and_potts() From d7d3fe48bd720be9f74a678b3de7628eb6d4a542 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 6 Dec 2025 20:36:46 -0600 Subject: [PATCH 56/76] always append electrostatic indicators, even if lambda electrostatics if 0 --- frustratometer/classes/AWSEM.py | 4 ++-- tests/test_awsem_frustratometer.py | 12 ++++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 064a47ba..2aa55162 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -430,10 +430,10 @@ def calculate_indicators(self): self.water_indicator = water_indicator self.protein_indicator = 
protein_indicator #breakpoint() - if self.p.k_electrostatics != 0: + if True:#self.p.k_electrostatics != 0: electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask self.indicators.append(electrostatics_indicator) - self.electrostatics_indicator = electrostatics_indicator # probably could get rid of either this or indicators list + self.electrostatics_indicator = electrostatics_indicator def calculate_energy_and_potts(self): super().calculate_energy_and_potts() diff --git a/tests/test_awsem_frustratometer.py b/tests/test_awsem_frustratometer.py index 579ab049..cadf88ae 100644 --- a/tests/test_awsem_frustratometer.py +++ b/tests/test_awsem_frustratometer.py @@ -376,17 +376,21 @@ def structure(): def test_expose_indicators(structure, k_electrostatics, min_sequence_separation_contact, distance_cutoff_contact): """ Check that the AWSEM indicators exposed can reproduce the native energy, where E_native = -sum_{i} h_i - sum_{i,j} J_ij = sum_{i} gamma_i * I_i """ _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' + q = len(_AA) model=frustratometer.AWSEM(structure,k_electrostatics=k_electrostatics, min_sequence_separation_contact = min_sequence_separation_contact, distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True) model_seq_index=np.array([_AA.find(aa) for aa in model.sequence]) indicators1D=np.array(model.indicators[0:3]) indicators2D=np.array(model.indicators[3:]) - true_indicator1D=np.array([indicators1D[:,model_seq_index==i].sum(axis=1) for i in range(21)]).T - true_indicator2D=np.array([indicators2D[:,model_seq_index==i][:,:, model_seq_index==j].sum(axis=(1,2)) for i in range(21) for j in range(21)]).reshape(21,21,-1).T - burial_gamma=np.concatenate(model.gamma_array[:3]) + true_indicator1D=np.array([indicators1D[:,model_seq_index==i].sum(axis=1) for i in range(q)]).T + 
true_indicator2D=np.array([indicators2D[:,model_seq_index==i][:,:, model_seq_index==j].sum(axis=(1,2)) for i in range(q) for j in range(q)]).reshape(q,q,-1).T + burial_gamma=np.concatenate(model.coefficient_lambda_gamma_array[:3]) burial_energy_predicted = (burial_gamma * np.concatenate(true_indicator1D)).sum() burial_energy_expected = -model.potts_model['h'][range(len(model_seq_index)), model_seq_index].sum() assert np.isclose(burial_energy_predicted,burial_energy_expected), f"Expected energy {burial_energy_expected} but got {burial_energy_predicted}" - contact_gamma=np.concatenate([a.ravel() for a in model.gamma_array[3:]]) + contact_gamma=np.concatenate([a.ravel() for a in model.coefficient_lambda_gamma_array[3:]]) + #assert indicators2D.shape == "foo", indicators2D.shape + #assert true_indicators2D.shape == "foo", true_indicators2D.shape + #assert contact_gamma.shape == "foo", contact_gamma.shape contact_energy_predicted = (contact_gamma * np.concatenate([a.ravel() for a in true_indicator2D])).sum() contact_energy_expected = model.couplings_energy() assert np.isclose(contact_energy_predicted,contact_energy_expected), f"Expected energy {contact_energy_expected} but got {contact_energy_predicted}" From 42d7db15efe741b040babe3cef8479d93a5c12ac Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sat, 6 Dec 2025 21:59:15 -0600 Subject: [PATCH 57/76] finished updating alphabet handling so that it's an object attribute or function argument, not a module-level variable --- frustratometer/classes/AWSEM.py | 11 ++++- frustratometer/classes/DCA.py | 10 +++-- frustratometer/classes/Frustratometer.py | 22 +++++----- frustratometer/frustration/frustration.py | 50 ++++++++++++----------- tests/test_dca_frustratometer.py | 10 ++--- 5 files changed, 58 insertions(+), 45 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 2aa55162..4eed69bf 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -153,8 
+153,8 @@ def __init__(self, # set other attributes self.burial_in_context = self.p.burial_in_context - self.aa_freq = frustration.compute_aa_freq(self.sequence, AA=self.gamma.alphabet) - self.contact_freq = frustration.compute_contact_freq(self.sequence, AA=self.gamma.alphabet) + self.aa_freq = frustration.compute_aa_freq(self.sequence, self.gamma.alphabet) + self.contact_freq = frustration.compute_contact_freq(self.sequence, self.gamma.alphabet) charges2 = self.p.charges[:,np.newaxis] * self.p.charges[np.newaxis,:] if self.p.k_electrostatics != 0: self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) @@ -173,6 +173,13 @@ def __init__(self, self._decoy_fluctuation = {} # used for mutational calculation, possibly others self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ + @property + def alphabet(self): + return self.gamma.alphabet # this allows us to access the alphabet in the same way as for DCA instances + @alphabet.setter + def alphabet(self): + raise AttributeError("Changing the underlying alphabet is prohibited. 
Instead, create a new AWSEM instance from a different Gamma.") + # carlos wanted to have gamma_array with gammas multiplied by lambda and coefficients @property def coefficient_lambda_gamma_array(self): diff --git a/frustratometer/classes/DCA.py b/frustratometer/classes/DCA.py index 7c9623c5..c385ad83 100644 --- a/frustratometer/classes/DCA.py +++ b/frustratometer/classes/DCA.py @@ -78,6 +78,8 @@ class DCA(Frustratometer): # self._decoy_fluctuation = {} # return self + alphabet = '-ACDEFGHIKLMNPQRSTVWY' + @classmethod def from_potts_model_file(cls,pdb_structure: object, potts_model_file: Union[Path,str] = None, @@ -146,8 +148,8 @@ def from_potts_model_file(cls,pdb_structure: object, self.potts_model["J"]= self.potts_model["familycouplings"].reshape(int(len(self.filtered_aligned_sequence)),21,int(len(self.filtered_aligned_sequence)),21).transpose(0,2,1,3) if self.filtered_aligned_sequence is not None: - self.aa_freq = frustration.compute_aa_freq(self.sequence) - self.contact_freq = frustration.compute_contact_freq(self.sequence) + self.aa_freq = frustration.compute_aa_freq(self.sequence, self.alphabet) + self.contact_freq = frustration.compute_contact_freq(self.sequence, self.alphabet) else: self.aa_freq = None self.contact_freq = None @@ -222,8 +224,8 @@ def from_pottsmodel(cls,pdb_structure : object, self.potts_model["J"]= self.potts_model["familycouplings"].reshape(int(len(self.filtered_aligned_sequence)),21,int(len(self.filtered_aligned_sequence)),21).transpose(0,2,1,3) if self.filtered_aligned_sequence is not None: - self.aa_freq = frustration.compute_aa_freq(self.sequence) - self.contact_freq = frustration.compute_contact_freq(self.sequence) + self.aa_freq = frustration.compute_aa_freq(self.sequence, self.alphabet) + self.contact_freq = frustration.compute_contact_freq(self.sequence, self.alphabet) else: self.aa_freq = None self.contact_freq = None diff --git a/frustratometer/classes/Frustratometer.py b/frustratometer/classes/Frustratometer.py index 
929f6a38..d649e429 100644 --- a/frustratometer/classes/Frustratometer.py +++ b/frustratometer/classes/Frustratometer.py @@ -61,9 +61,11 @@ def native_energy(self,sequence:str = None,ignore_couplings_of_gaps:bool=False,i if sequence is None: sequence=self.sequence else: - return frustration.compute_native_energy(sequence, self.potts_model, self.mask,ignore_couplings_of_gaps,ignore_fields_of_gaps,self.gamma.alphabet) + return frustration.compute_native_energy(sequence, self.potts_model, self.mask, self.alphabet, + ignore_couplings_of_gaps, ignore_fields_of_gaps) if not self._native_energy: - self._native_energy=frustration.compute_native_energy(sequence, self.potts_model, self.mask,ignore_couplings_of_gaps,ignore_fields_of_gaps,self.gamma.alphabet) + self._native_energy=frustration.compute_native_energy(sequence, self.potts_model, self.mask, self.alphabet, + ignore_couplings_of_gaps, ignore_fields_of_gaps) energy_value=self._native_energy return energy_value @@ -89,7 +91,7 @@ def sequences_energies(self, sequences:np.array, split_couplings_and_fields:bool output (if split_couplings_and_fields==True): np.array Array containing computed fields and couplings energies of the protein sequences. 
""" - output=frustration.compute_sequences_energy(sequences, self.potts_model, self.mask, split_couplings_and_fields,self.gamma.alphabet) + output=frustration.compute_sequences_energy(sequences, self.potts_model, self.mask, self.alphabet, split_couplings_and_fields) return output def fields_energy(self, sequence:str = None, ignore_fields_of_gaps:bool = False) -> float: @@ -114,7 +116,7 @@ def fields_energy(self, sequence:str = None, ignore_fields_of_gaps:bool = False) """ if sequence is None: sequence=self.sequence - fields_energy=frustration.compute_fields_energy(sequence, self.potts_model,ignore_fields_of_gaps,self.gamma.alphabet) + fields_energy=frustration.compute_fields_energy(sequence, self.potts_model, self.alphabet, ignore_fields_of_gaps) return fields_energy def couplings_energy(self, sequence:str = None,ignore_couplings_of_gaps:bool = False) -> float: @@ -139,7 +141,7 @@ def couplings_energy(self, sequence:str = None,ignore_couplings_of_gaps:bool = F """ if sequence is None: sequence=self.sequence - couplings_energy=frustration.compute_couplings_energy(sequence, self.potts_model, self.mask,ignore_couplings_of_gaps,self.gamma.alphabet) + couplings_energy=frustration.compute_couplings_energy(sequence, self.potts_model, self.mask, self.alphabet, ignore_couplings_of_gaps) return couplings_energy def decoy_fluctuation(self, sequence:str = None,kind:str = 'singleresidue',mask:np.array = None) -> np.array: @@ -167,13 +169,13 @@ def decoy_fluctuation(self, sequence:str = None,kind:str = 'singleresidue',mask: if not isinstance(mask, np.ndarray): mask=self.mask if kind == 'singleresidue': - fluctuation = frustration.compute_singleresidue_decoy_energy_fluctuation(sequence, self.potts_model, mask,self.gamma.alphabet) + fluctuation = frustration.compute_singleresidue_decoy_energy_fluctuation(sequence, self.potts_model, mask, self.alphabet) elif kind == 'mutational': - fluctuation = frustration.compute_mutational_decoy_energy_fluctuation(sequence, self.potts_model, 
mask,self.gamma.alphabet) + fluctuation = frustration.compute_mutational_decoy_energy_fluctuation(sequence, self.potts_model, mask, self.alphabet) elif kind == 'configurational': - fluctuation = frustration.compute_configurational_decoy_energy_fluctuation(sequence, self.potts_model, mask,self.gamma.alphabet) + fluctuation = frustration.compute_configurational_decoy_energy_fluctuation(sequence, self.potts_model, mask, self.alphabet) elif kind == 'contact': - fluctuation = frustration.compute_contact_decoy_energy_fluctuation(sequence, self.potts_model, mask,self.gamma.alphabet) + fluctuation = frustration.compute_contact_decoy_energy_fluctuation(sequence, self.potts_model, mask, self.alphabet) else: raise Exception("Wrong kind of decoy generation selected") self._decoy_fluctuation[kind] = fluctuation @@ -268,7 +270,7 @@ def plot_decoy_energy(self, sequence:str = None, kind:str = 'singleresidue', met native_energy = self.native_energy(sequence=sequence) decoy_energy = self.decoy_energy(kind=kind,sequence=sequence) if kind == 'singleresidue': - g = frustration.plot_singleresidue_decoy_energy(decoy_energy, native_energy, method,self.gamma.alphabet) + g = frustration.plot_singleresidue_decoy_energy(decoy_energy, native_energy, method, self.alphabet) return g def roc(self): diff --git a/frustratometer/frustration/frustration.py b/frustratometer/frustration/frustration.py index 045fac3d..61adb1b7 100644 --- a/frustratometer/frustration/frustration.py +++ b/frustratometer/frustration/frustration.py @@ -4,8 +4,6 @@ from typing import Union from pathlib import Path -_AA = '-ACDEFGHIKLMNPQRSTVWY' - def compute_mask(distance_matrix: np.array, maximum_contact_distance: Union[float, None] = None, minimum_sequence_separation: Union[int, None] = None) -> np.array: @@ -58,9 +56,9 @@ def compute_mask(distance_matrix: np.array, def compute_native_energy(seq: str, potts_model: dict, mask: np.array, + AA : str, ignore_gap_couplings: bool = False, - ignore_gap_fields: bool = False, - AA 
: str = _AA) -> float: + ignore_gap_fields: bool = False) -> float: """ Computes the native energy of a protein sequence based on a given Potts model and an interaction mask. @@ -134,8 +132,8 @@ def compute_native_energy(seq: str, def compute_fields_energy(seq: str, potts_model: dict, - ignore_fields_of_gaps: bool = False, - AA : str = _AA) -> float: + AA : str, + ignore_fields_of_gaps: bool = False) -> float: """ Computes the fields energy of a protein sequence based on a given Potts model. @@ -182,8 +180,8 @@ def compute_fields_energy(seq: str, def compute_couplings_energy(seq: str, potts_model: dict, mask: np.array, - ignore_couplings_of_gaps: bool = False, - AA : str = _AA) -> float: + AA : str, + ignore_couplings_of_gaps: bool = False) -> float: """ Computes the couplings energy of a protein sequence based on a given Potts model and an interaction mask. @@ -244,8 +242,8 @@ def compute_couplings_energy(seq: str, def compute_sequences_energy(seqs: list, potts_model: dict, mask: np.array, - split_couplings_and_fields = False, - AA : str = _AA) -> np.array: + AA : str, + split_couplings_and_fields = False) -> np.array: """ Computes the energy of multiple protein sequences based on a given Potts model and an interaction mask. @@ -317,7 +315,7 @@ def compute_sequences_energy(seqs: list, def compute_singleresidue_decoy_energy_fluctuation(seq: str, potts_model: dict, mask: np.array, - AA : str = _AA) -> np.array: + AA : str) -> np.array: """ Computes a (Lx21) matrix for a sequence of length L. Row i contains all possible changes in energy upon mutating residue i. @@ -384,7 +382,7 @@ def compute_singleresidue_decoy_energy_fluctuation(seq: str, def compute_mutational_decoy_energy_fluctuation(seq: str, potts_model: dict, mask: np.array, - AA : str = _AA) -> np.array: + AA : str) -> np.array: """ Computes a (LxLxqxq) matrix for a sequence of length L and AA of length q. Matrix[i,j] describes all possible changes in energy upon mutating residue i and j simultaneously. 
@@ -465,7 +463,7 @@ def compute_mutational_decoy_energy_fluctuation(seq: str, def compute_configurational_decoy_energy_fluctuation(seq: str, potts_model: dict, mask: np.array, - AA : str = _AA) -> np.array: + AA : str) -> np.array: """ Computes a (LxLx21x21) matrix for a sequence of length L. Matrix[i,j] describes all possible changes in energy upon mutating and altering the local densities of residue i and j simultaneously. @@ -546,7 +544,7 @@ def compute_configurational_decoy_energy_fluctuation(seq: str, def compute_contact_decoy_energy_fluctuation(seq: str, potts_model: dict, mask: np.array, - AA : str = _AA) -> np.array: + AA : str) -> np.array: r""" $$ \Delta DCA_{ij} = \Delta j_{ij} $$ :param seq: @@ -569,7 +567,7 @@ def compute_contact_decoy_energy_fluctuation(seq: str, return decoy_energy -def compute_decoy_energy(seq: str, potts_model: dict, mask: np.array, kind='singleresidue') -> np.array: +def compute_decoy_energy(seq: str, potts_model: dict, mask: np.array, AA : str, kind='singleresidue') -> np.array: """ Computes all possible decoy energies. @@ -610,18 +608,18 @@ def compute_decoy_energy(seq: str, potts_model: dict, mask: np.array, kind='sing .. todo:: Optimize the computation. 
""" - native_energy = compute_native_energy(seq, potts_model, mask) + native_energy = compute_native_energy(seq, potts_model, mask, AA) if kind == 'singleresidue': - decoy_energy=native_energy + compute_singleresidue_decoy_energy_fluctuation(seq, potts_model, mask) + decoy_energy=native_energy + compute_singleresidue_decoy_energy_fluctuation(seq, potts_model, mask, AA) elif kind == 'mutational': - decoy_energy=native_energy + compute_mutational_decoy_energy_fluctuation(seq, potts_model, mask) + decoy_energy=native_energy + compute_mutational_decoy_energy_fluctuation(seq, potts_model, mask, AA) elif kind == 'configurational': - decoy_energy=native_energy + compute_configurational_decoy_energy_fluctuation(seq, potts_model, mask) + decoy_energy=native_energy + compute_configurational_decoy_energy_fluctuation(seq, potts_model, mask, AA) elif kind == 'contact': - decoy_energy=native_energy + compute_contact_decoy_energy_fluctuation(seq, potts_model, mask) + decoy_energy=native_energy + compute_contact_decoy_energy_fluctuation(seq, potts_model, mask, AA) return decoy_energy -def compute_aa_freq(seq, include_gaps=True, AA = _AA): +def compute_aa_freq(seq, AA, include_gaps=True,): """ Calculates amino acid frequencies in given sequence @@ -629,6 +627,8 @@ def compute_aa_freq(seq, include_gaps=True, AA = _AA): ---------- seq : str The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. + AA : str + The alphabet of allowed residues include_gaps: bool If True, frequencies of gaps ('-') in the sequence are set to 0. Default is True. @@ -647,7 +647,7 @@ def compute_aa_freq(seq, include_gaps=True, AA = _AA): return aa_freq -def compute_contact_freq(seq, AA = _AA): +def compute_contact_freq(seq, AA): """ Calculates contact frequencies in given sequence @@ -655,7 +655,9 @@ def compute_contact_freq(seq, AA = _AA): ---------- seq : str The amino acid sequence of the protein. 
The sequence is assumed to be in one-letter code. Gaps are represented as '-'. - + AA : str + The alphabet of allowed residues + Returns ------- contact_freq: np.array @@ -838,7 +840,7 @@ def plot_roc(roc_score): plt.plot([0, 1], [0, 1], '--') -def plot_singleresidue_decoy_energy(decoy_energy, native_energy, method='clustermap', AA = _AA): +def plot_singleresidue_decoy_energy(decoy_energy, native_energy, AA, method='clustermap'): """ Plot comparison of single residue decoy energies, relative to the native energy diff --git a/tests/test_dca_frustratometer.py b/tests/test_dca_frustratometer.py index e02df7bc..c7a97b25 100644 --- a/tests/test_dca_frustratometer.py +++ b/tests/test_dca_frustratometer.py @@ -225,7 +225,7 @@ def test_functional_compute_DCA_native_energy(): distance_matrix = frustratometer.pdb.get_distance_matrix(pdb_path, chain_id, method='minimum') potts_model = frustratometer.dca.matlab.load_potts_model(potts_model_path) mask = frustratometer.frustration.compute_mask(distance_matrix, maximum_contact_distance=4, minimum_sequence_separation=0) - energy = frustratometer.frustration.compute_native_energy(sequence, potts_model, mask) + energy = frustratometer.frustration.compute_native_energy(sequence, potts_model, mask, '-ACDEFGHIKLMNPQRSTVWY') assert np.round(energy, 4) == expected_energy @@ -406,8 +406,8 @@ def test_compute_singleresidue_DCA_decoy_energy(): seq = [aa for aa in seq] seq[pos_x] = AA[aa_x] seq = ''.join(seq) - test_energy = frustratometer.frustration.compute_native_energy(seq, potts_model, mask) - decoy_energy = frustratometer.frustration.compute_decoy_energy(seq, potts_model, mask, 'singleresidue') + test_energy = frustratometer.frustration.compute_native_energy(seq, potts_model, mask, AA) + decoy_energy = frustratometer.frustration.compute_decoy_energy(seq, potts_model, mask, '-ACDEFGHIKLMNPQRSTVWY', 'singleresidue') assert (decoy_energy[pos_x, aa_x] - test_energy) ** 2 < 1E-16 @@ -427,8 +427,8 @@ def 
test_compute_mutational_DCA_decoy_energy(): seq[pos_x] = AA[aa_x] seq[pos_y] = AA[aa_y] seq = ''.join(seq) - test_energy = frustratometer.frustration.compute_native_energy(seq, potts_model, mask) - decoy_energy = frustratometer.frustration.compute_decoy_energy(seq, potts_model, mask, 'mutational') + test_energy = frustratometer.frustration.compute_native_energy(seq, potts_model, mask, AA) + decoy_energy = frustratometer.frustration.compute_decoy_energy(seq, potts_model, mask, '-ACDEFGHIKLMNPQRSTVWY', 'mutational') assert (decoy_energy[pos_x, pos_y, aa_x, aa_y] - test_energy) ** 2 < 1E-16 From d2b810d69221f4a740cca9d688b30bf94ed6f348 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 7 Dec 2025 18:49:21 -0600 Subject: [PATCH 58/76] cleaning some things up, including changing AWSEM.indicators to AWSEM.masked_indicators, but still need to clean a few things up; currently passing tests --- frustratometer/classes/AWSEM.py | 131 +++++++++++--------- frustratometer/optimization/optimization.py | 20 +-- tests/test_awsem_frustratometer.py | 96 ++++++++------ tests/test_optimization.py | 22 ++-- 4 files changed, 153 insertions(+), 116 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 4eed69bf..14ff685f 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -196,10 +196,10 @@ def coefficient_lambda_gamma_array(self): return _coefficient_lambda_gamma_array @coefficient_lambda_gamma_array.setter def coefficient_lambda_gamma_array(self): - raise AttributeError("""Setting AWSEM.coefficient_lambda_gamma_array - directly is not allowed. Modify AWSEM.k_contact, - AWSEM.burial_gamma, AWSEM.direct_gamma, - AWSEM.protein_gamma, or AWSEM.water_gamma instead.""") + raise AttributeError(f"""Setting {self.__class__}.coefficient_lambda_gamma_array + directly is not allowed. 
Modify {self.__class__}.k_contact, + {self.__class__}.burial_gamma, {self.__class__}.direct_gamma, + {self.__class__}.protein_gamma, or {self.__class__}.water_gamma instead.""") def subclass_setup_helper(self): """ @@ -396,63 +396,80 @@ def change_conformation(self,alt_conf): self.pdb_structure = alt_conf def calculate_indicators(self): - # Calculate rho - rho = 0.25 - rho *= (1 + np.tanh(self.p.eta * (self.selected_matrix - self.p.r_min))) - rho *= (1 + np.tanh(self.p.eta * (self.p.r_max - self.selected_matrix))) - rho *= self.sequence_mask_rho - self.rho=rho - #Calculate sigma water - rho_r = (rho).sum(axis=1) - if self.full_pdb_distance_matrix.shape!=self.distance_matrix.shape: - if self.burial_in_context==True: - self.init_index_shift=self.pdb_structure.init_index_shift - self.fin_index_shift=self.pdb_structure.fin_index_shift - rho_r=rho_r[self.init_index_shift:self.fin_index_shift] - self.rho_r=rho_r - rho_b = np.expand_dims(rho_r, 1) - rho1 = np.expand_dims(rho_r, 0) - rho2 = np.expand_dims(rho_r, 1) - sigma_water = 0.25 * (1 - np.tanh(self.p.eta_sigma * (rho1 - self.p.rho_0))) * (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0))) - if self.alt_sigma_wat: - sigma_water = -sigma_water + 0.5*( (1 - np.tanh(self.p.eta_sigma * (rho1 - self.p.rho_0))) + (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0)))) - sigma_protein = 1 - sigma_water - #Calculate theta and indicators - theta = 0.25 * (1 + np.tanh(self.p.eta * (self.distance_matrix - self.p.r_min))) * (1 + np.tanh(self.p.eta * (self.p.r_max - self.distance_matrix))) - thetaII = 0.25 * (1 + np.tanh(self.p.eta * (self.distance_matrix - self.p.r_minII))) * (1 + np.tanh(self.p.eta * (self.p.r_maxII - self.distance_matrix))) - burial_indicator = np.tanh(self.p.burial_kappa * (rho_b - self.p.burial_ro_min)) + np.tanh(self.p.burial_kappa * (self.p.burial_ro_max - rho_b)) - direct_indicator = theta[:, :, np.newaxis, np.newaxis] - water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, 
np.newaxis, np.newaxis] - protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] - # store indicators and gammas for our particular sequence as attributes - self.indicators=[] - self.indicators.append(burial_indicator[:,0]) - self.indicators.append(burial_indicator[:,1]) - self.indicators.append(burial_indicator[:,2]) - self.indicators.append(direct_indicator[:,:,0,0]*self.sequence_mask_contact) - self.indicators.append(protein_indicator[:,:,0,0]*self.sequence_mask_contact) - self.indicators.append(water_indicator[:,:,0,0]*self.sequence_mask_contact) - self.burial_indicator = burial_indicator - self.direct_indicator = direct_indicator - self.water_indicator = water_indicator - self.protein_indicator = protein_indicator - #breakpoint() - if True:#self.p.k_electrostatics != 0: - electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) * self.electrostatics_mask - self.indicators.append(electrostatics_indicator) + if self.expose_indicator_functions: + # Calculate rho + rho = 0.25 + rho *= (1 + np.tanh(self.p.eta * (self.selected_matrix - self.p.r_min))) + rho *= (1 + np.tanh(self.p.eta * (self.p.r_max - self.selected_matrix))) + rho *= self.sequence_mask_rho + self.rho=rho + #Calculate sigma water + rho_r = (rho).sum(axis=1) + if self.full_pdb_distance_matrix.shape!=self.distance_matrix.shape: + if self.burial_in_context==True: + self.init_index_shift=self.pdb_structure.init_index_shift + self.fin_index_shift=self.pdb_structure.fin_index_shift + rho_r=rho_r[self.init_index_shift:self.fin_index_shift] + self.rho_r=rho_r + rho_b = np.expand_dims(rho_r, 1) + rho1 = np.expand_dims(rho_r, 0) + rho2 = np.expand_dims(rho_r, 1) + sigma_water = 0.25 * (1 - np.tanh(self.p.eta_sigma * (rho1 - self.p.rho_0))) * (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0))) + if self.alt_sigma_wat: + sigma_water = -sigma_water + 0.5*( (1 - np.tanh(self.p.eta_sigma 
* (rho1 - self.p.rho_0))) + (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0)))) + sigma_protein = 1 - sigma_water + #Calculate theta and indicators + theta = 0.25 * (1 + np.tanh(self.p.eta * (self.distance_matrix - self.p.r_min))) * (1 + np.tanh(self.p.eta * (self.p.r_max - self.distance_matrix))) + thetaII = 0.25 * (1 + np.tanh(self.p.eta * (self.distance_matrix - self.p.r_minII))) * (1 + np.tanh(self.p.eta * (self.p.r_maxII - self.distance_matrix))) + burial_indicator = np.tanh(self.p.burial_kappa * (rho_b - self.p.burial_ro_min)) + np.tanh(self.p.burial_kappa * (self.p.burial_ro_max - rho_b)) + direct_indicator = theta[:, :, np.newaxis, np.newaxis] + water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] + protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] + self.burial_indicator = burial_indicator + self.direct_indicator = direct_indicator + self.water_indicator = water_indicator + self.protein_indicator = protein_indicator + electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length)*self.electrostatics_mask self.electrostatics_indicator = electrostatics_indicator + else: + print("""self.expose_indicator_functions was False; will not calculate and store indicator functions. + Indicator functions will be computed on the fly as needed for energy calculations and then discarded. 
+ If you want to get the indicator functions directly, set self.expose_indicator_functions + to True and then call this method again.""") + + @property + def masked_indicators(self): + # store indicators and gammas for our particular sequence as attributes + _masked_indicators=[] + _masked_indicators.append(self.burial_indicator[:,0]) + _masked_indicators.append(self.burial_indicator[:,1]) + _masked_indicators.append(self.burial_indicator[:,2]) + _masked_indicators.append(self.direct_indicator[:,:,0,0]*self.sequence_mask_contact) + _masked_indicators.append(self.protein_indicator[:,:,0,0]*self.sequence_mask_contact) + _masked_indicators.append(self.water_indicator[:,:,0,0]*self.sequence_mask_contact) + _masked_indicators.append(self.electrostatics_indicator*self.electrostatics_mask) + return _masked_indicators + @masked_indicators.setter + def masked_indicators(self): + raise AttributeError(f"""Setting {self.__class__}.indicators directly is not allowed. + Modify {self.__class__}.burial_indicator, {self.__class__}.direct_indicator, + {self.__class__}.protein_indicator, {self.__class__}.water_indicator, + {self.__class__}.electrostatic_indicator, + {self.__class__}.sequence_mask_contact, + or {self.__class__}.electrostatics_mask instead.""") def calculate_energy_and_potts(self): super().calculate_energy_and_potts() - if not self.expose_indicator_functions: - del self.burial_indicator - del self.direct_indicator - del self.water_indicator - del self.protein_indicator - if "electrostatics_indicator" in dir(self): - # won't exist if electrostatics are turned off - del self.electrostatics_indicator - del self.indicators + # if expose_indicator_functions is off, we should never set the attributes in the first place + #if not self.expose_indicator_functions: + # del self.burial_indicator + # del self.direct_indicator + # del self.water_indicator + # del self.protein_indicator + # if "electrostatics_indicator" in dir(self): + # # won't exist if electrostatics are turned 
off + # del self.electrostatics_indicator + # del self.indicators def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] diff --git a/frustratometer/optimization/optimization.py b/frustratometer/optimization/optimization.py index 0560bbdc..d4daa6ca 100644 --- a/frustratometer/optimization/optimization.py +++ b/frustratometer/optimization/optimization.py @@ -543,7 +543,7 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self.reindex_dca=[_AA.index(aa) for aa in alphabet] assert "indicators" in model.__dict__.keys(), "Indicator functions were not exposed. Initialize AWSEM function with `expose_indicator_functions=True` first." - self.indicators = model.indicators + self.indicators = model.masked_indicators self.alphabet_size=len(alphabet) self.model=model self.model_h = model.potts_model['h'][:,self.reindex_dca] @@ -552,7 +552,7 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self.indicators1D=np.array([ind for ind in self.indicators if len(ind.shape)==1]) self.indicators2D=np.array([ind for ind in self.indicators if len(ind.shape)==2]) #TODO: Fix the gamma matrix to account for elecrostatics - self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) + self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.coefficient_lambda_gamma_array]) self.initialize_functions() @@ -695,12 +695,12 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self.alphabet=alphabet self.reindex_dca=[_AA.index(aa) for aa in alphabet] assert "indicators" in model.__dict__.keys(), "Indicator functions were not exposed. Initialize AWSEM function with `expose_indicator_functions=True` first." 
- self.indicators = model.indicators + self.indicators = model.masked_indicators self.alphabet_size=len(alphabet) self.model_h = model.potts_model['h'][:,self.reindex_dca] self.model_J = model.potts_model['J'][:,:,self.reindex_dca][:,:,:,self.reindex_dca] self.mask = model.mask - self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) + self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.coefficient_lambda_gamma_array]) self.initialize_functions() def initialize_functions(self): @@ -858,12 +858,12 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self.alphabet=alphabet self.reindex_dca=[_AA.index(aa) for aa in alphabet] assert "indicators" in model.__dict__.keys(), "Indicator functions were not exposed. Initialize AWSEM function with `expose_indicator_functions=True` first." - self.indicators = model.indicators + self.indicators = model.masked_indicators self.alphabet_size=len(alphabet) self.model_h = model.potts_model['h'][:,self.reindex_dca] self.model_J = model.potts_model['J'][:,:,self.reindex_dca][:,:,:,self.reindex_dca] self.mask = model.mask - self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) + self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.coefficient_lambda_gamma_array]) self.initialize_functions() def initialize_functions(self): @@ -962,7 +962,7 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self.reindex_dca=[_AA.index(aa) for aa in alphabet] assert "indicators" in model.__dict__.keys(), "Indicator functions were not exposed. Initialize AWSEM function with `expose_indicator_functions=True` first." 
- self.indicators = model.indicators + self.indicators = model.masked_indicators self.alphabet_size=len(alphabet) self.model=model self.model_h = model.potts_model['h'][:,self.reindex_dca] @@ -971,7 +971,7 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA): self.indicators1D=np.array([ind for ind in self.indicators if len(ind.shape)==1]) self.indicators2D=np.array([ind for ind in self.indicators if len(ind.shape)==2]) #TODO: Fix the gamma matrix to account for elecrostatics - self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) + self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.coefficient_lambda_gamma_array]) self.initialize_functions() @@ -1150,7 +1150,7 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA, n_decoys= self.n_decoys=n_decoys assert "indicators" in model.__dict__.keys(), "Indicator functions were not exposed. Initialize AWSEM function with `expose_indicator_functions=True` first." 
- self.indicators = model.indicators + self.indicators = model.masked_indicators self.alphabet_size=len(alphabet) self.model=model self.model_h = model.potts_model['h'][:,self.reindex_dca] @@ -1159,7 +1159,7 @@ def __init__(self, model:Frustratometer, use_numba=True, alphabet=_AA, n_decoys= self.indicators1D=np.array([ind for ind in self.indicators if len(ind.shape)==1]) self.indicators2D=np.array([ind for ind in self.indicators if len(ind.shape)==2]) #TODO: Fix the gamma matrix to account for elecrostatics - self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.gamma_array]) + self.gamma = np.concatenate([(a[self.reindex_dca].ravel() if len(a.shape)==1 else a[self.reindex_dca][:,self.reindex_dca].ravel()) for a in model.coefficient_lambda_gamma_array]) self.initialize_functions() diff --git a/tests/test_awsem_frustratometer.py b/tests/test_awsem_frustratometer.py index cadf88ae..cdfeb374 100644 --- a/tests/test_awsem_frustratometer.py +++ b/tests/test_awsem_frustratometer.py @@ -28,7 +28,9 @@ def test_prody_expected_error(): def test_density_residues(test_data): structure = frustratometer.Structure(test_data_path/f"{test_data['pdb']}.pdb") sequence_separation = 2 if test_data['seqsep'] == 3 else 13 - model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, k_electrostatics=0) + model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, + min_sequence_separation_rho=sequence_separation, k_electrostatics=0, + expose_indicator_functions=True) data = pd.read_csv(test_data['singleresidue'], delim_whitespace=True) data['Calculated_density'] = model.rho_r data['Expected_density'] = data['DensityRes'] @@ -45,7 +47,9 @@ def test_density_residues(test_data): def test_single_residue_frustration(test_data): structure = frustratometer.Structure(test_data_path/f"{test_data['pdb']}.pdb") sequence_separation = 2 if 
test_data['seqsep'] == 3 else 13 - model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, min_sequence_separation_contact=2, k_electrostatics=test_data['k_electrostatics'] * 4.184, min_sequence_separation_electrostatics=1) + model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, + min_sequence_separation_contact=2, k_electrostatics=test_data['k_electrostatics'] * 4.184, + min_sequence_separation_electrostatics=1, expose_indicator_functions=True) data = pd.read_csv(test_data['singleresidue'], delim_whitespace=True) data['Calculated_frustration'] = model.frustration(kind='singleresidue') data['Expected_frustration'] = data['FrstIndex'] @@ -63,7 +67,9 @@ def test_mutational_frustration(test_data): if test_data['k_electrostatics']==1000: assert True return - model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, min_sequence_separation_contact=0, k_electrostatics=test_data['k_electrostatics'] * 4.184, min_sequence_separation_electrostatics=1) + model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, + min_sequence_separation_contact=0, k_electrostatics=test_data['k_electrostatics'] * 4.184, + min_sequence_separation_electrostatics=1, expose_indicator_functions=True) data = pd.read_csv(test_data['mutational'], delim_whitespace=True) if test_data['pdb']!="ijge": @@ -104,7 +110,8 @@ def test_configurational_frustration(test_data): min_sequence_separation_rho=sequence_separation, min_sequence_separation_contact=0, k_electrostatics=test_data['k_electrostatics'] * 4.184, - min_sequence_separation_electrostatics=1) + min_sequence_separation_electrostatics=1, + expose_indicator_functions=True) data = pd.read_csv(test_data['configurational'], delim_whitespace=True) @@ -142,47 +149,51 @@ def test_residue_density_calculation(): 
structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, - min_sequence_separation_contact=2) + min_sequence_separation_contact=2, + expose_indicator_functions=True) assert np.round(model.rho_r,2).all()==np.round(expected_rho_values,2).all() def test_AWSEM_native_energy(): structure=frustratometer.Structure(test_data_path/f'1l63.pdb',"A") - model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, distance_cutoff_contact = None) + model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, + distance_cutoff_contact = None, expose_indicator_functions=True) e = model.native_energy() print(e) assert np.round(e, 0) == -915 def test_AWSEM_fields_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") - model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, distance_cutoff_contact = None) + model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, + distance_cutoff_contact = None, expose_indicator_functions=True) e = model.fields_energy() print(e) assert np.round(e, 0) == -555 def test_AWSEM_couplings_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") - model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, distance_cutoff_contact = None) + model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, distance_cutoff_contact = None, + expose_indicator_functions=True) e = model.couplings_energy() print(e) assert np.round(e, 0) == -362 def test_fields_couplings_AWSEM_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") - model = frustratometer.AWSEM(structure) + model = frustratometer.AWSEM(structure, expose_indicator_functions=True) assert model.fields_energy() + model.couplings_energy() - 
model.native_energy() < 1E-6 def test_single_residue_AWSEM_energy(): - _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' #Import Lammps AWSEM Frustratometer single residue frustration values lammps_single_frustration_dataframe=pd.read_csv(test_data_path/f"6U5E_A_tertiary_frustration_singleresidue_1E8decoys_AWSEM_Frustratometer_LAMMPS_Carlos.dat",header=0,sep="\s+") ### structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=2, - k_electrostatics=0) + k_electrostatics=0, + expose_indicator_functions=True) #Calculate fields - seq_index = np.array([_AA.find(aa) for aa in structure.sequence]) + seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) seq_len = len(seq_index) h = -model.potts_model['h'][range(seq_len), seq_index] @@ -197,7 +208,6 @@ def test_single_residue_AWSEM_energy(): assert (abs(np.array(lammps_single_frustration_dataframe["native_energy"])-test_residue_total_energy) < 1E-1).all() def test_contact_pair_AWSEM_energy(): - _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' #Import Lammps AWSEM Frustratometer mutational frustration values lammps_mutational_frustration_dataframe=pd.read_csv(test_data_path/f"6U5E_A_tertiary_frustration_mutational_1E6decoys_AWSEM_Frustratometer_LAMMPS_Carlos.dat",header=0,sep="\s+") lammps_mutational_frustration_dataframe["i"]=lammps_mutational_frustration_dataframe["i"]-1 @@ -206,9 +216,9 @@ def test_contact_pair_AWSEM_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=0, - k_electrostatics=0) + k_electrostatics=0, expose_indicator_functions=True) #Calculate fields - seq_index = np.array([_AA.find(aa) for aa in structure.sequence]) + seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) seq_len = len(seq_index) h = 
-model.potts_model['h'][range(seq_len), seq_index] @@ -226,13 +236,13 @@ def test_contact_pair_AWSEM_energy(): def test_selected_subsequence_AWSEM_contact_energy_matrix(): structure=frustratometer.Structure(test_data_path/f'4wnc.pdb',"A",seq_selection="resnum 3to26") - model=frustratometer.AWSEM(structure) + model=frustratometer.AWSEM(structure, expose_indicator_functions=True) q = len(model.gamma.alphabet) assert model.potts_model['h'].shape==(24,q) def test_selected_subsequence_AWSEM_burial_energy_matrix(): structure=frustratometer.Structure(test_data_path/f'4wnc.pdb',"A",seq_selection="resnum 150to315") - model=frustratometer.AWSEM(structure) + model=frustratometer.AWSEM(structure, expose_indicator_functions=True) q = len(model.gamma.alphabet) assert model.potts_model['J'].shape==(166,166,q,q) @@ -243,12 +253,14 @@ def test_selected_subsequence_AWSEM_burial_energy_matrix(): def test_selected_subsequence_AWSEM_rho_calculations(): #Substructure object substructure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") - model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10,distance_cutoff_contact=10.0) + model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10, + distance_cutoff_contact=10.0, expose_indicator_functions=True) model_1_init_index=model_1.init_index_shift; model_1_fin_index=model_1.fin_index_shift #Full structure object structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A") - model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10,distance_cutoff_contact=10.0) + model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, + distance_cutoff_contact=10.0, expose_indicator_functions=True) #Check if shape and entries of rho matrices are identical assert model_1.rho_r.shape==model_2.rho_r[model_1_init_index:model_1_fin_index].shape @@ -257,12 
+269,14 @@ def test_selected_subsequence_AWSEM_rho_calculations(): def test_selected_subsequence_AWSEM_burial_energy(): #Substructure object substructure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") - model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10,distance_cutoff_contact=10.0) + model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10, + distance_cutoff_contact=10.0, expose_indicator_functions=True) model_1_init_index=model_1.init_index_shift; model_1_fin_index=model_1.fin_index_shift #Full structure object structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A") - model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10,distance_cutoff_contact=10.0) + model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, + distance_cutoff_contact=10.0, expose_indicator_functions=True) #Check if burial energies are identical assert model_1.burial_energy.shape==model_2.burial_energy[model_1_init_index:model_1_fin_index].shape @@ -271,12 +285,14 @@ def test_selected_subsequence_AWSEM_burial_energy(): def test_selected_subsequence_AWSEM_contact_energy(): #Substructure object substructure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") - model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10,distance_cutoff_contact=10.0) + model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10, + distance_cutoff_contact=10.0, expose_indicator_functions=True) model_1_init_index=model_1.init_index_shift; model_1_fin_index=model_1.fin_index_shift #Full structure object structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A") - model_2=frustratometer.AWSEM(structure, 
k_electrostatics=0.0,min_sequence_separation_contact=10,distance_cutoff_contact=10.0) + model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, + distance_cutoff_contact=10.0, expose_indicator_functions=True) #Check if contact energies are identical assert model_1.contact_energy.shape==model_2.contact_energy[:,model_1_init_index:model_1_fin_index,model_1_init_index:model_1_fin_index,:,:].shape @@ -284,27 +300,29 @@ def test_selected_subsequence_AWSEM_contact_energy(): def test_selected_subsequence_AWSEM_burial_energy_without_protein_context(): structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") - model=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10,distance_cutoff_contact=10.0,burial_in_context=False) + model=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, + distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True) selected_region_burial=model.fields_energy() # Energy units are in kJ/mol assert np.round(selected_region_burial, 2) == -377.95 def test_selected_subsequence_AWSEM_contact_energy_without_protein_context(): structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") - model=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10,distance_cutoff_contact=10.0,burial_in_context=False) + model=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, + distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True) selected_region_contact=model.couplings_energy() # Energy units are in kJ/mol assert np.round(selected_region_contact, 2) == -148.92 def test_single_residue_decoy_AWSEM_energy_statistics(): - _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' #Import Lammps AWSEM Frustratometer single residue frustration values 
lammps_single_frustration_dataframe=pd.read_csv(test_data_path/f"6U5E_A_tertiary_frustration_singleresidue_1E8decoys_AWSEM_Frustratometer_LAMMPS_Carlos.dat",header=0,sep="\s+") ### structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") - model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=2, k_electrostatics=0) + model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=2, k_electrostatics=0, + expose_indicator_functions=True) #Calculate fields - seq_index = np.array([_AA.find(aa) for aa in structure.sequence]) + seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) seq_len = len(seq_index) h = -model.potts_model['h'][range(seq_len), seq_index] @@ -326,17 +344,18 @@ def test_single_residue_decoy_AWSEM_energy_statistics(): assert (abs(np.array(lammps_single_frustration_dataframe["std(decoy_energies)"])-(expected_std_decoy_energy)) < 1.2E-1).all() def test_contact_pair_decoy_AWSEM_energy_statistics(): - _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' - q = len(_AA) #Import Lammps AWSEM Frustratometer mutational frustration values lammps_mutational_frustration_dataframe=pd.read_csv(test_data_path/f"6U5E_A_tertiary_frustration_mutational_1E6decoys_AWSEM_Frustratometer_LAMMPS_Carlos.dat",header=0,sep="\s+") lammps_mutational_frustration_dataframe["i"]=lammps_mutational_frustration_dataframe["i"]-1 lammps_mutational_frustration_dataframe["j"]=lammps_mutational_frustration_dataframe["j"]-1 ### structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") - model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.5, min_sequence_separation_contact=None, k_electrostatics=0) + model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.5, min_sequence_separation_contact=None, k_electrostatics=0, + expose_indicator_functions=True) + q = len(model.alphabet) + #Calculate fields - seq_index = np.array([_AA.find(aa) for aa in 
structure.sequence]) + seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) seq_len = len(seq_index) h = -model.potts_model['h'][range(seq_len), seq_index] @@ -375,12 +394,13 @@ def structure(): @pytest.mark.parametrize("distance_cutoff_contact", [None, 10]) def test_expose_indicators(structure, k_electrostatics, min_sequence_separation_contact, distance_cutoff_contact): """ Check that the AWSEM indicators exposed can reproduce the native energy, where E_native = -sum_{i} h_i - sum_{i,j} J_ij = sum_{i} gamma_i * I_i """ - _AA = 'ARNDCQEGHILKMFPSTWYV'#'-ACDEFGHIKLMNPQRSTVWY' - q = len(_AA) - model=frustratometer.AWSEM(structure,k_electrostatics=k_electrostatics, min_sequence_separation_contact = min_sequence_separation_contact, distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True) - model_seq_index=np.array([_AA.find(aa) for aa in model.sequence]) - indicators1D=np.array(model.indicators[0:3]) - indicators2D=np.array(model.indicators[3:]) + model=frustratometer.AWSEM(structure,k_electrostatics=k_electrostatics, + min_sequence_separation_contact = min_sequence_separation_contact, + distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True) + q = len(model.alphabet) + model_seq_index=np.array([model.alphabet.index(aa) for aa in model.sequence]) + indicators1D=np.array(model.masked_indicators[0:3]) + indicators2D=np.array(model.masked_indicators[3:]) true_indicator1D=np.array([indicators1D[:,model_seq_index==i].sum(axis=1) for i in range(q)]).T true_indicator2D=np.array([indicators2D[:,model_seq_index==i][:,:, model_seq_index==j].sum(axis=(1,2)) for i in range(q) for j in range(q)]).reshape(q,q,-1).T burial_gamma=np.concatenate(model.coefficient_lambda_gamma_array[:3]) diff --git a/tests/test_optimization.py b/tests/test_optimization.py index 2f1414bc..afb777af 100644 --- a/tests/test_optimization.py +++ b/tests/test_optimization.py @@ -428,8 +428,8 @@ def test_awsem_energy_variance(model, 
reduced_alphabet, use_numba): # from itertools import permutations # decoy_sequences = np.array(list(permutations(seq_index))) -# indicators1D=np.array(model.indicators[:3]) -# indicators2D=np.array(model.indicators[3:]) +# indicators1D=np.array(model.masked_indicators[:3]) +# indicators2D=np.array(model.masked_indicators[3:]) # indicator_arrays=[] # energies=[] # for decoy_index in decoy_sequences: @@ -443,36 +443,36 @@ def test_awsem_energy_variance(model, reduced_alphabet, use_numba): # ind2D[i] =np.bincount(decoy_index2D.ravel(), weights=indicators2D[i].ravel(), minlength=21*21) # indicator_array = np.concatenate([ind1D.ravel(),ind2D.ravel()]) -# gamma_array = np.concatenate([a.ravel() for a in model.gamma_array]) +# gamma_array = np.concatenate([a.ravel() for a in model.coefficient_lambda_gamma_array]) -# energy_i = gamma_array @ indicator_array +# energy_i = coefficient_lambda_gamma_array @ indicator_array # assert np.isclose(model.native_energy(index_to_sequence(decoy_index,alphabet=_AA)),energy_i), f"Expected energy {model.native_energy(index_to_sequence(decoy_index,alphabet=_AA))} but got {energy_i}" # energies.append(energy_i) # indicator_arrays.append(indicator_array) # indicator_arrays = np.array(indicator_arrays) # energies = np.array(energies) -# assert np.isclose(gamma_array@indicator_arrays.mean(axis=0),energies.mean()), f"Expected mean energy {gamma_array@indicator_arrays.mean(axis=0)} but got {np.mean(energies)}" +# assert np.isclose(coefficient_lambda_gamma_array@indicator_arrays.mean(axis=0),energies.mean()), f"Expected mean energy {coefficient_lambda_gamma_array@indicator_arrays.mean(axis=0)} but got {np.mean(energies)}" # # I will code something like this using numpy einsums: # # np.array([[np.outer(indicator_arrays[:,i],indicator_arrays[:,j]).mean() - indicator_arrays[:,i].mean()*indicator_arrays[:,i].mean() for i in range(indicator_arrays.shape[1])] for j in range(indicator_arrays.shape[1])]) # outer_product = np.einsum('ij,ik->ijk', 
indicator_arrays, indicator_arrays) # mean_outer_product = outer_product.mean(axis=0) # mean_outer_product -= np.outer(indicator_arrays.mean(axis=0), indicator_arrays.mean(axis=0)) -# assert np.allclose(gamma_array @ mean_outer_product @ gamma_array, energies.var()), "Covariance matrix is not correct" +# assert np.allclose(coefficient_lambda_gamma_array @ mean_outer_product @ coefficient_lambda_gamma_array, energies.var()), "Covariance matrix is not correct" # # Indicator tests -# indicators1D=np.array(model.indicators[0:3]) -# indicators2D=np.array(model.indicators[3:]) -# gamma=model.gamma_array +# indicators1D=np.array(model.masked_indicators[0:3]) +# indicators2D=np.array(model.masked_indicators[3:]) +# gamma=model.coefficient_lambda_gamma_array # true_indicator1D=np.array([indicators1D[:,model_seq_index==i].sum(axis=1) for i in range(21)]).T # true_indicator2D=np.array([indicators2D[:,model_seq_index==i][:,:, model_seq_index==j].sum(axis=(1,2)) for i in range(21) for j in range(21)]).reshape(21,21,3).T # true_indicator=np.concatenate([true_indicator1D.ravel(),true_indicator2D.ravel()]) -# burial_gamma=np.concatenate(model.gamma_array[:3]) +# burial_gamma=np.concatenate(model.coefficient_lambda_gamma_array[:3]) # burial_energy_predicted = (burial_gamma * np.concatenate(true_indicator1D)).sum() # burial_energy_expected = -model.potts_model['h'][range(len(model_seq_index)), model_seq_index].sum() # assert np.isclose(burial_energy_predicted,burial_energy_expected), f"Expected energy {burial_energy_expected} but got {burial_energy_predicted}" -# contact_gamma=np.concatenate([a.ravel() for a in model.gamma_array[3:]]) +# contact_gamma=np.concatenate([a.ravel() for a in model.coefficient_lambda_gamma_array[3:]]) # contact_energy_predicted = (contact_gamma * np.concatenate([a.ravel() for a in true_indicator2D])).sum() # contact_energy_expected = model.couplings_energy() # assert np.isclose(contact_energy_predicted,contact_energy_expected), f"Expected energy 
{contact_energy_expected} but got {contact_energy_predicted}" \ No newline at end of file From 86bcc95dfc2ae0370447dd46d0cc011b62a13976 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 7 Dec 2025 19:01:52 -0600 Subject: [PATCH 59/76] moved multiplication of electrostatic mask by indicators to the energy evaluation instead of indicator assignment, same as for the direct, protein, and water potentials --- frustratometer/classes/AWSEM.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 14ff685f..9e5c9160 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -162,13 +162,13 @@ def __init__(self, # but it doesn't hurt to define the distance_cutoff attribute-- # it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix - self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] else: self.sequence_cutoff=self.p.min_sequence_separation_contact self.distance_cutoff=self.p.distance_cutoff_contact # the distance matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- # it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix + self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] self.charges2 = charges2 self._decoy_fluctuation = {} # used for mutational calculation, possibly others self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ @@ -274,10 +274,9 @@ def calculate_energy_and_potts(self): protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, 
:, np.newaxis, np.newaxis] - # Compute electrostatics and add to contact energy - if self.p.k_electrostatics!=0: - electrostatics_energy = self.electrostatics_gamma * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis] - contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) + electrostatics_energy = self.electrostatics_gamma * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ + * self.electrostatics_mask[:,:,np.newaxis,np.newaxis] + contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) self.contact_energy = contact_energy @@ -429,7 +428,7 @@ def calculate_indicators(self): self.direct_indicator = direct_indicator self.water_indicator = water_indicator self.protein_indicator = protein_indicator - electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length)*self.electrostatics_mask + electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) self.electrostatics_indicator = electrostatics_indicator else: print("""self.expose_indicator_functions was False; will not calculate and store indicator functions. 
From 1fdc77f6d3355d1a928db309e4f262338dee5534 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Sun, 7 Dec 2025 19:52:41 -0600 Subject: [PATCH 60/76] further cleaned up gamma electrostatics --- frustratometer/classes/AWSEM.py | 43 +++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 9e5c9160..f2dd8320 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -52,11 +52,17 @@ class Parameters(BaseModel): min_sequence_separation_electrostatics: Optional[int] = Field(1, description="Minimum sequence separation for electrostatics calculation.") k_electrostatics: float = Field(17.3636, description="Coefficient for electrostatic interactions. (kJ/mol)") electrostatics_screening_length: float = Field(10, description="Screening length for electrostatic interactions. (Angstrom)") - charges: np.array = Field(np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]), description="Charge on each residue type") - # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] - #charges: np.array = Field(np.array([0, 0, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]), description="Charge on each residue type") - #['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] + # We might not know the order of amino acids in our alphabet at the time of instantiating this class + # (this happens the above gammas are Paths), so we'll have to build the electrostatic "gamma" when + # initializing AWSEMBase. Fortunately, we can still specify everything we need to know in this dict. 
+ charge_dict : dict = Field({'A':0.0,'C':0.0,'D':-1.0,'E':-1.0, + 'F':0.0,'G':0.0,'H':0.0,'I':0.0, + 'K':1.0,'L':0.0,'M':0.0,'N':0.0,'P':0.0, + 'Q':0.0,'R':1.0,'S':0.0,'T':0.0, + 'V':0.0,'W':0.0,'Y':0.0}, + description='charge of each amino acid type that may be used') + class AWSEMBase(Frustratometer): @@ -149,13 +155,17 @@ def __init__(self, self.protein_gamma = np.squeeze(gamma['Protein']) self.water_gamma = np.squeeze(gamma['Water']) assert self.direct_gamma.shape == self.protein_gamma.shape == self.water_gamma.shape == (self.q,self.q) - self.gamma = self.p.gamma - - # set other attributes - self.burial_in_context = self.p.burial_in_context - self.aa_freq = frustration.compute_aa_freq(self.sequence, self.gamma.alphabet) - self.contact_freq = frustration.compute_contact_freq(self.sequence, self.gamma.alphabet) - charges2 = self.p.charges[:,np.newaxis] * self.p.charges[np.newaxis,:] + # electrostatic gamma + ordered_charges = np.zeros(self.q) + for counter in range(self.q): + try: + ordered_charges[counter] = self.charge_dict[gamma.alphabet[counter]] + except KeyError as e: + raise Exception(f"""One-letter code {order[counter]} from alphabet {gamma.alphabet} + with unknown charge. If use of this noncanonical AA in intentional, + you must supply a custom charge_dict + so that we know how to calculate the electrostatic potential.""") + charges2 = ordered_charges[:,np.newaxis] * ordered_charges[np.newaxis,:] if self.p.k_electrostatics != 0: self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) self.distance_cutoff=None # the distance matrix isn't guaranteed to exist in all subclasses, @@ -170,13 +180,20 @@ def __init__(self, # that only matters if we need to compute a mask from a distance matrix self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] self.charges2 = charges2 + # helpful ? 
+ self.gamma = self.p.gamma + + # set other attributes + self.burial_in_context = self.p.burial_in_context + self.aa_freq = frustration.compute_aa_freq(self.sequence, self.gamma.alphabet) + self.contact_freq = frustration.compute_contact_freq(self.sequence, self.gamma.alphabet) self._decoy_fluctuation = {} # used for mutational calculation, possibly others self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ @property def alphabet(self): return self.gamma.alphabet # this allows us to access the alphabet in the same way as for DCA instances - @alphabet.setter + @alphabet.setter # the user might think they can change the alphabet like the conformation (as in AWSEM), but that's not supported def alphabet(self): raise AttributeError("Changing the underlying alphabet is prohibited. Instead, create a new AWSEM instance from a different Gamma.") @@ -194,7 +211,7 @@ def coefficient_lambda_gamma_array(self): # not a typo, supposed to be positive ^^^ # charges2 is our electrostatic "gamma" return _coefficient_lambda_gamma_array - @coefficient_lambda_gamma_array.setter + @coefficient_lambda_gamma_array.setter # clarifies that this is derived from more fundamental quantities def coefficient_lambda_gamma_array(self): raise AttributeError(f"""Setting {self.__class__}.coefficient_lambda_gamma_array directly is not allowed. 
Modify {self.__class__}.k_contact, From d8f0ef3f9afecbe32b66a7e2fab630c0f1f28f99 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Mon, 8 Dec 2025 12:05:16 -0600 Subject: [PATCH 61/76] editing potts model and native energy calculation workflow; passing all tests except the usual hmmer and pfam DCA issues --- frustratometer/classes/AWSEM.py | 111 +++++++++++++---------- frustratometer/classes/Frustratometer.py | 20 ++++ tests/test_awsem_frustratometer.py | 48 +++++----- 3 files changed, 105 insertions(+), 74 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index f2dd8320..b588f005 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -69,7 +69,7 @@ class AWSEMBase(Frustratometer): def __init__(self, sequence: str, expose_indicator_functions: bool=False, - potts: bool=True, + potts: bool=False, **parameters)->object: """ Generate AWSEM object @@ -94,12 +94,24 @@ def __init__(self, self.sequence = sequence # set indicator function exposure based on argument - # i guess not exposing indicator functions saves memory? + # (not exposing them saves a tiny bit of RAM but it's useful to Ezequiel) self.expose_indicator_functions = expose_indicator_functions - # whether to compute potts model + # whether to store the potts model as an object attribute, + # which requires a lot of ram self.potts = potts + if self.potts and not self.expose_indicator_functions: + print(f""" + You requested storing the potts model as an object attribute by using potts=True + but requested NOT storing the indicator functions as object attributes by using + expose_indicator_functions=False. Since the potts model requires far more RAM than + the indicator functions, we will override your indicator function request + and store them anyway. This has no effect on the accuracy of any calculations. 
+ + Setting {self.__class__}.expose_indicator_functions = True""") + self.expose_indicator_functions = True + # parse other arguments p = Parameters(**parameters) if p.min_sequence_separation_contact is None: @@ -189,6 +201,7 @@ def __init__(self, self.contact_freq = frustration.compute_contact_freq(self.sequence, self.gamma.alphabet) self._decoy_fluctuation = {} # used for mutational calculation, possibly others self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ + self._native_energy = None @property def alphabet(self): @@ -218,6 +231,16 @@ def coefficient_lambda_gamma_array(self): {self.__class__}.burial_gamma, {self.__class__}.direct_gamma, {self.__class__}.protein_gamma, or {self.__class__}.water_gamma instead.""") + def native_energy(self): + if self.potts: + if not hasattr(self, 'potts_model'): # create potts model if it doesn't already exist + self.calculate_energy_and_potts() + energy = super().native_energy() # method to compute native energy given potts model + else: + energy = 0 # fill in numba function here + #self._native_energy = energy # maybe _native_energy is needed for compatibility with certain things? 
+ return energy + def subclass_setup_helper(self): """ This method calls methods to calculate native indicator functions, @@ -233,23 +256,7 @@ def subclass_setup_helper(self): """ self.calculate_masks() # subclasses should (re)define this method as needed self.calculate_indicators() # subclasses should (re)define this method as needed - if self.potts: - self.calculate_energy_and_potts() - else: - if 'potts_model' in dir(self) or 'burial_energy' in dir(self)\ - or 'contact_energy' in dir(self) or '_native_energy' in dir(self): - # if one has been defined, they should all have been defined - assert 'potts_model' in dir(self), dir(self) - assert 'burial_energy' in dir(self), dir(self) - assert 'contact_energy' in dir(self), dir(self) - assert '_native_energy' in dir(self), dir(self) - # potts model and energies will be inaccurate once indicators are modified; - # if we don't care about the potts model, then we should delete the old - # data so it can't be accidentally misused in the future - del self.potts_model - del self.burial_energy - del self.contact_energy - del self._native_energy + self.calculate_energy_and_potts() def calculate_indicators(self): raise NotImplementedError("Subclasses must implement this method") @@ -280,34 +287,38 @@ def calculate_masks(self): self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function def calculate_energy_and_potts(self): - - J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) - h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) - - # compute burial and contact energies - self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * self.burial_indicator[:, np.newaxis, :] - direct = self.direct_indicator * self.direct_gamma[J_index[2], J_index[3]] - water_mediated = self.water_indicator * self.water_gamma[J_index[2], J_index[3]] - protein_mediated = self.protein_indicator * 
self.protein_gamma[J_index[2], J_index[3]] - contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - - electrostatics_energy = self.electrostatics_gamma * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ - * self.electrostatics_mask[:,:,np.newaxis,np.newaxis] - contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) - - self.contact_energy = contact_energy - - # Compute potts model - self.potts_model = {} - self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, :]#self.aa_map_awsem_list] - assert self.potts_model['h'].shape == (self.N, self.q), self.potts_model['h'].shape - self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, :, :]#self.aa_map_awsem_x, self.aa_map_awsem_y] - assert self.potts_model['J'].shape == (self.N, self.N, self.q, self.q), self.potts_model['J'].shape - # Set the gap energy to zero - #self.potts_model['h'][:, 0] = 0 - #self.potts_model['J'][:, :, 0, :] = 0 - #self.potts_model['J'][:, :, :, 0] = 0 - self._native_energy=None # don't know what this does + if self.potts: + J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) + h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) + + # compute burial and contact energies + self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * self.burial_indicator[:, np.newaxis, :] + direct = self.direct_indicator * self.direct_gamma[J_index[2], J_index[3]] + water_mediated = self.water_indicator * self.water_gamma[J_index[2], J_index[3]] + protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] + contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] + + electrostatics_energy = self.electrostatics_gamma * 
self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ + * self.electrostatics_mask[:,:,np.newaxis,np.newaxis] + contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) + + self.contact_energy = contact_energy + + # Compute potts model + self.potts_model = {} + self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, :]#self.aa_map_awsem_list] + assert self.potts_model['h'].shape == (self.N, self.q), self.potts_model['h'].shape + self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, :, :]#self.aa_map_awsem_x, self.aa_map_awsem_y] + assert self.potts_model['J'].shape == (self.N, self.N, self.q, self.q), self.potts_model['J'].shape + # Set the gap energy to zero + #self.potts_model['h'][:, 0] = 0 + #self.potts_model['J'][:, :, 0, :] = 0 + #self.potts_model['J'][:, :, :, 0] = 0 + else: + print("""self.potts was False; will not calculate and store potts model. + Energies will be computed on the fly as needed for frustration calculations and then discarded. + If you want to get the energies for your own purposes, set self.potts + to True and then call this method again.""") def compute_configurational_decoy_statistics(self): @@ -327,7 +338,7 @@ def __init__(self, pdb_structure: object | tuple, # tuple is an object, but this clarifies what we expect sequence: str =None, expose_indicator_functions: bool=False, - potts: bool=True, + potts: bool=False, alt_sigma_wat: bool=False, **parameters)->object: # assume the user wanted the sequence from the pdb structure if not given @@ -450,7 +461,7 @@ def calculate_indicators(self): else: print("""self.expose_indicator_functions was False; will not calculate and store indicator functions. Indicator functions will be computed on the fly as needed for energy calculations and then discarded. 
- If you want to get the indicator functions directly, set self.expose_indicator_functions + If you want to get the indicator functions for your own purposes, set self.expose_indicator_functions to True and then call this method again.""") @property diff --git a/frustratometer/classes/Frustratometer.py b/frustratometer/classes/Frustratometer.py index d649e429..d61b5c92 100644 --- a/frustratometer/classes/Frustratometer.py +++ b/frustratometer/classes/Frustratometer.py @@ -66,6 +66,26 @@ def native_energy(self,sequence:str = None,ignore_couplings_of_gaps:bool=False,i if not self._native_energy: self._native_energy=frustration.compute_native_energy(sequence, self.potts_model, self.mask, self.alphabet, ignore_couplings_of_gaps, ignore_fields_of_gaps) + else: + # For the direct children of this Frustratometer class, DCA and AWSEMBase, + # changing the alphabet or gammas of an instance is not allowed, so there should never be a case + # where we have a previously defined (not None) but "out-of-date" _native_energy. + # Still, we will check that our code is working as intended + new = frustration.compute_native_energy( + sequence, self.potts_model, self.mask, self.alphabet, + ignore_couplings_of_gaps, ignore_fields_of_gaps) + if not (self._native_energy == new): + raise AssertionError(f""" + It seems that you have changed parameters of an object such that + the native energy of your system is now different from what it was + originally computed to be. Our code probably should prevent this + from happening, but you can prevent it too by not changing the alphabet + or any other parameters after initializing your DCA or child of + AWSEMBase. 
+ + Previous value of {self.__class__}._native_energy: {self._native_energy} + New value of {self.__class__}._native_energy: {new}""") + energy_value=self._native_energy return energy_value diff --git a/tests/test_awsem_frustratometer.py b/tests/test_awsem_frustratometer.py index cdfeb374..b9f7ed06 100644 --- a/tests/test_awsem_frustratometer.py +++ b/tests/test_awsem_frustratometer.py @@ -30,7 +30,7 @@ def test_density_residues(test_data): sequence_separation = 2 if test_data['seqsep'] == 3 else 13 model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, k_electrostatics=0, - expose_indicator_functions=True) + expose_indicator_functions=True, potts=True) data = pd.read_csv(test_data['singleresidue'], delim_whitespace=True) data['Calculated_density'] = model.rho_r data['Expected_density'] = data['DensityRes'] @@ -49,7 +49,7 @@ def test_single_residue_frustration(test_data): sequence_separation = 2 if test_data['seqsep'] == 3 else 13 model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, min_sequence_separation_contact=2, k_electrostatics=test_data['k_electrostatics'] * 4.184, - min_sequence_separation_electrostatics=1, expose_indicator_functions=True) + min_sequence_separation_electrostatics=1, expose_indicator_functions=True, potts=True) data = pd.read_csv(test_data['singleresidue'], delim_whitespace=True) data['Calculated_frustration'] = model.frustration(kind='singleresidue') data['Expected_frustration'] = data['FrstIndex'] @@ -69,7 +69,7 @@ def test_mutational_frustration(test_data): return model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, min_sequence_separation_contact=0, k_electrostatics=test_data['k_electrostatics'] * 4.184, - min_sequence_separation_electrostatics=1, expose_indicator_functions=True) + min_sequence_separation_electrostatics=1, 
expose_indicator_functions=True, potts=True) data = pd.read_csv(test_data['mutational'], delim_whitespace=True) if test_data['pdb']!="ijge": @@ -111,7 +111,7 @@ def test_configurational_frustration(test_data): min_sequence_separation_contact=0, k_electrostatics=test_data['k_electrostatics'] * 4.184, min_sequence_separation_electrostatics=1, - expose_indicator_functions=True) + expose_indicator_functions=True, potts=True) data = pd.read_csv(test_data['configurational'], delim_whitespace=True) @@ -150,13 +150,13 @@ def test_residue_density_calculation(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=2, - expose_indicator_functions=True) + expose_indicator_functions=True, potts=True) assert np.round(model.rho_r,2).all()==np.round(expected_rho_values,2).all() def test_AWSEM_native_energy(): structure=frustratometer.Structure(test_data_path/f'1l63.pdb',"A") model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, - distance_cutoff_contact = None, expose_indicator_functions=True) + distance_cutoff_contact = None, expose_indicator_functions=True, potts=True) e = model.native_energy() print(e) assert np.round(e, 0) == -915 @@ -164,7 +164,7 @@ def test_AWSEM_native_energy(): def test_AWSEM_fields_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, - distance_cutoff_contact = None, expose_indicator_functions=True) + distance_cutoff_contact = None, expose_indicator_functions=True, potts=True) e = model.fields_energy() print(e) assert np.round(e, 0) == -555 @@ -172,14 +172,14 @@ def test_AWSEM_fields_energy(): def test_AWSEM_couplings_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, 
distance_cutoff_contact = None, - expose_indicator_functions=True) + expose_indicator_functions=True, potts=True) e = model.couplings_energy() print(e) assert np.round(e, 0) == -362 def test_fields_couplings_AWSEM_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") - model = frustratometer.AWSEM(structure, expose_indicator_functions=True) + model = frustratometer.AWSEM(structure, expose_indicator_functions=True, potts=True) assert model.fields_energy() + model.couplings_energy() - model.native_energy() < 1E-6 def test_single_residue_AWSEM_energy(): @@ -190,7 +190,7 @@ def test_single_residue_AWSEM_energy(): model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=2, k_electrostatics=0, - expose_indicator_functions=True) + expose_indicator_functions=True, potts=True) #Calculate fields seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) @@ -216,7 +216,7 @@ def test_contact_pair_AWSEM_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=0, - k_electrostatics=0, expose_indicator_functions=True) + k_electrostatics=0, expose_indicator_functions=True, potts=True) #Calculate fields seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) seq_len = len(seq_index) @@ -236,13 +236,13 @@ def test_contact_pair_AWSEM_energy(): def test_selected_subsequence_AWSEM_contact_energy_matrix(): structure=frustratometer.Structure(test_data_path/f'4wnc.pdb',"A",seq_selection="resnum 3to26") - model=frustratometer.AWSEM(structure, expose_indicator_functions=True) + model=frustratometer.AWSEM(structure, expose_indicator_functions=True, potts=True) q = len(model.gamma.alphabet) assert model.potts_model['h'].shape==(24,q) def test_selected_subsequence_AWSEM_burial_energy_matrix(): 
structure=frustratometer.Structure(test_data_path/f'4wnc.pdb',"A",seq_selection="resnum 150to315") - model=frustratometer.AWSEM(structure, expose_indicator_functions=True) + model=frustratometer.AWSEM(structure, expose_indicator_functions=True, potts=True) q = len(model.gamma.alphabet) assert model.potts_model['J'].shape==(166,166,q,q) @@ -254,13 +254,13 @@ def test_selected_subsequence_AWSEM_rho_calculations(): #Substructure object substructure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) model_1_init_index=model_1.init_index_shift; model_1_fin_index=model_1.fin_index_shift #Full structure object structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A") model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) #Check if shape and entries of rho matrices are identical assert model_1.rho_r.shape==model_2.rho_r[model_1_init_index:model_1_fin_index].shape @@ -270,13 +270,13 @@ def test_selected_subsequence_AWSEM_burial_energy(): #Substructure object substructure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) model_1_init_index=model_1.init_index_shift; model_1_fin_index=model_1.fin_index_shift #Full structure object structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A") 
model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) #Check if burial energies are identical assert model_1.burial_energy.shape==model_2.burial_energy[model_1_init_index:model_1_fin_index].shape @@ -286,13 +286,13 @@ def test_selected_subsequence_AWSEM_contact_energy(): #Substructure object substructure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) model_1_init_index=model_1.init_index_shift; model_1_fin_index=model_1.fin_index_shift #Full structure object structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A") model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) #Check if contact energies are identical assert model_1.contact_energy.shape==model_2.contact_energy[:,model_1_init_index:model_1_fin_index,model_1_init_index:model_1_fin_index,:,:].shape @@ -301,7 +301,7 @@ def test_selected_subsequence_AWSEM_contact_energy(): def test_selected_subsequence_AWSEM_burial_energy_without_protein_context(): structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True) + distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True, potts=True) 
selected_region_burial=model.fields_energy() # Energy units are in kJ/mol assert np.round(selected_region_burial, 2) == -377.95 @@ -309,7 +309,7 @@ def test_selected_subsequence_AWSEM_burial_energy_without_protein_context(): def test_selected_subsequence_AWSEM_contact_energy_without_protein_context(): structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True) + distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True, potts=True) selected_region_contact=model.couplings_energy() # Energy units are in kJ/mol assert np.round(selected_region_contact, 2) == -148.92 @@ -320,7 +320,7 @@ def test_single_residue_decoy_AWSEM_energy_statistics(): ### structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=2, k_electrostatics=0, - expose_indicator_functions=True) + expose_indicator_functions=True, potts=True) #Calculate fields seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) seq_len = len(seq_index) @@ -351,7 +351,7 @@ def test_contact_pair_decoy_AWSEM_energy_statistics(): ### structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.5, min_sequence_separation_contact=None, k_electrostatics=0, - expose_indicator_functions=True) + expose_indicator_functions=True, potts=True) q = len(model.alphabet) #Calculate fields @@ -396,7 +396,7 @@ def test_expose_indicators(structure, k_electrostatics, min_sequence_separation_ """ Check that the AWSEM indicators exposed can reproduce the native energy, where E_native = -sum_{i} h_i - sum_{i,j} J_ij = sum_{i} gamma_i * I_i """ 
model=frustratometer.AWSEM(structure,k_electrostatics=k_electrostatics, min_sequence_separation_contact = min_sequence_separation_contact, - distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True) + distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True, potts=True) q = len(model.alphabet) model_seq_index=np.array([model.alphabet.index(aa) for aa in model.sequence]) indicators1D=np.array(model.masked_indicators[0:3]) From 7b31a5a691288356c8d317cc7d825365c99e563e Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Mon, 8 Dec 2025 12:53:58 -0600 Subject: [PATCH 62/76] cleaned up a bit more --- frustratometer/classes/AWSEM.py | 26 ++++++++++++++++++------ frustratometer/classes/Frustratometer.py | 11 ++++++---- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index b588f005..c75d3476 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -227,8 +227,8 @@ def coefficient_lambda_gamma_array(self): @coefficient_lambda_gamma_array.setter # clarifies that this is derived from more fundamental quantities def coefficient_lambda_gamma_array(self): raise AttributeError(f"""Setting {self.__class__}.coefficient_lambda_gamma_array - directly is not allowed. Modify {self.__class__}.k_contact, - {self.__class__}.burial_gamma, {self.__class__}.direct_gamma, + directly is not allowed. Initialize a new instance with a different + {self.__class__}.k_contact, {self.__class__}.burial_gamma, {self.__class__}.direct_gamma, {self.__class__}.protein_gamma, or {self.__class__}.water_gamma instead.""") def native_energy(self): @@ -243,9 +243,9 @@ def native_energy(self): def subclass_setup_helper(self): """ - This method calls methods to calculate native indicator functions, - masks (based on the native distance matrix), and native energy, - then optionally sets up the potts model. 
+ This method calls methods to calculate native indicator functions (optional), + masks (based on the native distance matrix), native energy (optional), + and potts model (optional). This method is intended to be called as the last step of __init__ in each subclass of AWSEMBase. The subclasses may differ in how @@ -320,7 +320,6 @@ def calculate_energy_and_potts(self): If you want to get the energies for your own purposes, set self.potts to True and then call this method again.""") - def compute_configurational_decoy_statistics(self): raise NotImplementedError("Subclasses must define this method") @@ -331,6 +330,21 @@ def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): mean_decoy_energy, std_decoy_energy = self.compute_configurational_decoy_statistics(n_decoys=n_decoys,aa_freq=aa_freq) return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) + def mutational_frustration(self): + # This algorithm is defined in the Frustratometer class + # because it applies to both AWSEM and DCA frustratometry, + # and both the AWSEMBase and DCA classes inherit from Frustratometer. 
+ # Our goal here is just to provide an interface that matches that used + # for configurational frustration, which has no DCA analog and therefore + # is not defined in Frustratometer (although Frustratometer.frustration + # calls the configurational_frustration method of this class if passed + # the kind='configurational' argument) + return super().frustration(kind='mutational') + + def singleresidue_frustration(self): + # see note for mutational_frustration method + return super().frustration(kind='singleresidue') + class AWSEM(AWSEMBase): diff --git a/frustratometer/classes/Frustratometer.py b/frustratometer/classes/Frustratometer.py index d61b5c92..462b1113 100644 --- a/frustratometer/classes/Frustratometer.py +++ b/frustratometer/classes/Frustratometer.py @@ -233,7 +233,8 @@ def scores(self): """ return frustration.compute_scores(self.potts_model) - def frustration(self, sequence:str = None, kind:str = 'singleresidue', mask:np.array = None, aa_freq:np.array = None, correction:int = 0) -> np.array: + def frustration(self, sequence:str = None, kind:str = 'singleresidue', mask:np.array = None, aa_freq:np.array = None, + correction:int = 0, n_decoys:int = 4000) -> np.array: """ Calculates frustration index values. 
@@ -264,9 +265,11 @@ def frustration(self, sequence:str = None, kind:str = 'singleresidue', mask:np.a frustration_values=frustration.compute_single_frustration(decoy_fluctuation, aa_freq, correction) return frustration_values elif kind in ['mutational', 'configurational', 'contact']: - if kind == 'configurational' and 'configurational_frustration' in dir(self): - #TODO: Correct this function for different aa_freq than WT - return self.configurational_frustration(None, correction) + if kind == 'configurational': + if 'configurational_frustration' in dir(self): + return self.configurational_frustration(aa_freq=aa_freq, correction=correction, n_decoys=n_decoys) + else: + raise ValueError("kind='configurational' may only be used on objects implementing self.configurational_frustration") if aa_freq is None: aa_freq = self.contact_freq frustration_values=frustration.compute_pair_frustration(decoy_fluctuation, aa_freq, correction) From ccb42e35c56e483411a0a3b16b70420103c837d4 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Mon, 8 Dec 2025 16:59:14 -0600 Subject: [PATCH 63/76] committing numba code and configuring imports --- frustratometer/__init__.py | 1 + frustratometer/classes/AWSEM.py | 2 + frustratometer/frustration/__init__.py | 4 +- .../numba_util/frustration_algorithms.py | 401 +++++ frustratometer/numba_util/hamiltonian.py | 1444 +++++++++++++++++ 5 files changed, 1851 insertions(+), 1 deletion(-) create mode 100644 frustratometer/numba_util/frustration_algorithms.py create mode 100644 frustratometer/numba_util/hamiltonian.py diff --git a/frustratometer/__init__.py b/frustratometer/__init__.py index c381858d..cb1d16d2 100644 --- a/frustratometer/__init__.py +++ b/frustratometer/__init__.py @@ -13,6 +13,7 @@ from . import align from . import frustration from . import optimization +from . 
import numba_util # Handle versioneer from ._version import get_versions diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index c75d3476..bb1bc075 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -1,6 +1,8 @@ import numpy as np from ..utils import _path from .. import frustration +from .. import numba_util +from ..numba_util import ham, algos from .Frustratometer import Frustratometer from .Gamma import Gamma from pydantic import BaseModel, Field, ConfigDict diff --git a/frustratometer/frustration/__init__.py b/frustratometer/frustration/__init__.py index 7d6d920c..fed7a071 100644 --- a/frustratometer/frustration/__init__.py +++ b/frustratometer/frustration/__init__.py @@ -7,6 +7,8 @@ """ from .frustration import * +#from .numba_hamiltonian import * + __all__ = ['compute_mask', 'compute_native_energy', 'compute_fields_energy', 'compute_couplings_energy', 'compute_sequences_energy', 'compute_singleresidue_decoy_energy_fluctuation', @@ -14,4 +16,4 @@ 'compute_contact_decoy_energy_fluctuation', 'compute_decoy_energy', 'compute_aa_freq', 'compute_contact_freq', 'compute_single_frustration', 'compute_pair_frustration', 'compute_scores', 'compute_roc', 'compute_auc', 'plot_roc', 'plot_singleresidue_decoy_energy', 'write_tcl_script', - 'call_vmd', 'canvas'] + 'call_vmd', 'canvas', 'ham'] diff --git a/frustratometer/numba_util/frustration_algorithms.py b/frustratometer/numba_util/frustration_algorithms.py new file mode 100644 index 00000000..6444a6c9 --- /dev/null +++ b/frustratometer/numba_util/frustration_algorithms.py @@ -0,0 +1,401 @@ +""" +Functions for frustration calculations with numba. +Relies upon numba_hamiltonian module to evaluate the potential. + +Sometimes, the Potts model for a system requires more RAM than we have available. +One solution to this challenge is to calculate energies on the fly instead of +storing them in a massive array. 
To speed up evaluation of the many loops needed +to calculate quantities on the fly, we would like to use numba. Unfortunately, +numba struggles to jit-compile most python objects, like Structure and +AWSEM. Our solution is to define functions that take attributes from our +python objects as parameters, which we can then jit without issue. + +The object-oriented interfaces found elsewhere in this repository should +offer an option called something like "use_numba" or "ram_limited" that, +when set to True, results in these numba utilities being called. +""" + +import numpy as np +import numba +from numba import njit, prange, int64, float64, boolean + +from . import hamiltonian as ham + +signature = numba.types.UniTuple(float64,2)( + float64[:,:], + float64[:], float64[:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + int64, int64[:], int64[:]) +def pair_decoy_stats( + allowed_thetaIthetaIIelectrostatic, + allowed_rho_i, allowed_rho_j, + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, + n_decoys, seq_index_i, seq_index_j): + """ + Generate distribution of pair energies by randomly sampling + indicators and gammas, then compute the mean and + and standard deviation of the distribution. 
+ + The sampling is performed in the following way: + - Randomly select a row from thetaIthetaIIelectrostatic_array, + representing the pairwise distance-based indicator functions + for a particular pair of residues (i,j) + - Randomly select a rho value for residue i from allowed_rho_i + - Randomly select a rho value for residue j from allowed_rho_j + - Randomly select an amino acid type for residue i + from seq_index_i + - Randomly select an amino acid type for residue j + from seq_index_j + - Randomly select an amino acid type for residue j + - Get the appropriate gammas for the pair (i,j) + - Compute the pair energy given the indicators and gammas + + When writing this function, I'm thinking of seq_index_i and + seq_index_j as the seq_index of the protein (list equal to + the length of the protein where each element represents the + amino acid type at its position). But you can get aa_freq + behavior by replacing seq_index with a different array having + different amino acid types in your desired proportions. + + Similarly, I'm thinking of allowed_thetaIthetaIIelectrostatic as + including one set of {thetaI, thetaII, electrostatic_indicator} + for each pair of residues in the protein meeting some mask + condition (applied by the user before calling this function). + + Note that this function uses the deprecated np.random.choice() + function to take a uniform random sample of our arrays. Getting + random number generators to work with numba is tricky, so it's + probably best to stick with this way of doing things. + + Parameters + ---------- + - allowed_thetaIthetaIIelectrostatic : np.array(C_1, 3) + thetaI, thetaII, and electrostatic indicator values for + all C_1 allowed contacts. + Each set {thetaI_i, thetaII_i, electrostatic_indicator_i} + should be repeated multiple times in proportion to the + desired probability. + - allowed_rho_i : np.array(C_2,) + All C_2 choices of rho allowed for residue "i". 
+ Each unique value should be repeated multiple times + in proportion to the desired probability. + - allowed_rho_j : np.array(C_3,) + All C_3 choices of rho allowed for residue "j". + Each unique value should be repeated multiple times + in proportion to the desired probability. + - lambda_direct : float + Scale factor for direct interaction energies. + Should probably be 1 kcal/mol (4.184 kJ/mol), + but has sometimes been set to 0.75 kcal/mol along with lambda_protein and lambda_water. + - direct_gamma : np.array(20,20) + Array formatted in the same way as self.direct_gamma from the AWSEM class. + Order may vary (ACDE vs. ARND). + - lambda_protein : float + Scale factor for protein-mediated interaction energies. + Should probably be 1 kcal/mol (4.184 kJ/mol), + but has sometimes been set to 0.75 kcal/mol along with lambda_direct and lambda_water. + - protein_gamma : np.array(20,20) + Array formatted in the same way as self.protein_gamma from the AWSEM class. + Order may vary (ACDE vs. ARND). + - lambda_water : float + Scale factor for water-mediated interaction energies. + Should probably be 1 kcal/mol (4.184 kJ/mol), + but has sometimes been set to 0.75 kcal/mol along with lambda_direct and lambda_protein. + - water_gamma : np.array(20,20) + Array formatted in the same way as self.water_gamma from the AWSEM class. + Order may vary (ACDE vs. ARND). + - lambda_burial : float + Scale factor for burial interaction energies. + Should be 1 kcal/mol (4.184 kJ/mol). + - burial_gamma : np.array(20,3) + Array formatted in the same way as self.burial_gamma from the AWSEM class. + Order along axis 0 may vary (ACDE vs. ARND), but should always be ordered as + [low density, medium density, high density] along axis 1. + - lambda_electrostatic : float + Our electrostatic "lambda" and "gamma" are different from those for our other + terms in that they seek to represent fundamental from the bottom up, + rather than the top-down optimization followed for the other gammas. 
+ Specifically, the "lambda" is the conversion factor from fundamental + charge units to kJ/mol, adjusted for the (uniform component of the) + solvent dielectric screening. (Heterogeneities in the solvation structures of + ions are accounted for in the electrostatics indicator function). + - electrostatic_gamma : np.array(20,20) + Our electrostatic "lambda" and "gamma" are different from those for our other + terms in that they seek to represent fundamental from the bottom up, + rather than the top-down optimization followed for the other gammas. + Specifically, the "gamma" is the product of the expected fundamental charges + of the side chain -- usually +/-1, but we could do -2 for phosphorylation + - n_decoys : int + Number of samples drawn to construct the distribution of pair energies. + Ideally, n_decoys = infinity. + - seq_index_i : np.array(C_4,) + All C_4 choices of amino acid type allowed for residue "i". + Necessarily repeats amino acid types for C_4 > 20. + Each unique value should be repeated multiple times + in proportion to the desired probability. + - seq_index_j : np.array(C_5,) + All C_5 choices of amino acid type allowed for residue "j". + Necessarily repeats amino acid types for C_5 > 20. + Each unique value should be repeated multiple times + in proportion to the desired probability. 
+ + Returns + ------- + mean : float + Average energy of the decoys + stdev : float + Standard deviation of the energies of the decoys + """ + # randomly choose (with replacement) indices to sample, + # then generate arrays containing the randomly sampled values + thetaIthetaIIelectrostatic_array = allowed_thetaIthetaIIelectrostatic\ + [np.random.choice(allowed_thetaIthetaIIelectrostatic.shape[0],size=n_decoys),:] + rho_i_array = allowed_rho_i[np.random.choice(allowed_rho_i.shape[0],size=n_decoys)] + rho_j_array = allowed_rho_j[np.random.choice(allowed_rho_j.shape[0],size=n_decoys)] + aa_i_array = seq_index_i[np.random.choice(seq_index_i.shape[0],size=n_decoys)] + aa_j_array = seq_index_j[np.random.choice(seq_index_j.shape[0],size=n_decoys)] + # calculate pair energies and fill array + pair_energies = np.zeros(n_decoys) + for counter in prange(n_decoys): + thetaI = thetaIthetaIIelectrostatic_array[counter,0] + thetaII = thetaIthetaIIelectrostatic_array[counter,1] + electrostatic_indicator = thetaIthetaIIelectrostatic_array[counter,2] + rho_i = rho_i_array[counter] + rho_j = rho_j_array[counter] + aa_i = aa_i_array[counter] # already a sampled amino acid code; do not index seq_index_i again + aa_j = aa_j_array[counter] # already a sampled amino acid code; do not index seq_index_j again + gamma_bi = burial_gamma[aa_i,:] + gamma_bj = burial_gamma[aa_j,:] + gamma_d = direct_gamma[aa_i, aa_j] + gamma_p = protein_gamma[aa_i, aa_j] + gamma_w = water_gamma[aa_i, aa_j] + gamma_e = electrostatic_gamma[aa_i, aa_j] + pair_energy = ham.compute_pair_energy_ij_useful( + rho_i, rho_j, thetaI, thetaII, electrostatic_indicator, + lambda_direct, gamma_d, lambda_protein, gamma_p, lambda_water, gamma_w, + lambda_burial, gamma_bi, gamma_bj, lambda_electrostatic, gamma_e) + #burial_energy_i = ham.compute_burial_potential_i_from_rho_gamma(rho_i, lambda_burial, gamma_bi) + #burial_energy_j = ham.compute_burial_potential_i_from_rho_gamma(rho_j, lambda_burial, gamma_bj) + #direct_energy = ham.compute_direct_potential_ij_from_thetaI_gamma(thetaI, lambda_direct, gamma_d) + #protein_energy, 
water_energy = ham.compute_long_potentials_ij_from_rho_thetaII_gamma( + # rho_i, rho_j, thetaII, lambda_protein, gamma_p, lambda_water, gamma_w) + #electrostatic_energy = ham.compute_electrostatic_potential_ij_from_indicator_gamma( + # lambda_electrostatic, gamma_e, electrostatic_indicator) + #pair_energy = burial_energy_i + burial_energy_j + direct_energy +\ + # protein_energy + water_energy + electrostatic_energy + pair_energies[counter] = pair_energy + mean = np.average(pair_energies) + stdev = np.std(pair_energies) + return mean, stdev +pair_decoy_stats_parallel = njit(signature_or_function=signature, parallel=True)(pair_decoy_stats) +pair_decoy_stats = njit(signature_or_function=signature)(pair_decoy_stats) +#pair_decoy_stats_parallel = njit(parallel=True)(pair_decoy_stats).compile(signature) +#pair_decoy_stats = njit()(pair_decoy_stats).compile(signature) +#pair_decoy_stats_parallel = njit(pair_decoy_stats, +# signature_or_function=signature, parallel=True) +#pair_decoy_stats = njit(pair_decoy_stats, signature_or_function=signature) +# +@njit(signature_or_function=numba.types.UniTuple(float64,2)( + float64, int64, int64[:], int64[:], float64[:,:], + float64, float64, + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + int64, int64[:]), + parallel=True) # we definitely want to parallelize this function +def standard_config_decoy_stats( + l_D, min_seq_sep_rho, chain_starts, chain_ends, dist_mat, + min_dist_decoy_gen, max_dist_decoy_gen, + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, + n_decoys, seq_index): + """ + Get mean and standard deviation of decoy energies + following the standard configurational frustration algorithm. 
+ + Parameters + ---------- + l_D : float + Screening length for Debye-Huckel electrostatics, in units of Angstroms + min_seq_sep_rho : int + The minimum distance in sequence for two residues to contribute to each others' + rho. Include i,j (i.e., set mask bit bool to True) if |i-j| >= min_seq_sep_rho. + chain_starts : np.array(N_c) + List of 0-indexed residue indices marking the start of each chain, + for example, array([0]) for the case of a single chain (N_c==1). + chain_ends : np.array(N_c) + List of 0-indexed residue indices marking the end of each chain, + for example, array([L-1]) for the case of a single chain (N_c==1). + dist_mat : np.array(L,L) + Pairwise distance matrix for the entire protein system + min_dist_decoy_gen : float + Discard distances lower than this value from the distribution + max_dist_decoy_gen : float + Discard distances greater than this value from the distribution + lambda_direct : float + Scale factor for direct interaction energies. + Should probably be 1 kcal/mol (4.184 kJ/mol), + but has sometimes been set to 0.75 kcal/mol along with lambda_protein and lambda_water. + direct_gamma : np.array(20,20) + Array formatted in the same way as self.direct_gamma from the AWSEM class. + Order may vary (ACDE vs. ARND). + lambda_protein : float + Scale factor for protein-mediated interaction energies. + Should probably be 1 kcal/mol (4.184 kJ/mol), + but has sometimes been set to 0.75 kcal/mol along with lambda_direct and lambda_water. + protein_gamma : np.array(20,20) + Array formatted in the same way as self.protein_gamma from the AWSEM class. + Order may vary (ACDE vs. ARND). + lambda_water : float + Scale factor for water-mediated interaction energies. + Should probably be 1 kcal/mol (4.184 kJ/mol), + but has sometimes been set to 0.75 kcal/mol along with lambda_direct and lambda_protein. + water_gamma : np.array(20,20) + Array formatted in the same way as self.water_gamma from the AWSEM class. + Order may vary (ACDE vs. ARND). 
+ lambda_burial : float + Scale factor for burial interaction energies. + Should be 1 kcal/mol (4.184 kJ/mol). + burial_gamma : np.array(20,3) + Array formatted in the same way as self.burial_gamma from the AWSEM class. + Order along axis 0 may vary (ACDE vs. ARND), but should always be ordered as + [low density, medium density, high density] along axis 1. + lambda_electrostatic : float + Our electrostatic "lambda" and "gamma" are different from those for our other + terms in that they seek to represent fundamental from the bottom up, + rather than the top-down optimization followed for the other gammas. + Specifically, the "lambda" is the conversion factor from fundamental + charge units to kJ/mol, adjusted for the (uniform component of the) + solvent dielectric screening. (Heterogeneities in the solvation structures of + ions are accounted for in the electrostatics indicator function). + electrostatic_gamma : np.array(20,20) + Our electrostatic "lambda" and "gamma" are different from those for our other + terms in that they seek to represent fundamental from the bottom up, + rather than the top-down optimization followed for the other gammas. + Specifically, the "gamma" is the product of the expected fundamental charges + of the side chain -- usually +/-1, but we could do -2 for phosphorylation + n_decoys : int + Number of samples drawn to construct the distribution of pair energies. + Ideally, n_decoys = infinity. + seq_index : np.array(L,) + Array equal in length to the number of amino acids in the protein, + where each element is the numerical code for the amino acid + at that position. Numerical codes are determined by the position + of the one-letter code of the amino acid in the string of all + one-letter amino acid codes, and so should range from 0 to 19. + The string of all one-letter amino acid codes is probably + "ARND..." or "ACDE...", alphabetical by 3-letter code or 1-letter code. 
+ + Returns + ------- + mean : float + Average energy of the decoys + stdev : float + Standard deviation of the energies of the decoys + """ + # calculate rho + C_2 = dist_mat.shape[0] + allowed_rho_i = np.zeros(C_2) + for counter in prange(C_2): + allowed_rho_i[counter] = ham.compute_rho_i(counter, + min_seq_sep_rho, chain_starts, chain_ends, dist_mat) + allowed_rho_j = allowed_rho_i + # calculate distance-based indicators + #triu_indices = np.triu_indices(C_2,k=1) + #distances = dist_mat[triu_indices[0], triu_indices[1]] + #distances = distances[(distances<=max_dist_decoy_gen)&(distances>=min_dist_decoy_gen)] + distances = np.zeros(((C_2**2)-C_2)//2) # maximum possible number of distances + num_distances = 0 + for i in range(C_2): + for j in range(i+1, C_2): + dist_ij = dist_mat[i,j] + if min_dist_decoy_gen <= dist_ij <= max_dist_decoy_gen: + distances[num_distances] = dist_ij + num_distances += 1 + distances = distances[:num_distances+1] + C_1 = distances.shape[0] + allowed_thetaIthetaIIelectrostatic = np.zeros((C_1, 3)) + for counter in prange(C_1): + dist_ij = distances[counter] + allowed_thetaIthetaIIelectrostatic[counter,0] = ham.compute_thetaI(dist_ij) + allowed_thetaIthetaIIelectrostatic[counter,1] = ham.compute_thetaII(dist_ij) + allowed_thetaIthetaIIelectrostatic[counter,2] = ham.compute_electrostatic_indicator(l_D, dist_ij) + # assign pools of aa types to draw from + seq_index_i = seq_index + seq_index_j = seq_index + # send our formatted data to numba function for rapid sampling + mean, stdev = pair_decoy_stats(allowed_thetaIthetaIIelectrostatic, + allowed_rho_i, allowed_rho_j, + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, + n_decoys, seq_index_i, seq_index_j) + return mean, stdev +# + + +## no numba for this function +#def compute_frustration_matrix(dist_mat, +# min_seq_sep_rho, min_seq_sep_frust_index, +# chain_starts, 
chain_ends, +# seq_index, +# lambda_direct, direct_gamma, +# lambda_protein, protein_gamma, +# lambda_water, water_gamma, +# lambda_burial, burial_gamma, +# lambda_electrostatic, electrostatic_gamma, l_D, +# decoy_stats_method): +# """ +# Calculate matrix of frustration indices +# +# Parameters +# ---------- +# decoy_stats_method : callable +# function that returns decoy mean and standard deviation +# (recommend numba_util.pair_decoy_stats_config) +# others : +# See module-level docstring +# +# Returns +# ------- +# frustration_matrix: +# Matrix of the same shape as dist_mat, where each element (i,j) +# is, if unmasked, the frustration index of the pair (i,j), or, +# if masked, np.nan. +# """ +# pair_energy_matrix = compute_pair_energy_matrix( +# dist_mat, +# min_seq_sep_rho, min_seq_sep_frust_index, +# chain_starts, chain_ends, +# seq_index, +# lambda_direct, direct_gamma, +# lambda_protein, protein_gamma, +# lambda_water, water_gamma, +# lambda_burial, burial_gamma, +# lambda_electrostatic, electrostatic_gamma, l_D) +# mean, stdev = decoy_stats_method(dist_mat, +# min_dist_decoy_gen, max_dist_decoy_gen, +# min_seq_sep_rho, +# lambda_direct, direct_gamma, +# lambda_protein, protein_gamma, +# lambda_water, water_gamma, +# lambda_burial, burial_gamma, +# lambda_electrostatic, electrostatic_gamma, l_D) +# # will generate warnings about np.nan +# frustration_matrix = (pair_energy_matrix - mean) / stdev +# return frustration_matrix \ No newline at end of file diff --git a/frustratometer/numba_util/hamiltonian.py b/frustratometer/numba_util/hamiltonian.py new file mode 100644 index 00000000..18d7ca16 --- /dev/null +++ b/frustratometer/numba_util/hamiltonian.py @@ -0,0 +1,1444 @@ +""" +Hierarchy of functions for AMW/tertiary/frustratometer/potts +Hamiltonian calculations with numba. + +Sometimes, the Potts model for a system requires more RAM than we have available. 
+One solution to this challenge is to calculate energies on the fly instead of +storing them in a massive array. To speed up evaluation of the many loops needed +to calculate quantities on the fly, we would like to use numba. Unfortunately, +numba struggles to jit-compile most python objects, like Structure and +AWSEM. Our solution is to define functions that take attributes from our +python objects as parameters, which we can then jit without issue. + +The object-oriented interfaces found elsewhere in this repository should +offer an option called something like "use_numba" or "ram_limited" that, +when set to True, results in these numba utilities being called. + +Conventions +----------- + +This is the complete list of parameters that may be used by any function: +i, j, l_D, min_seq_sep_rho, min_seq_sep_contact, min_seq_sep_electrostatic, +min_seq_sep_frust_index, min_seq_sep, seq_sep, chain_starts, chain_ends, +same_chain, min_dist, max_dist, dist_mat, dist_ij, rho_i, rho_j, +thetaI, thetaII, sigma_water, lambda_direct, direct_gamma, lambda_protein, +protein_gamma, gamma_p, lambda_water, water_gamma, gamma_w, lambda_burial, burial_gamma, +lambda_electrostatic, electrostatic_gamma, gamma, seq_index, parallel + +No function uses all these parameters, but all functions use a subset of +these parameters. The subset of parameters is always ordered the same +as it is in the above list. The meanings of the parameters are given +below, in order. + +Parameters to select the residue(s) for a computation +- i : int + 0-indexed position of residue "i" in the complete system +- j : int + 0-indexed position of residue "j" in the complete system + +Mathematical parameters of the indicator functions +- l_D: float + Screening length for Debye-Huckel electrostatics, in units of Angstroms + +Parameters for evaluating mask conditions +- min_seq_sep_rho : int + The minimum distance in sequence for two residues to contribute to each others' + rho. 
Include i,j (i.e., set mask bit bool to True) if |i-j| >= min_seq_sep_rho. +- min_seq_sep_contact : int + The minimum distance in sequence for a contact to be considered "real" and unmasked. + Include i,j (i.e., set mask bit bool to True) if |i-j| >= min_seq_sep_contact. +- min_seq_sep_electrostatic : int + The minimum distance in sequence for a charged pair to be considered "real" and unmasked. + Include i,j (i.e., set mask bit bool to True) if |i-j| >= min_seq_sep_electrostatic. +- min_seq_sep_frust_index : int + The minimum distance in sequence for a pair's frustration index to be + calculated (frustration index is set to np.nan if not satisfied) +- min_seq_sep : int + Sequence separation used to determine whether two residues "see" each other; + what it means for two residues to "see" each other depends on the context + (see min_seq_sep_contact and min_seq_sep_rho) +- seq_sep : int + Actual distance in sequence between two residues, |i-j| +- chain_starts : np.array(N_c) + List of 0-indexed residue indices marking the start of each chain, + for example, array([0]) for the case of a single chain (N_c==1). +- chain_ends : np.array(N_c) + List of 0-indexed residue indices marking the end of each chain, + for example, array([L-1]) for the case of a single chain (N_c==1). 
+- same_chain : bool + Whether the two residues i and j are part of the same chain +- min_dist : float + Residues closer in space than this distance are masked +- max_dist : float + Residues further in space than this distance are masked + +Parameters holding the values of indicator functions or +quantities needed to compute indicator functions +- dist_mat : np.array (L,L) + Distance matrix for all residue pairs +- dist_ij : float + Distance between two residues, in angstroms +- rho_i : float + Rho value of residue "i" +- rho_j : float + Rho value of residue "j" +- burial_indicator : np.array(3,) + Low, medium, and high burial components for a burial indicator + function for a particular residue +- thetaI : float + Value of the short-range indicator function for a pair of residues. + This indicator function is used to compute the direct interaction + and as an input to the rho computation +- thetaII : float + Value of the long-range indicator function for a pair of residues. + This indicator function is used to compute the protein-mediated + and water-mediated interactions +- sigma_water : float + Used to determine whether a pair of residues is in a solvent- + exposed or buried environment. +- electrostatic_indicator : float + Effective interaction strength of two charged residues, + based on their distance and debye-huckel screening length + +Parameters needed to compute energies but not indicator functions +- lambda_direct : float + Scale factor for direct interaction energies. + Should probably be 1 kcal/mol (4.184 kJ/mol), + but has sometimes been set to 0.75 kcal/mol along with lambda_protein and lambda_water. +- direct_gamma : np.array(20,20) + Array formatted in the same way as self.direct_gamma from the AWSEM class. + Order may vary (ACDE vs. ARND). +- gamma_d : float + Like the plain gamma argument (see below) +- lambda_protein : float + Scale factor for protein-mediated interaction energies. 
+ Should probably be 1 kcal/mol (4.184 kJ/mol), + but has sometimes been set to 0.75 kcal/mol along with lambda_direct and lambda_water. +- protein_gamma : np.array(20,20) + Array formatted in the same way as self.protein_gamma from the AWSEM class. + Order may vary (ACDE vs. ARND). +- gamma_p : float + Like the plain gamma argument (see below), but differentiates the protein gamma from + the water gamma in the long-range potential calculation function +- lambda_water : float + Scale factor for water-mediated interaction energies. + Should probably be 1 kcal/mol (4.184 kJ/mol), + but has sometimes been set to 0.75 kcal/mol along with lambda_direct and lambda_protein. +- water_gamma : np.array(20,20) + Array formatted in the same way as self.water_gamma from the AWSEM class. + Order may vary (ACDE vs. ARND). +- gamma_w : float + Like the plain gamma argument (see below), but differentiates the water gamma from + the protein gamma in the long-range potential calculation function +- lambda_burial : float + Scale factor for burial interaction energies. + Should be 1 kcal/mol (4.184 kJ/mol). +- burial_gamma : np.array(20,3) + Array formatted in the same way as self.burial_gamma from the AWSEM class. + Order along axis 0 may vary (ACDE vs. ARND), but should always be ordered as + [low density, medium density, high density] along axis 1. +- gamma_bi : float + Like the plain gamma argument (see below), but is np.array(3,) instead of float +- gamma_bj : float + Like the plain gamma argument (see below), but is np.array(3,) instead of float +- lambda_electrostatic : float + Our electrostatic "lambda" and "gamma" are different from those for our other + terms in that they seek to represent fundamental from the bottom up, + rather than the top-down optimization followed for the other gammas. + Specifically, the "lambda" is the conversion factor from fundamental + charge units to kJ/mol, adjusted for the (uniform component of the) + solvent dielectric screening. 
(Heterogeneities in the solvation structures of + ions are accounted for in the electrostatics indicator function). +- electrostatic_gamma : np.array(20,20) + Our electrostatic "lambda" and "gamma" are different from those for our other + terms in that they seek to represent fundamental from the bottom up, + rather than the top-down optimization followed for the other gammas. + Specifically, the "gamma" is the product of the expected fundamental charges + of the side chain -- usually +/-1, but we could do -2 for phosphorylation +- gamma_e : float + Like the plain gamma argument (see below) +- gamma : float + Scalar gamma that has been selected from a gamma array based on the + amino acid types of residues i and j +- seq_index : np.array(L,) + Array equal in length to the number of amino acids in the protein, + where each element is the numerical code for the amino acid + at that position. Numerical codes are determined by the position + of the one-letter code of the amino acid in the string of all + one-letter amino acid codes, and so should range from 0 to 19. + The string of all one-letter amino acid codes is probably + "ARND..." or "ACDE...", alphabetical by 3-letter code or 1-letter code. + +Parameters to optimize computation efficiency +- parallel : bool + Whether to jit-compile the function with parallel=True + +Notes +----- +What we call the "(AWSEM) tertiary Hamiltonian" +or the "frustratometer Hamiltonian" or the "AMW Hamiltonian" +without electrostatics was defined in its modern form in + +Papoian, Ulander, Eastwood, Luthey-Schulten, and Wolynes, +PNAS 2004 (https://www.pnas.org/doi/10.1073/pnas.0307851100) + +This paper also gave us the gammas for the contact and burial interactions. + +Electrostatics were introduced in + +Tsai, Zheng, Balamurugan, Schafer, Kim, Cheung, and Wolynes, +Prot. Sci. 
@njit(signature_or_function=boolean(int64, int64, int64[:], int64[:]))
def check_same_chain(i, j, chain_starts, chain_ends):
    """
    Report whether residues i and j (0-indexed) lie within the same chain.

    Parameters
    ----------
    i, j : int
        0-indexed residue positions in the complete system.
    chain_starts, chain_ends : np.array(N_c)
        First and last residue index of each chain (both bounds inclusive).
        Only len(chain_starts) entries are visited, so the two arrays are
        expected to have the same length.

    Returns
    -------
    same_chain : bool
        True if one chain's [start, end] interval contains both i and j.
    """
    for chain in range(len(chain_starts)):
        first = chain_starts[chain]
        last = chain_ends[chain]
        if first <= i <= last and first <= j <= last:
            # one hit is enough; no need to scan the remaining chains
            return True
    return False
#
# NOTE: the signature must follow the parameter order below:
# (min_seq_sep:int, seq_sep:int, min_dist:float, max_dist:float,
#  same_chain:bool, dist_ij:float). Declaring max_dist as int64 would make
# numba truncate fractional cutoffs such as 8.5 when casting the argument.
@njit(signature_or_function=boolean(int64, int64, float64, float64, boolean, float64))
def mask_of_pair(min_seq_sep, seq_sep, min_dist, max_dist, same_chain, dist_ij):
    """
    Decide whether a residue pair should be considered (True) or ignored (False).

    A pair is kept when its distance lies in [min_dist, max_dist] (inclusive)
    and, in addition, either the residues are at least min_seq_sep apart in
    sequence or they belong to different chains.

    Parameters
    ----------
    min_seq_sep : int
        Minimum sequence separation required for same-chain pairs.
    seq_sep : int
        Actual sequence separation of the pair, |i-j|.
    min_dist, max_dist : float
        Inclusive spatial window; pairs outside it are masked.
    same_chain : bool
        Whether both residues are part of the same chain.
    dist_ij : float
        Distance between the two residues.

    Returns
    -------
    mask_bit : bool
        True if the pair is unmasked, False if it is masked.
    """
    within_window = min_dist <= dist_ij <= max_dist
    separated_enough = (min_seq_sep <= seq_sep) or (not same_chain)
    return within_window and separated_enough
#
@njit(signature_or_function=float64(float64, float64, float64))
def _compute_theta(dist_ij, r_min, r_max):
    """
    Smoothed window function shared by thetaI and thetaII.

    Evaluates 0.25*(1+tanh(eta*(d-r_min)))*(1+tanh(eta*(r_max-d))) with a
    fixed eta of 5 (Angstrom^-1).

    No mask is checked here: thetaI feeds both the contact indicators and rho,
    which use different minimum sequence separations, so the caller is
    responsible for masking before calling thetaI or thetaII.
    """
    theta = 0.25 * (1 + np.tanh(5*(dist_ij-r_min))) * (1 + np.tanh(5*(r_max-dist_ij)))
    return theta
@njit(signature_or_function=float64(float64))
def compute_thetaI(dist_ij):
    """
    Short-range indicator: are two residues close but not overlapping?

    Uses the window [4.5, 6.5]. The mask for the pair is NOT checked here;
    that must be done in the calling scope.

    Parameters
    ----------
    dist_ij : float
        Distance between the two residues.

    Returns
    -------
    thetaI : float
        Value of the short-range indicator function.
    """
    return _compute_theta(dist_ij, 4.5, 6.5)
@njit(signature_or_function=float64(float64))
def compute_thetaII(dist_ij):
    """
    Long-range indicator: are two residues close but not in direct contact?

    Uses the window [6.5, 9.5]. The mask for the pair is NOT checked here;
    that must be done in the calling scope.

    Parameters
    ----------
    dist_ij : float
        Distance between the two residues.

    Returns
    -------
    thetaII : float
        Value of the long-range indicator function.
    """
    return _compute_theta(dist_ij, 6.5, 9.5)
@njit(signature_or_function=float64[:](float64))
def compute_burial_indicator_i(rho_i):
    """
    Evaluate the three-component burial indicator for one residue.

    Component k (k = 0, 1, 2 for the low, medium and high density wells)
    is tanh(kappa*(rho_i - lo_k)) + tanh(kappa*(hi_k - rho_i)), where the
    wells are [0, 3], [3, 6] and [6, 9] and kappa ("burial_kappa") is 4.0.

    Parameters
    ----------
    rho_i : float
        Local density of residue i.

    Returns
    -------
    burial_indicator : np.array(3,)
        Indicator value for each well. Being a sum of two tanh terms,
        each component ranges from 0 to 2 rather than 0 to 1.
    """
    wells = np.zeros(3)
    for well in range(3):
        # well boundaries are consecutive multiples of 3: [0,3], [3,6], [6,9]
        lower = 3.0 * well
        upper = lower + 3.0
        wells[well] = np.tanh(4.0*(rho_i-lower)) + np.tanh(4.0*(upper-rho_i))
    return wells
#
@njit(signature_or_function=float64(float64,float64))
def compute_sigma_water(rho_i, rho_j):
    """
    Water-mediation weight for a pair, from the two local densities.

    Each residue contributes a switching factor (1 - tanh(7*(rho - 2.6)));
    the product is scaled by 0.25. When both densities are well below 2.6
    (both exposed) the result approaches 1; when either density is well
    above 2.6 (buried) the result approaches 0.

    Parameters
    ----------
    rho_i, rho_j : float
        Local densities of the two residues.

    Returns
    -------
    sigma_water : float
        Fraction of the long-range interaction treated as water-mediated.
    """
    exposure_i = 1-np.tanh(7*(rho_i-2.6))
    exposure_j = 1-np.tanh(7*(rho_j-2.6))
    return 0.25*exposure_i*exposure_j
@njit(signature_or_function=float64(float64[:], float64, float64[:]))
def compute_burial_potential_i_from_indicator_gamma(burial_indicator, lambda_burial, gamma):
    """
    Burial energy of one residue from a precomputed burial indicator.

    Sums the low-, medium- and high-density contributions
    (indicator[k] * gamma[k]) and scales the sum by -0.5 * lambda_burial.
    The factor 0.5 compensates for the burial indicator ranging from
    0 to 2 instead of 0 to 1 like the other indicator functions.

    Parameters
    ----------
    burial_indicator : np.array(3,)
        Low, medium and high density indicator values of the residue.
    lambda_burial : float
        Scale factor of the burial term.
    gamma : np.array(3,)
        Burial gammas of the residue's amino acid type, ordered
        [low, medium, high].

    Returns
    -------
    burial_energy : float
        Burial energy summed over the three wells.
    """
    weighted_sum = (burial_indicator[0]*gamma[0]
                    + burial_indicator[1]*gamma[1]
                    + burial_indicator[2]*gamma[2])
    return -0.5*lambda_burial * weighted_sum
@njit(signature_or_function=float64(float64, float64, float64[:]))
def compute_burial_potential_i_from_rho_gamma(rho_i, lambda_burial, gamma):
    """
    Burial energy of one residue from its local density and its gamma row.

    Builds the burial indicator from rho_i and delegates to
    compute_burial_potential_i_from_indicator_gamma.

    Parameters
    ----------
    rho_i : float
        Local density of the residue.
    lambda_burial : float
        Scale factor of the burial term.
    gamma : np.array(3,)
        Burial gammas of the residue's amino acid type.

    Returns
    -------
    burial_energy : float
        Burial energy summed over the three wells.
    """
    return compute_burial_potential_i_from_indicator_gamma(
        compute_burial_indicator_i(rho_i), lambda_burial, gamma)
@njit(signature_or_function=float64(int64, float64, float64, float64[:,:], int64[:]))
def compute_burial_potential_i_from_rho(i, rho_i, lambda_burial, burial_gamma, seq_index):
    """
    Burial energy of residue i from a precomputed local density.

    Looks up the gamma row of residue i's amino acid type
    (burial_gamma[seq_index[i]]) and delegates to
    compute_burial_potential_i_from_rho_gamma.

    Parameters
    ----------
    i : int
        0-indexed residue position.
    rho_i : float
        Local density of residue i.
    lambda_burial : float
        Scale factor of the burial term.
    burial_gamma : np.array(20,3)
        Burial gamma table indexed by amino acid code along axis 0.
    seq_index : np.array(L,)
        Amino acid code of every residue.

    Returns
    -------
    burial_energy : float
        Burial energy summed over the three wells.
    """
    residue_gamma = burial_gamma[seq_index[i]]
    return compute_burial_potential_i_from_rho_gamma(rho_i, lambda_burial, residue_gamma)
@njit(signature_or_function=float64(int64, int64, int64[:], int64[:], float64[:,:], float64, float64[:]))
def compute_burial_potential_i_from_gamma(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat, lambda_burial, gamma):
    """
    Burial energy of residue i, computing its local density on the fly.

    Obtains rho_i through compute_rho_i and delegates to
    compute_burial_potential_i_from_rho_gamma.

    Parameters
    ----------
    i : int
        0-indexed residue position.
    min_seq_sep_rho, chain_starts, chain_ends, dist_mat :
        Forwarded unchanged to compute_rho_i.
    lambda_burial : float
        Scale factor of the burial term.
    gamma : np.array(3,)
        Burial gammas of the residue's amino acid type.

    Returns
    -------
    burial_energy : float
        Burial energy summed over the three wells.
    """
    density = compute_rho_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat)
    return compute_burial_potential_i_from_rho_gamma(density, lambda_burial, gamma)
@njit(signature_or_function=float64(int64, int64, int64[:], int64[:], float64[:,:], float64, float64[:,:], int64[:]))
def compute_burial_potential_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat, lambda_burial, burial_gamma, seq_index):
    """
    Burial energy of residue i from the raw system description.

    Selects the gamma row of residue i's amino acid type and delegates to
    compute_burial_potential_i_from_gamma, which computes rho_i itself.

    Parameters
    ----------
    i : int
        0-indexed residue position.
    min_seq_sep_rho, chain_starts, chain_ends, dist_mat :
        Forwarded unchanged to the density computation.
    lambda_burial : float
        Scale factor of the burial term.
    burial_gamma : np.array(20,3)
        Burial gamma table indexed by amino acid code along axis 0.
    seq_index : np.array(L,)
        Amino acid code of every residue.

    Returns
    -------
    burial_energy : float
        Burial energy summed over the three wells.
    """
    residue_gamma = burial_gamma[seq_index[i], :]
    return compute_burial_potential_i_from_gamma(
        i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat,
        lambda_burial, residue_gamma)
# feel free to add more functions with different signatures for greater flexibility of use
#
# DIRECT POTENTIAL
@njit(signature_or_function=float64(float64, float64, float64))
def compute_direct_potential_ij_from_thetaI_gamma(thetaI, lambda_direct, gamma):
    """
    Direct-contact energy of a pair from a precomputed thetaI.

    No mask is evaluated here; the caller decides whether the pair counts.

    Parameters
    ----------
    thetaI : float
        Short-range indicator value of the pair.
    lambda_direct : float
        Scale factor of the direct term.
    gamma : float
        Direct gamma for the amino acid types of the pair.

    Returns
    -------
    direct_energy : float
        -lambda_direct * thetaI * gamma
    """
    direct_energy = -lambda_direct * thetaI * gamma
    return direct_energy
@njit(signature_or_function=float64(int64, int64, float64[:,:], float64, float64))
def compute_direct_potential_ij_from_gamma(i, j, dist_mat, lambda_direct, gamma):
    """
    Direct-contact energy of the pair (i, j) given its scalar gamma.

    Reads the pair distance from dist_mat and delegates to
    compute_direct_potential_ij_from_distij_gamma. No mask is evaluated.

    Parameters
    ----------
    i, j : int
        0-indexed residue positions.
    dist_mat : np.array(L,L)
        Pairwise distance matrix.
    lambda_direct : float
        Scale factor of the direct term.
    gamma : float
        Direct gamma for the amino acid types of the pair.

    Returns
    -------
    direct_energy : float
        Energy of the direct contact term for the pair.
    """
    pair_distance = dist_mat[i,j]
    direct_energy = compute_direct_potential_ij_from_distij_gamma(pair_distance, lambda_direct, gamma)
    return direct_energy
@njit(signature_or_function=float64(int64, int64, float64[:,:], float64, float64[:,:], int64[:]))
def compute_direct_potential_ij(i, j, dist_mat, lambda_direct, direct_gamma, seq_index):
    """
    Direct-contact energy of the pair (i, j) from the full gamma table.

    Selects direct_gamma[seq_index[i], seq_index[j]] and delegates to
    compute_direct_potential_ij_from_gamma. No mask is evaluated.

    Parameters
    ----------
    i, j : int
        0-indexed residue positions.
    dist_mat : np.array(L,L)
        Pairwise distance matrix.
    lambda_direct : float
        Scale factor of the direct term.
    direct_gamma : np.array(20,20)
        Direct gamma table indexed by the two amino acid codes.
    seq_index : np.array(L,)
        Amino acid code of every residue.

    Returns
    -------
    direct_energy : float
        Energy of the direct contact term for the pair.
    """
    aa_i = seq_index[i]
    aa_j = seq_index[j]
    return compute_direct_potential_ij_from_gamma(
        i, j, dist_mat, lambda_direct, direct_gamma[aa_i, aa_j])
#
# LONG RANGE (protein-mediated and water-mediated) CONTACT POTENTIALS
@njit(signature_or_function=numba.types.UniTuple(float64,2)(
    float64, float64, float64, float64, float64, float64))
def compute_long_potentials_ij_from_sigmawater_thetaII_gamma(thetaII, sigma_water,
        lambda_protein, gamma_p, lambda_water, gamma_w):
    """
    Protein- and water-mediated energies from precomputed indicators.

    This is the single place where the long-range energy expressions are
    written out; every other compute_long_potentials_* variant ends up here.
    No mask is evaluated.

    Parameters
    ----------
    thetaII : float
        Long-range indicator value of the pair.
    sigma_water : float
        Water-mediated fraction; the protein-mediated fraction is
        taken as 1 - sigma_water.
    lambda_protein, lambda_water : float
        Scale factors of the two terms.
    gamma_p, gamma_w : float
        Protein and water gammas for the amino acid types of the pair.

    Returns
    -------
    protein_energy : float
        -lambda_protein * thetaII * (1 - sigma_water) * gamma_p
    water_energy : float
        -lambda_water * thetaII * sigma_water * gamma_w
    """
    water_energy = -lambda_water * thetaII * sigma_water * gamma_w
    protein_energy = -lambda_protein * thetaII * (1.0 - sigma_water) * gamma_p
    return protein_energy, water_energy
@njit(signature_or_function=numba.types.UniTuple(float64,2)(float64, float64, float64, float64, float64, float64))
def compute_long_potentials_ij_from_sigmawater_distij_gamma(dist_ij, sigma_water,
        lambda_protein, gamma_p, lambda_water, gamma_w):
    """
    Protein- and water-mediated energies from a distance and sigma_water.

    Evaluates thetaII at dist_ij and delegates to
    compute_long_potentials_ij_from_sigmawater_thetaII_gamma.
    No mask is evaluated.

    Parameters
    ----------
    dist_ij : float
        Distance between the two residues.
    sigma_water : float
        Water-mediated fraction for the pair.
    lambda_protein, lambda_water : float
        Scale factors of the two terms.
    gamma_p, gamma_w : float
        Protein and water gammas for the amino acid types of the pair.

    Returns
    -------
    protein_energy : float
        Energy of the protein-mediated term for the pair.
    water_energy : float
        Energy of the water-mediated term for the pair.
    """
    return compute_long_potentials_ij_from_sigmawater_thetaII_gamma(
        compute_thetaII(dist_ij), sigma_water,
        lambda_protein, gamma_p, lambda_water, gamma_w)
@njit(signature_or_function=numba.types.UniTuple(float64,2)(int64, int64, float64, float64,
    float64, float64[:,:], float64, float64[:,:], int64[:]))
def compute_long_potentials_ij_from_sigmawater_distij(i, j, dist_ij, sigma_water,
        lambda_protein, protein_gamma, lambda_water, water_gamma, seq_index):
    """
    Protein- and water-mediated energies, selecting gammas from the tables.

    Looks up the protein and water gammas for the amino acid types of
    residues i and j, then delegates to
    compute_long_potentials_ij_from_sigmawater_distij_gamma.
    No mask is evaluated.

    Parameters
    ----------
    i, j : int
        0-indexed residue positions (used only for the gamma lookup).
    dist_ij : float
        Distance between the two residues.
    sigma_water : float
        Water-mediated fraction for the pair.
    lambda_protein, lambda_water : float
        Scale factors of the two terms.
    protein_gamma, water_gamma : np.array(20,20)
        Gamma tables indexed by the two amino acid codes.
    seq_index : np.array(L,)
        Amino acid code of every residue.

    Returns
    -------
    protein_energy : float
        Energy of the protein-mediated term for the pair.
    water_energy : float
        Energy of the water-mediated term for the pair.
    """
    aa_i = seq_index[i]
    aa_j = seq_index[j]
    return compute_long_potentials_ij_from_sigmawater_distij_gamma(
        dist_ij, sigma_water,
        lambda_protein, protein_gamma[aa_i, aa_j],
        lambda_water, water_gamma[aa_i, aa_j])
@njit(signature_or_function=numba.types.UniTuple(float64,2)(
    float64, float64, float64, float64, float64, float64, float64))
def compute_long_potentials_ij_from_rho_thetaII_gamma(rho_i, rho_j, thetaII,
        lambda_protein, gamma_p, lambda_water, gamma_w):
    """
    Protein- and water-mediated energies from local densities and thetaII.

    Derives sigma_water from the two densities and delegates to
    compute_long_potentials_ij_from_sigmawater_thetaII_gamma.
    No mask is evaluated.

    Parameters
    ----------
    rho_i, rho_j : float
        Local densities of the two residues.
    thetaII : float
        Long-range indicator value of the pair.
    lambda_protein, lambda_water : float
        Scale factors of the two terms.
    gamma_p, gamma_w : float
        Protein and water gammas for the amino acid types of the pair.

    Returns
    -------
    protein_energy : float
        Energy of the protein-mediated term for the pair.
    water_energy : float
        Energy of the water-mediated term for the pair.
    """
    sigma_water = compute_sigma_water(rho_i, rho_j)
    return compute_long_potentials_ij_from_sigmawater_thetaII_gamma(
        thetaII, sigma_water, lambda_protein, gamma_p, lambda_water, gamma_w)
@njit(signature_or_function=numba.types.UniTuple(float64,2)(
    float64, float64, float64, float64, float64, float64, float64))
def compute_long_potentials_ij_from_rho_distij_gamma(dist_ij, rho_i, rho_j,
        lambda_protein, gamma_p, lambda_water, gamma_w):
    """
    Protein- and water-mediated energies from a distance and local densities.

    Derives sigma_water from the two densities and delegates to
    compute_long_potentials_ij_from_sigmawater_distij_gamma.
    No mask is evaluated.

    Parameters
    ----------
    dist_ij : float
        Distance between the two residues.
    rho_i, rho_j : float
        Local densities of the two residues.
    lambda_protein, lambda_water : float
        Scale factors of the two terms.
    gamma_p, gamma_w : float
        Protein and water gammas for the amino acid types of the pair.

    Returns
    -------
    protein_energy : float
        Energy of the protein-mediated term for the pair.
    water_energy : float
        Energy of the water-mediated term for the pair.
    """
    sigma_water = compute_sigma_water(rho_i, rho_j)
    return compute_long_potentials_ij_from_sigmawater_distij_gamma(
        dist_ij, sigma_water, lambda_protein, gamma_p, lambda_water, gamma_w)
@njit(signature_or_function=numba.types.UniTuple(float64,2)(
    int64, int64, int64, int64[:], int64[:], float64[:,:], float64, float64, float64, float64))
def compute_long_potentials_ij_from_gamma(i, j, min_seq_sep_rho, chain_starts, chain_ends, dist_mat,
        lambda_protein, gamma_p, lambda_water, gamma_w):
    """
    Protein- and water-mediated energies of (i, j), computing rho on the fly.

    Computes rho_i and rho_j with compute_rho_i, reads the pair distance
    from dist_mat, and delegates to
    compute_long_potentials_ij_from_rho_distij_gamma.
    The pair itself is not masked here.

    Parameters
    ----------
    i, j : int
        0-indexed residue positions.
    min_seq_sep_rho, chain_starts, chain_ends, dist_mat :
        Forwarded unchanged to compute_rho_i; dist_mat also supplies dist_ij.
    lambda_protein, lambda_water : float
        Scale factors of the two terms.
    gamma_p, gamma_w : float
        Protein and water gammas for the amino acid types of the pair.

    Returns
    -------
    protein_energy : float
        Energy of the protein-mediated term for the pair.
    water_energy : float
        Energy of the water-mediated term for the pair.
    """
    rho_i = compute_rho_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat)
    rho_j = compute_rho_i(j, min_seq_sep_rho, chain_starts, chain_ends, dist_mat)
    return compute_long_potentials_ij_from_rho_distij_gamma(
        dist_mat[i,j], rho_i, rho_j, lambda_protein, gamma_p, lambda_water, gamma_w)
@njit(signature_or_function=numba.types.UniTuple(float64,2)(int64, int64, int64, int64[:], int64[:], float64[:,:],
    float64, float64[:,:], float64, float64[:,:], int64[:]))
def compute_long_potentials_ij(i, j, min_seq_sep_rho, chain_starts, chain_ends, dist_mat,
        lambda_protein, protein_gamma, lambda_water, water_gamma, seq_index):
    """
    Protein- and water-mediated energies of (i, j) from the raw system description.

    Selects the scalar gammas for the amino acid types of residues i and j
    and delegates to compute_long_potentials_ij_from_gamma.
    The pair itself is not masked here.

    Parameters
    ----------
    i, j : int
        0-indexed residue positions.
    min_seq_sep_rho, chain_starts, chain_ends, dist_mat :
        Forwarded unchanged to compute_long_potentials_ij_from_gamma.
    lambda_protein, lambda_water : float
        Scale factors of the two terms.
    protein_gamma, water_gamma : np.array(20,20)
        Gamma tables indexed by the two amino acid codes.
    seq_index : np.array(L,)
        Amino acid code of every residue.

    Returns
    -------
    protein_energy : float
        Energy of the protein-mediated term for the pair.
    water_energy : float
        Energy of the water-mediated term for the pair.
    """
    gamma_p = protein_gamma[seq_index[i], seq_index[j]]
    gamma_w = water_gamma[seq_index[i], seq_index[j]]
    # Argument order must match the callee exactly:
    # (lambda_protein, gamma_p, lambda_water, gamma_w). All four are float64,
    # so swapping gamma_p and lambda_water would compile silently and
    # produce wrong energies.
    return compute_long_potentials_ij_from_gamma(
        i, j, min_seq_sep_rho, chain_starts, chain_ends, dist_mat,
        lambda_protein, gamma_p, lambda_water, gamma_w)
# feel free to add more functions with different signatures for greater flexibility of use
#
@njit(signature_or_function=float64(float64, float64, float64))
def compute_electrostatic_potential_ij_from_indicator_gamma(electrostatic_indicator, lambda_electrostatic, gamma):
    """
    Electrostatic energy of a pair from a precomputed indicator.

    Parameters
    ----------
    electrostatic_indicator : float
        Electrostatic indicator value of the pair.
    lambda_electrostatic : float
        Scale factor of the electrostatic term.
    gamma : float
        Electrostatic gamma for the amino acid types of the pair.

    Returns
    -------
    electrostatic_energy : float
        -lambda_electrostatic * electrostatic_indicator * gamma
    """
    return -lambda_electrostatic * electrostatic_indicator * gamma
+ + Returns + ------- + electrostatic_energy : float + Energy of the electrostatic interaction between residues i and j + """ + indicator = compute_electrostatic_indicator(l_D, dist_ij) + electrostatic_energy = compute_electrostatic_potential_ij_from_indicator_gamma( + indicator, lambda_electrostatic, gamma) + return electrostatic_energy +@njit(signature_or_function=float64(int64, int64, float64, float64[:,:], float64, float64)) +def compute_electrostatic_potential_ij_from_gamma(i, j, l_D, dist_mat, lambda_electrostatic, gamma): + """ + Compute the solvation-averaged electrostatic potential + for a pair of residues. + + Parameters + ---------- + See module-level docstring. + + Returns + ------- + electrostatic_energy : float + Energy of the electrostatic interaction between residues i and j + """ + dist_ij = dist_mat[i,j] + electrostatic_energy = compute_electrostatic_potential_ij_from_distij_gamma(l_D, dist_ij, lambda_electrostatic, gamma) + return electrostatic_energy +@njit(signature_or_function=float64(int64, int64, float64, float64[:,:], float64, float64[:,:], int64[:])) +def compute_electrostatic_potential_ij(i, j, l_D, dist_mat, lambda_electrostatic, electrostatic_gamma, seq_index): + """ + Compute the solvation-averaged electrostatic potential + for a pair of residues. + + Parameters + ---------- + See module-level docstring. + + Returns + ------- + electrostatic_energy : float + Energy of the electrostatic interaction between residues i and j + """ + gamma = electrostatic_gamma[seq_index[i], seq_index[j]] + electrostatic_energy = compute_electrostatic_potential_ij_from_distij_gamma( + l_D, dist_mat[i,j], lambda_electrostatic, gamma) + return electrostatic_energy +# feel free to add more functions with different signatures for greater flexibility of use +# +########################################################################## +# FUNCTIONS TO SUM DIFFERENT ENERGY TYPES OVER AN ENTIRE PROTEIN SYSTEM. +# THESE FUNCTIONS **DO** CHECK MASK CONDITIONS! 
+# +signature = float64(int64, + int64[:], int64[:], + float64[:,:], + float64, float64[:,:], int64[:]) +def compute_burial_potential_total(min_seq_sep_rho, + chain_starts, chain_ends, + dist_mat, + lambda_burial, burial_gamma, seq_index): + """ + Compute the total burial potential for all residues in the protein system. + Iterates over all residues and sums burial energies. + + Parameters + ---------- + See module-level docstring. + + Returns + ------- + total_burial_energy : float + Sum of burial energies for all residues + """ + num_res = dist_mat.shape[0] + total_burial_energy = 0.0 + rho_array = np.zeros(num_res) + burial_indicators = np.zeros((num_res,3)) # axis 1 ordered (low, medium, high) + for i in prange(num_res): + rho_array[i] = compute_rho_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat) + burial_indicators[i] = compute_burial_indicator_i(rho_array[i]) + energy = compute_burial_potential_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat, lambda_burial, burial_gamma, seq_index) + total_burial_energy += energy + return total_burial_energy +compute_burial_potential_total_parallel = njit(parallel=True)(compute_burial_potential_total).compile(signature) +compute_burial_potential_total = njit()(compute_burial_potential_total).compile(signature) +#compute_burial_potential_total_parallel = njit(compute_burial_potential_total, +# signature_or_function=signature, parallel=True) +#compute_burial_potential_total = njit(compute_burial_potential_total, +# signature_or_function=signature) +# +signature = float64(int64, + int64[:], int64[:], + float64[:,:], + float64, float64[:,:], int64[:],) +def compute_direct_potential_total(min_seq_sep_contact, + chain_starts, chain_ends, + dist_mat, + lambda_direct, direct_gamma, seq_index,): + """ + Compute the total direct contact potential for the entire protein system. + Iterates over all residue pairs and sums direct interaction energies. 
+ + Parameters + ---------- + See module-level docstring + + Returns + ------- + total_direct_energy : float + Sum of all direct contact energies + """ + num_res = dist_mat.shape[0] + total_direct_energy = 0.0 + # loop over all pairs of residues + for i in prange(num_res): + # parallelizing inner loop doesn't make much of a difference + #for j in prange(i+1, num_res): + for j in range(i+1, num_res): + # check mask + same_chain = check_same_chain(i, j, chain_starts, chain_ends) + # 2.5 and 8.5 cutoffs: effectively, + # we're truncating the potential where the indicators are almost zero + # (thetaI(2.5)==thetaI(8.5)==2.0611536367E-9) + if not mask_of_pair(min_seq_sep_contact, abs(i-j), 2.5, 8.5, + same_chain, dist_mat[i,j]): + continue # just call it 0 energy if the pair is masked + energy = compute_direct_potential_ij(i, j, dist_mat, lambda_direct, direct_gamma, seq_index) + total_direct_energy += energy + return total_direct_energy +compute_direct_potential_total_parallel = njit(parallel=True)(compute_direct_potential_total).compile(signature) +compute_direct_potential_total = njit()(compute_direct_potential_total).compile(signature) +#compute_direct_potential_total_parallel = njit(compute_direct_potential_total, +# signature_or_function=signature, parallel=True) +#compute_direct_potential_total = njit(compute_direct_potential_total, +# signature_or_function=signature) +# +signature = numba.types.UniTuple(float64,2)(int64, int64, + int64[:], int64[:], + float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + int64[:]) +def compute_long_potentials_total(min_seq_sep_rho, min_seq_sep_contact, + chain_starts, chain_ends, + dist_mat, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + seq_index): + """ + Compute the total protein-mediated and water-mediated contact potentials + for the entire protein structure. Iterates over all residue pairs and sums + long-range interaction energies, considering local densities + for the sigma (protein vs. 
water mediated) weighting. + This function also applies the mask as appropriate. + + Parameters + ---------- + See module-level docstring. + + Returns + ------- + total_protein_energy : float + Sum of all protein-mediated contact energies + total_water_energy : float + Sum of all water-mediated contact energies + """ + num_res = dist_mat.shape[0] + total_protein_energy = 0.0 + total_water_energy = 0.0 + # Pre-compute rho for all residues + rho_array = np.zeros(num_res) + for i in prange(num_res): + rho_array[i] = compute_rho_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat) + # compute pairwise energies and add to the total of each type + for i in prange(num_res): + # parallelizing inner loop doesn't make much of a difference + #for j in prange(i+1, num_res): + for j in range(i+1, num_res): + # check contact mask + same_chain = check_same_chain(i, j, chain_starts, chain_ends) + # 4.5 and 11.5 cutoffs: effectively, + # we're truncating the potential where the indicators are almost zero + # (thetaII(4.5)==thetaII(11.5)==2.0611536367E-9 + if not mask_of_pair(min_seq_sep_contact, abs(i-j), + 4.5, 11.5, same_chain, dist_mat[i,j]): + continue # just call it 0 energy if the pair is masked + # compute sigma for this pair from precomputed rhos, then call long potentials + sigma_water = compute_sigma_water(rho_array[i], rho_array[j]) + protein_energy, water_energy = compute_long_potentials_ij_from_rho_distij_gamma( + dist_mat[i,j], rho_array[i], rho_array[j], + lambda_protein, protein_gamma[seq_index[i], seq_index[j]], + lambda_water, water_gamma[seq_index[i], seq_index[j]]) + total_protein_energy += protein_energy + total_water_energy += water_energy + return total_protein_energy, total_water_energy +compute_long_potentials_total_parallel = njit(parallel=True)(compute_long_potentials_total).compile(signature) +compute_long_potential_total = njit()(compute_long_potentials_total).compile(signature) +#compute_long_potentials_total_parallel = 
njit(compute_long_potentials_total, +# signature_or_function=signature, parallel=True) +#compute_long_potentials_total = njit(compute_long_potentials_total, +# signature_or_function=signature) +# +signature = float64(float64, int64, + int64[:], int64[:], + float64[:,:], + float64, float64[:,:], + int64[:],) +def compute_electrostatic_potential_total(l_D, min_seq_sep_electrostatic, + chain_starts, chain_ends, + dist_mat, + lambda_electrostatic, electrostatic_gamma, + seq_index): + """ + Compute the total Debye-Huckel electrostatic potential + for the entire protein structure. Iterates over all residue pairs and sums + electrostatic interaction energies. + This function also applies the mask as appropriate. + + Parameters + ---------- + See module-level docstring. + + Returns + ------- + total_electrostatic_energy : float + Sum of all electrostatic energies, masked as appropriate + """ + # will move this check and/or get rid of it + #if lambda_electrostatic == 0: + # return 0.0 # save some time if we're going to set everything to 0 anyway + num_res = dist_mat.shape[0] + total_electrostatic_energy = 0.0 + # loop over all pairs of residues + for i in prange(num_res): + # parallelizing inner loop doesn't make much of a difference + #for j in prange(i+1, num_res): + for j in range(i+1, num_res): + # check mask + same_chain = check_same_chain(i, j, chain_starts, chain_ends) + # unlike the other potentials, the electrostatic potential doesn't + # decay below some minimum distance, so the lower bound is 0; + # the upper bound varies with the debye length + if not mask_of_pair(min_seq_sep_electrostatic, abs(i-j), 0, 10*l_D, + same_chain, dist_mat[i,j]): + continue # just call it 0 energy if the pair is masked + energy = compute_electrostatic_potential_ij(i, j, l_D, dist_mat, lambda_electrostatic, electrostatic_gamma, seq_index) + total_electrostatic_energy += energy + return total_electrostatic_energy +compute_electrostatic_potential_total_parallel = 
njit(parallel=True)(compute_electrostatic_potential_total).compile(signature) +compute_electrostatic_potential_total = njit()(compute_electrostatic_potential_total).compile(signature) +#compute_electrostatic_potential_total_parallel = njit(compute_electrostatic_potential_total, +# signature_or_function=signature, parallel=True) +#compute_electrostatic_potential_total = njit(compute_electrostatic_potential_total, +# signature_or_function=signature) +# +# no numba for this function, since it doesn't have any loops or do any intensive computation +def compute_potential_total(l_D, min_seq_sep_rho, min_seq_sep_contact, min_seq_sep_electrostatic, + chain_starts, chain_ends, + dist_mat, + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, + seq_index, parallel): + """ + Compute the total AWSEM energy for the entire protein system. + + CAUTION: this is NOT the sum over all i and j of compute_pair_energy_ij + (compute_pair_energy_ij is found below with the frustration utilities). + Taking the sum of compute_pair_energy_ij over all i and j would overcount + each residue's burial energy, since it is included in the pair energy + of all contacts in which that residue participates, but the burial energy + should only be counted once for each residue. + + Aggregates direct , protein-mediated, water-mediated, + burial, and electrostatic terms. + + Parameters + ---------- + See module-level docstring. 
+ + Returns + ------- + total_energy : float + Total AWSEM energy for the protein system + """ + direct_args = (min_seq_sep_contact, + chain_starts, chain_ends, + dist_mat, + lambda_direct, direct_gamma, + seq_index) + long_args = (min_seq_sep_rho, min_seq_sep_contact, + chain_starts, chain_ends, + dist_mat, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + seq_index) + burial_args = (min_seq_sep_rho, + chain_starts, chain_ends, + dist_mat, + lambda_burial, burial_gamma, + seq_index) + electrostatic_args = (l_D, min_seq_sep_electrostatic, + chain_starts, chain_ends, + dist_mat, + lambda_electrostatic, electrostatic_gamma, + seq_index) + if parallel: + direct_e = compute_direct_potential_total_parallel(*direct_args) + protein_e, water_e = compute_long_potentials_total_parallel(*long_args) + burial_e = compute_burial_potential_total_parallel(*burial_args) + electrostatic_e = compute_electrostatic_potential_total_parallel(*electrostatic_args) + else: + direct_e = compute_direct_potential_total(*direct_args) + protein_e, water_e = compute_long_potentials_total(*long_args) + burial_e = compute_burial_potential_total(*burial_args) + electrostatic_e = compute_electrostatic_potential_total(*electrostatic_args) + total_energy = direct_e + protein_e + water_e + burial_e + electrostatic_e + return total_energy +# +######################################################################################### +# PAIR ENERGY: burial(i)+burial(j)+direct(i,j)+protein(i,j)+water(i,j)+electrostatic(i,j) +# important: total energy is NOT sum over all pairs ij of pair_energy(i,j) +# these functions DO NOT check mask conditions +@njit(signature_or_function=float64( + float64, float64, float64, float64, float64, + float64, float64, float64, float64, float64, float64, + float64, float64[:], float64[:], float64, float64)) +def compute_pair_energy_ij_useful( + rho_i, rho_j, thetaI, thetaII, electrostatic_indicator, + lambda_direct, gamma_d, lambda_protein, gamma_p, lambda_water, 
gamma_w, + lambda_burial, gamma_bi, gamma_bj, lambda_electrostatic, gamma_e): + # useful parameter set for frustration calculations + burial_energy_i = compute_burial_potential_i_from_rho_gamma(rho_i, lambda_burial, gamma_bi) + burial_energy_j = compute_burial_potential_i_from_rho_gamma(rho_j, lambda_burial, gamma_bj) + direct_energy = compute_direct_potential_ij_from_thetaI_gamma(thetaI, lambda_direct, gamma_d) + protein_energy, water_energy = compute_long_potentials_ij_from_rho_thetaII_gamma( + rho_i, rho_j, thetaII, lambda_protein, gamma_p, lambda_water, gamma_w) + electrostatic_energy = compute_electrostatic_potential_ij_from_indicator_gamma( + electrostatic_indicator, lambda_electrostatic, gamma_e) + pair_energy = burial_energy_i + burial_energy_j + direct_energy +\ + protein_energy + water_energy + electrostatic_energy + return pair_energy +@njit(signature_or_function=float64( + int64, int64, float64, int64, int64[:], int64[:], + float64[:,:], + float64, float64, + float64, float64, + float64, float64, + float64, float64[:], float64[:], + float64, float64)) +def compute_pair_energy_ij_from_gamma( + i, j, l_D, min_seq_sep_rho, chain_starts, chain_ends, + dist_mat, + lambda_direct, gamma_d, + lambda_protein, gamma_p, + lambda_water, gamma_w, + lambda_burial, gamma_bi, gamma_bj, + lambda_electrostatic, gamma_e): + direct_energy = compute_direct_potential_ij_from_gamma(i, j, + dist_mat, + lambda_direct, gamma_d) + protein_energy, water_energy = compute_long_potentials_ij_from_gamma( + i, j, min_seq_sep_rho, chain_starts, chain_ends, + dist_mat, + lambda_protein, gamma_p, lambda_water, gamma_w) + burial_energy_i = compute_burial_potential_i_from_gamma(i, min_seq_sep_rho, chain_starts, chain_ends, + dist_mat, + lambda_burial, gamma_bi) + burial_energy_j = compute_burial_potential_i_from_gamma(j, min_seq_sep_rho, chain_starts, chain_ends, + dist_mat, + lambda_burial, gamma_bj) + electrostatic_energy = compute_electrostatic_potential_ij_from_gamma(i, j, l_D, + 
dist_mat, + lambda_electrostatic, gamma_e) + pair_energy = burial_energy_i + burial_energy_j + direct_energy +\ + protein_energy + water_energy + electrostatic_energy + return pair_energy +# +@njit(signature_or_function=float64( + int64, int64, float64, + float64[:,:], + float64, float64, float64, + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + int64[:])) +def compute_pair_energy_ij_from_rho_sigmawater( + i, j, l_D, + dist_mat, + rho_i, rho_j, sigma_water, + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, + seq_index): + direct_energy = compute_direct_potential_ij(i, j, dist_mat, + lambda_direct, direct_gamma, + seq_index) + protein_energy, water_energy = compute_long_potentials_ij_from_sigmawater( + i, j, + dist_mat, sigma_water, + lambda_protein, protein_gamma, lambda_water, water_gamma, + seq_index) + burial_energy_i = compute_burial_potential_i_from_rho(i, rho_i, + lambda_burial, burial_gamma, + seq_index) + burial_energy_j = compute_burial_potential_i_from_rho(j, rho_j, + lambda_burial, burial_gamma, + seq_index) + electrostatic_energy = compute_electrostatic_potential_ij(i, j, l_D, + dist_mat, + lambda_electrostatic, electrostatic_gamma, + seq_index) + pair_energy = burial_energy_i + burial_energy_j + direct_energy +\ + protein_energy + water_energy + electrostatic_energy + return pair_energy +# +@njit(signature_or_function=float64(int64, int64, float64, + float64[:,:], + float64, float64, + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + int64[:])) +def compute_pair_energy_ij_from_rho(i, j, l_D, + dist_mat, + rho_i, rho_j, + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, 
+ seq_index): + """ + Compute the "pair energy" for residues i and j, defined as the sum of: + - Direct contact energy + - Protein-mediated contact energy + - Water-mediated contact energy + - Burial energies for both residues + - Electrostatic interaction energy, if requested + + This quantity is used in the calculation of the frustration index: + Frustration Index = -1 * (pair energy - DECOY_AVERAGE) / DECOY_STDEV + + Parameters + ---------- + See module-level docstring. + + Returns + ------- + pair_energy : float + Total "pair energy" of residues i and j + """ + sigma_water = compute_sigma_water(rho_i, rho_j) + pair_energy = compute_pair_energy_ij_from_rho_sigmawater( + i, j, l_D, + dist_mat, + rho_i, rho_j, sigma_water, + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, + seq_index) + return pair_energy +# +@njit(signature_or_function=float64(int64, int64, float64, int64, int64[:], int64[:], + float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + int64[:])) +def compute_pair_energy_ij(i, j, l_D, min_seq_sep_rho, chain_starts, chain_ends, + dist_mat, + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, + seq_index): + """ + Compute the "pair energy" for residues i and j, defined as the sum of: + - Direct contact energy + - Protein-mediated contact energy + - Water-mediated contact energy + - Burial energies for both residues + - Electrostatic interaction energy, if requested + + This quantity is used in the calculation of the frustration index: + Frustration Index = -1 * (pair energy - DECOY_AVERAGE) / DECOY_STDEV + + Parameters + ---------- + See module-level docstring. 
+ + Returns + ------- + pair_energy : float + Total "pair energy" of residues i and j + """ + direct_energy = compute_direct_potential_ij(i, j, dist_mat, + lambda_direct, direct_gamma, + seq_index) + protein_energy, water_energy = compute_long_potentials_ij( + i, j, min_seq_sep_rho, chain_starts, chain_ends, + dist_mat, + lambda_protein, protein_gamma, lambda_water, water_gamma, + seq_index) + burial_energy_i = compute_burial_potential_i(i, min_seq_sep_rho, chain_starts, chain_ends, + dist_mat, + lambda_burial, burial_gamma, + seq_index) + burial_energy_j = compute_burial_potential_i(j, min_seq_sep_rho, chain_starts, chain_ends, + dist_mat, + lambda_burial, burial_gamma, + seq_index) + electrostatic_energy = compute_electrostatic_potential_ij(i, j, l_D, + dist_mat, + lambda_electrostatic, electrostatic_gamma, + seq_index) + pair_energy = burial_energy_i + burial_energy_j + direct_energy +\ + protein_energy + water_energy + electrostatic_energy + """ + alternatively, the body of this function could look like this: + aa_i = seq_index[i] + aa_j = seq_index[j] + gamma_d = direct_gamma[aa_i, aa_j] + gamma_p = protein_gamma[aa_i, aa_j] + gamma_w = water_gamma[aa_i, aa_j] + gamma_bi = burial_gamma[aa_i,:] + gamma_bj = burial_gamma[aa_j,:] + gamma_e = electrostatic_gamma[aa_i, aa_j] + pair_energy = compute_pair_energy_ij_from_gamma( + i, j, l_D, min_seq_sep_rho, chain_starts, chain_ends, + dist_mat, + lambda_direct, gamma_d, + lambda_protein, gamma_p, + lambda_water, gamma_w, + lambda_burial, gamma_bi, gamma_bj, + lambda_electrostatic, gamma_e) + """ + return pair_energy +# +signature = float64[:,:](float64, + int64, int64, + int64[:], int64[:], + float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + int64[:]) +def compute_pair_energy_matrix(l_D, + min_seq_sep_rho, min_seq_sep_frust_index, + chain_starts, chain_ends, + dist_mat, + lambda_direct, direct_gamma, + lambda_protein, 
protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, + seq_index): + """ + Make matrix of the same shape as the distance matrix, + where each element is the pair energy, or np.nan if masked. + + Parameters + ---------- + See module-level docstring + + Returns + ------- + pair_energy_matrix : np.array(dist_mat.shape) + matrix where the element (i,j) is the pair energy of (i,j) + (if unmasked) or np.nan (if masked) + """ + # Pre-compute rho for all residues + num_res = dist_mat.shape[0] + rho_array = np.zeros(num_res) + for i in prange(num_res): + rho_array[i] = compute_rho_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat) + # fill in the matrix + num_res = dist_mat.shape[0] + pair_energy_matrix = np.empty((num_res, num_res)) + for i in prange(num_res): + for j in range(i,num_res): + # check mask + same_chain = check_same_chain(i, j, chain_starts, chain_ends) + # we're computing direct, long, and electrostatics, so use + # the most conservative distance cutoff + # (which happens to be electrostatics) + unmasked = mask_of_pair(min_seq_sep_frust_index, abs(i-j), 0.0, 10*l_D, + same_chain, dist_mat[i,j],) + if unmasked: + pair_energy_matrix[i,j] = compute_pair_energy_ij_from_rho( + i, j, l_D, + dist_mat, + rho_array[i], rho_array[j], + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_burial, burial_gamma, + lambda_electrostatic, electrostatic_gamma, + seq_index) + else: + pair_energy_matrix[i,j] = np.nan + pair_energy_matrix[j,i] = pair_energy_matrix[i,j] + return pair_energy_matrix +compute_pair_energy_matrix_parallel = njit(parallel=True)(compute_pair_energy_matrix).compile(signature) +compute_pair_energy_matrix = njit()(compute_pair_energy_matrix).compile(signature) +#pair_energy_matrix_parallel = njit(compute_pair_energy_matrix, +# signature_or_function=signature, parallel=True) +#pair_energy_matrix = njit(compute_pair_energy_matrix, +# 
signature_or_function=signature) From 5a5de5ed30fe1547d70b38b560a68bdacbafb2b6 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Mon, 8 Dec 2025 17:08:43 -0600 Subject: [PATCH 64/76] reworked numba jit compilation for some functions --- .../numba_util/frustration_algorithms.py | 5 -- frustratometer/numba_util/hamiltonian.py | 49 +++++-------------- 2 files changed, 12 insertions(+), 42 deletions(-) diff --git a/frustratometer/numba_util/frustration_algorithms.py b/frustratometer/numba_util/frustration_algorithms.py index 6444a6c9..3409ae32 100644 --- a/frustratometer/numba_util/frustration_algorithms.py +++ b/frustratometer/numba_util/frustration_algorithms.py @@ -197,11 +197,6 @@ def pair_decoy_stats( return mean, stdev pair_decoy_stats_parallel = njit(signature_or_function=signature, parallel=True)(pair_decoy_stats) pair_decoy_stats = njit(signature_or_function=signature)(pair_decoy_stats) -#pair_decoy_stats_parallel = njit(parallel=True)(pair_decoy_stats).compile(signature) -#pair_decoy_stats = njit()(pair_decoy_stats).compile(signature) -#pair_decoy_stats_parallel = njit(pair_decoy_stats, -# signature_or_function=signature, parallel=True) -#pair_decoy_stats = njit(pair_decoy_stats, signature_or_function=signature) # @njit(signature_or_function=numba.types.UniTuple(float64,2)( float64, int64, int64[:], int64[:], float64[:,:], diff --git a/frustratometer/numba_util/hamiltonian.py b/frustratometer/numba_util/hamiltonian.py index 18d7ca16..03f93301 100644 --- a/frustratometer/numba_util/hamiltonian.py +++ b/frustratometer/numba_util/hamiltonian.py @@ -318,13 +318,8 @@ def compute_rho_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat): # only let the residue contribute if it isn't caught by the mask rho_i += compute_thetaI(dist_mat[i,j]) return rho_i -compute_rho_i_parallel = njit(parallel=True)(compute_rho_i) -compute_rho_i_parallel.compile(signature) -compute_rho_i = njit()(compute_rho_i) -compute_rho_i.compile(signature) -#compute_rho_i_parallel = 
njit(compute_rho_i, -# signature_or_function=signature, parallel=True) -#compute_rho_i = njit(compute_rho_i, signature_or_function=signature) +compute_rho_i_parallel = njit(signature_or_function=signature, parallel=True)(compute_rho_i) +compute_rho_i = njit(signature_or_function=signature)(compute_rho_i) # @njit(signature_or_function=float64[:](float64)) def compute_burial_indicator_i(rho_i): @@ -901,12 +896,8 @@ def compute_burial_potential_total(min_seq_sep_rho, energy = compute_burial_potential_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat, lambda_burial, burial_gamma, seq_index) total_burial_energy += energy return total_burial_energy -compute_burial_potential_total_parallel = njit(parallel=True)(compute_burial_potential_total).compile(signature) -compute_burial_potential_total = njit()(compute_burial_potential_total).compile(signature) -#compute_burial_potential_total_parallel = njit(compute_burial_potential_total, -# signature_or_function=signature, parallel=True) -#compute_burial_potential_total = njit(compute_burial_potential_total, -# signature_or_function=signature) +compute_burial_potential_total_parallel = njit(signature_or_function=signature, parallel=True)(compute_burial_potential_total) +compute_burial_potential_total = njit(signature_or_function=signature)(compute_burial_potential_total) # signature = float64(int64, int64[:], int64[:], @@ -947,12 +938,8 @@ def compute_direct_potential_total(min_seq_sep_contact, energy = compute_direct_potential_ij(i, j, dist_mat, lambda_direct, direct_gamma, seq_index) total_direct_energy += energy return total_direct_energy -compute_direct_potential_total_parallel = njit(parallel=True)(compute_direct_potential_total).compile(signature) -compute_direct_potential_total = njit()(compute_direct_potential_total).compile(signature) -#compute_direct_potential_total_parallel = njit(compute_direct_potential_total, -# signature_or_function=signature, parallel=True) -#compute_direct_potential_total = 
njit(compute_direct_potential_total, -# signature_or_function=signature) +compute_direct_potential_total_parallel = njit(signature_or_function=signature, parallel=True)(compute_direct_potential_total) +compute_direct_potential_total = njit(signature_or_function=signature)(compute_direct_potential_total) # signature = numba.types.UniTuple(float64,2)(int64, int64, int64[:], int64[:], @@ -1013,12 +1000,8 @@ def compute_long_potentials_total(min_seq_sep_rho, min_seq_sep_contact, total_protein_energy += protein_energy total_water_energy += water_energy return total_protein_energy, total_water_energy -compute_long_potentials_total_parallel = njit(parallel=True)(compute_long_potentials_total).compile(signature) -compute_long_potential_total = njit()(compute_long_potentials_total).compile(signature) -#compute_long_potentials_total_parallel = njit(compute_long_potentials_total, -# signature_or_function=signature, parallel=True) -#compute_long_potentials_total = njit(compute_long_potentials_total, -# signature_or_function=signature) +compute_long_potentials_total_parallel = njit(signature_or_function=signature, parallel=True)(compute_long_potentials_total) +compute_long_potential_total = njit(signature_or_function=signature)(compute_long_potentials_total) # signature = float64(float64, int64, int64[:], int64[:], @@ -1066,12 +1049,8 @@ def compute_electrostatic_potential_total(l_D, min_seq_sep_electrostatic, energy = compute_electrostatic_potential_ij(i, j, l_D, dist_mat, lambda_electrostatic, electrostatic_gamma, seq_index) total_electrostatic_energy += energy return total_electrostatic_energy -compute_electrostatic_potential_total_parallel = njit(parallel=True)(compute_electrostatic_potential_total).compile(signature) -compute_electrostatic_potential_total = njit()(compute_electrostatic_potential_total).compile(signature) -#compute_electrostatic_potential_total_parallel = njit(compute_electrostatic_potential_total, -# signature_or_function=signature, parallel=True) 
-#compute_electrostatic_potential_total = njit(compute_electrostatic_potential_total, -# signature_or_function=signature) +compute_electrostatic_potential_total_parallel = njit(signature_or_function=signature, parallel=True)(compute_electrostatic_potential_total) +compute_electrostatic_potential_total = njit(signature_or_function=signature)(compute_electrostatic_potential_total) # # no numba for this function, since it doesn't have any loops or do any intensive computation def compute_potential_total(l_D, min_seq_sep_rho, min_seq_sep_contact, min_seq_sep_electrostatic, @@ -1436,9 +1415,5 @@ def compute_pair_energy_matrix(l_D, pair_energy_matrix[i,j] = np.nan pair_energy_matrix[j,i] = pair_energy_matrix[i,j] return pair_energy_matrix -compute_pair_energy_matrix_parallel = njit(parallel=True)(compute_pair_energy_matrix).compile(signature) -compute_pair_energy_matrix = njit()(compute_pair_energy_matrix).compile(signature) -#pair_energy_matrix_parallel = njit(compute_pair_energy_matrix, -# signature_or_function=signature, parallel=True) -#pair_energy_matrix = njit(compute_pair_energy_matrix, -# signature_or_function=signature) +compute_pair_energy_matrix_parallel = njit(signature_or_function=signature, parallel=True)(compute_pair_energy_matrix) +compute_pair_energy_matrix = njit(signature_or_function=signature)(compute_pair_energy_matrix) \ No newline at end of file From 55fece362e23d9b35d4d20be6cc17f8ba7d1775a Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Mon, 8 Dec 2025 19:43:47 -0600 Subject: [PATCH 65/76] factored electrostatic lambda out of electrostatics_gamma --- frustratometer/classes/AWSEM.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index bb1bc075..e1ad63ab 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -191,8 +191,7 @@ def __init__(self, self.distance_cutoff=self.p.distance_cutoff_contact # the distance 
matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- # it's just like any other parameter, such as sequence_cutoff, - # that only matters if we need to compute a mask from a distance matrix - self.electrostatics_gamma = -self.p.k_electrostatics * charges2[np.newaxis, np.newaxis, :, :] + # that only matters if we need to compute a mask from a distance matrix self.charges2 = charges2 # helpful ? self.gamma = self.p.gamma @@ -300,7 +299,7 @@ def calculate_energy_and_potts(self): protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - electrostatics_energy = self.electrostatics_gamma * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ + electrostatics_energy = -self.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ * self.electrostatics_mask[:,:,np.newaxis,np.newaxis] contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) @@ -316,6 +315,7 @@ def calculate_energy_and_potts(self): #self.potts_model['h'][:, 0] = 0 #self.potts_model['J'][:, :, 0, :] = 0 #self.potts_model['J'][:, :, :, 0] = 0 + else: print("""self.potts was False; will not calculate and store potts model. Energies will be computed on the fly as needed for frustration calculations and then discarded. 
@@ -833,7 +833,7 @@ def calculate_energy_and_potts(self): contact_energy = self.p.k_contact * np.array([direct, protein_mediated, water_mediated]) if self.p.k_electrostatics!=0: template[triu_indices] = self.pairwise_variances[3*num_upper:] - electrostatics_energy = self.electrostatics_gamma * (template+template.T)[:,:,np.newaxis,np.newaxis]**2 + electrostatics_energy = -self.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * (template+template.T)[:,:,np.newaxis,np.newaxis]**2 contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) # for the variance potts model, there is one more kind of two-body interaction: # burial-pairwise covariance when the pairwise energy term involves the residue in the burial term @@ -857,7 +857,15 @@ def calculate_energy_and_potts(self): direct = direct[direct != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.direct_gamma[J_index[3]]*self.p.k_contact prot = prot[prot != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.protein_gamma[J_index[3]]*self.p.k_contact wat = wat[wat != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.water_gamma[J_index[3]]*self.p.k_contact - elec = elec[elec != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.electrostatics_gamma[J_index[3]]*self.p.k_contact + ############################################################################################################################ + elec = elec[elec != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.electrostatics_gamma[np.newaxis,np.newaxis,:,:][J_index[3]]*self.p.k_contact + # ???????????????? why are we multiplying electrostatics by k_contact? + # electrostatics_gamma already had the electrostatics weight k_electrostatics multiplied in and k_electrostatics + # isn't necessarily equal to k_contact. 
Anyway, i'm now going to factor k_electrostatics out of electrostatics_gamma + # in the AWSEMBase class + # probably should be + # elec = elec[elec != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.electrostatics_gamma[np.newaxis,np.newaxis,:,:][J_index[3]]*(-self.k_electrostatics) + ############################################################################################################################# contact_energy = np.append(contact_energy, direct[np.newaxis,...], axis=0) contact_energy = np.append(contact_energy, prot[np.newaxis,...], axis=0) From adf9b95382f1b9f6f34247632a3909ea75e7ec13 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Mon, 8 Dec 2025 19:45:11 -0600 Subject: [PATCH 66/76] added seq_index and electrostatics_gamma properties --- frustratometer/classes/AWSEM.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index e1ad63ab..afed6c20 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -203,14 +203,28 @@ def __init__(self, self._decoy_fluctuation = {} # used for mutational calculation, possibly others self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ self._native_energy = None + self._seq_index = None + + @property + def electrostatics_gamma(self): # used to be distinct from charges2 but eliminating the distinction + return self.charges2 # makes these gammas more analogous to the other gammas + @property + def electrostatic_gamma(self): + return self.electrostatics_gamma @property def alphabet(self): return self.gamma.alphabet # this allows us to access the alphabet in the same way as for DCA instances - @alphabet.setter # the user might think they can change the alphabet like the conformation (as in AWSEM), but that's not supported + @alphabet.setter # the user might think they can change the alphabet like the conformation (see AWSEM), but 
that's not supported def alphabet(self): raise AttributeError("Changing the underlying alphabet is prohibited. Instead, create a new AWSEM instance from a different Gamma.") + @property # emphasizes that seq_index is computed from the alphabet + def seq_index(self): + if self._seq_index is None: # so we only have to compute it once + self._seq_index = np.array([self.alphabet.index(aa) for aa in self.sequence]) + return self._seq_index + # carlos wanted to have gamma_array with gammas multiplied by lambda and coefficients @property def coefficient_lambda_gamma_array(self): @@ -518,7 +532,7 @@ def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] _AA = self.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' if aa_freq is None: - seq_index = np.array([_AA.index(aa) for aa in self.sequence]) + seq_index = self.seq_index N=self.N else: N=self.N*10 @@ -575,7 +589,7 @@ def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): def compute_configurational_energies(self): _AA= self.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' - seq_index = np.array([_AA.index(aa) for aa in self.sequence]) + seq_index = self.seq_index distances = np.triu(self.distance_matrix) distances = distances[(distances0)] n_contacts=len(distances) From 0adc1ededfa6a360c98709cd50ee5ff8801db887 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 9 Dec 2025 11:12:20 -0600 Subject: [PATCH 67/76] added potts model construction to numba hamiltonian script --- frustratometer/numba_util/hamiltonian.py | 116 +++++++++++++++++++++-- 1 file changed, 107 insertions(+), 9 deletions(-) diff --git a/frustratometer/numba_util/hamiltonian.py b/frustratometer/numba_util/hamiltonian.py index 03f93301..19f430cb 100644 --- a/frustratometer/numba_util/hamiltonian.py +++ b/frustratometer/numba_util/hamiltonian.py @@ -71,6 +71,10 @@ Residues closer in space than this distance are masked - 
max_dist : float Residues further in space than this distance are masked +- max_dist_contact : float + Like the plain max_dist argument (see above) +- max_dist_electrostatic : float + Like the plain max_dist argument (see above) Parameters holding the values of indicator functions or quantities needed to compute indicator functions @@ -1349,9 +1353,102 @@ def compute_pair_energy_ij(i, j, l_D, min_seq_sep_rho, chain_starts, chain_ends, """ return pair_energy # +######################################################################################### +# POTTS MODEL: (N,N,q,q) for (N,N) dist_mat and (q,q) gammas +signature = float64[:,:]( + int64, + int64[:], int64[:], + float64[:,:], + float64, float64[:,:]) +def compute_potts_model_h( + min_seq_sep_rho, + chain_starts, chain_ends, + dist_mat, + lambda_burial, burial_gamma): + assert dist_mat.shape[0] == dist_mat.shape[1] + num_aa = dist_mat.shape[0] + num_aa_types = burial_gamma.shape[0] + assert burial_gamma.shape[1] == 3 + h = np.zeros((num_aa, num_aa_types)) + for i in prange(num_aa): + for q in range(num_aa_types): + gamma = burial_gamma[q] + h[i,q] = compute_burial_potential_i_from_gamma( + i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat, lambda_burial, gamma) + h = -h # i guess we define it as the negative of the actual potential? 
+ return h +compute_potts_model_h_parallel = njit(signature_or_function=signature, parallel=True)(compute_potts_model_h) +compute_potts_model_h = njit(signature_or_function=signature)(compute_potts_model_h) +# +signature = float64[:,:,:,:]( + float64, int64, int64, int64, + int64[:], int64[:], float64, float64, + float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:]) +def compute_potts_model_J( + l_D, min_seq_sep_rho, min_seq_sep_contact, min_seq_sep_electrostatic, + chain_starts, chain_ends, max_dist_contact, max_dist_electrostatic, + dist_mat, + lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_electrostatic, electrostatic_gamma): + # check input + assert dist_mat.shape[0] == dist_mat.shape[1] + assert direct_gamma.shape[0] == direct_gamma.shape[1] + assert direct_gamma.shape == protein_gamma.shape == water_gamma.shape == electrostatic_gamma.shape + # efficiency stuff + #coefficient_lambda_gamma_direct = -lambda_direct*direct_gamma + #coefficient_lambda_gamma_protein = -lambda_protein*protein_gamma + num_aa = dist_mat.shape[0] + num_aa_types = direct_gamma.shape[0] + # precompute rho + rho_array = np.zeros(num_aa) + for i in prange(num_aa): + rho_array[i] = compute_rho_i(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat) + # main code + J = np.zeros((num_aa, num_aa, num_aa_types, num_aa_types)) + for i in prange(num_aa): + for j in range(num_aa): + if i==j: + J[i,j,:,:] = 0.0 + continue + dist_ij = dist_mat[i,j] + rho_i = rho_array[i] + rho_j = rho_array[j] + same_chain = check_same_chain(i, j, chain_starts, chain_ends) + contact_mask_ij = mask_of_pair(min_seq_sep_contact, abs(j-i), 0, max_dist_contact, same_chain, dist_ij) + electrostatic_mask_ij = mask_of_pair(min_seq_sep_electrostatic, abs(j-i), 0, max_dist_electrostatic, same_chain, dist_ij) + for qi in range(num_aa_types): + for qj in range(qi, num_aa_types): + gamma_dij = direct_gamma[qi,qj] + 
gamma_pij = protein_gamma[qi,qj] + gamma_wij = water_gamma[qi,qj] + gamma_eij = electrostatic_gamma[qi,qj] + direct_energy = compute_direct_potential_ij_from_distij_gamma(dist_ij, lambda_direct, gamma_dij) + protein_energy, water_energy = compute_long_potentials_ij_from_rho_distij_gamma( + dist_ij, rho_i, rho_j, lambda_protein, gamma_pij, lambda_water, gamma_wij) + contact_energy = contact_mask_ij * (direct_energy + protein_energy + water_energy) + electrostatic_energy = electrostatic_mask_ij * compute_electrostatic_potential_ij_from_distij_gamma( + l_D, dist_ij, lambda_electrostatic, gamma_eij) + # vvv I DON'T KNOW WHERE THIS -1 COMES FROM! + energy = contact_energy + -1*electrostatic_energy + J[i,j,qi,qj] = energy + J[i,j,qj,qi] = energy + J = -J # i guess we define it as the negative of the actual potential? + return J +compute_potts_model_J_parallel = njit(signature_or_function=signature, parallel=True)(compute_potts_model_J) +compute_potts_model_J = njit(signature_or_function=signature)(compute_potts_model_J) +# +######################################################################################## +# PAIR ENERGY MATRIX FOR FRUSTRATION CALCULATIONS -- NOT SURE WHAT TO DO WITH THIS. MIGHT DELETE +""" signature = float64[:,:](float64, int64, int64, - int64[:], int64[:], + int64[:], int64[:], float64, float64[:,:], float64, float64[:,:], float64, float64[:,:], @@ -1361,7 +1458,7 @@ def compute_pair_energy_ij(i, j, l_D, min_seq_sep_rho, chain_starts, chain_ends, int64[:]) def compute_pair_energy_matrix(l_D, min_seq_sep_rho, min_seq_sep_frust_index, - chain_starts, chain_ends, + chain_starts, chain_ends, max_dist, dist_mat, lambda_direct, direct_gamma, lambda_protein, protein_gamma, @@ -1369,7 +1466,7 @@ def compute_pair_energy_matrix(l_D, lambda_burial, burial_gamma, lambda_electrostatic, electrostatic_gamma, seq_index): - """ + """""" Make matrix of the same shape as the distance matrix, where each element is the pair energy, or np.nan if masked. 
@@ -1382,7 +1479,7 @@ def compute_pair_energy_matrix(l_D, pair_energy_matrix : np.array(dist_mat.shape) matrix where the element (i,j) is the pair energy of (i,j) (if unmasked) or np.nan (if masked) - """ + """""" # Pre-compute rho for all residues num_res = dist_mat.shape[0] rho_array = np.zeros(num_res) @@ -1395,10 +1492,10 @@ def compute_pair_energy_matrix(l_D, for j in range(i,num_res): # check mask same_chain = check_same_chain(i, j, chain_starts, chain_ends) - # we're computing direct, long, and electrostatics, so use - # the most conservative distance cutoff - # (which happens to be electrostatics) - unmasked = mask_of_pair(min_seq_sep_frust_index, abs(i-j), 0.0, 10*l_D, + # the idea is that this is the matrix we'll use to calculate frustration indices, + # so we set the minimum distance to 0 (as it always is for frustration calculations) + # and let the maximum distance be a variable + unmasked = mask_of_pair(min_seq_sep_frust_index, abs(i-j), 0.0, max_dist, same_chain, dist_mat[i,j],) if unmasked: pair_energy_matrix[i,j] = compute_pair_energy_ij_from_rho( @@ -1416,4 +1513,5 @@ def compute_pair_energy_matrix(l_D, pair_energy_matrix[j,i] = pair_energy_matrix[i,j] return pair_energy_matrix compute_pair_energy_matrix_parallel = njit(signature_or_function=signature, parallel=True)(compute_pair_energy_matrix) -compute_pair_energy_matrix = njit(signature_or_function=signature)(compute_pair_energy_matrix) \ No newline at end of file +compute_pair_energy_matrix = njit(signature_or_function=signature)(compute_pair_energy_matrix) +""" \ No newline at end of file From ff710569d0efc70712dddbbb991ae1557a8f2f2f Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 9 Dec 2025 11:14:45 -0600 Subject: [PATCH 68/76] now using numba method to compute potts model, but some tests failing --- frustratometer/classes/AWSEM.py | 37 ++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/frustratometer/classes/AWSEM.py 
b/frustratometer/classes/AWSEM.py index afed6c20..230af11c 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -53,7 +53,7 @@ class Parameters(BaseModel): #Electrostatics min_sequence_separation_electrostatics: Optional[int] = Field(1, description="Minimum sequence separation for electrostatics calculation.") k_electrostatics: float = Field(17.3636, description="Coefficient for electrostatic interactions. (kJ/mol)") - electrostatics_screening_length: float = Field(10, description="Screening length for electrostatic interactions. (Angstrom)") + electrostatics_screening_length: float = Field(10.0, description="Screening length for electrostatic interactions. (Angstrom)") # We might not know the order of amino acids in our alphabet at the time of instantiating this class # (this happens the above gammas are Paths), so we'll have to build the electrostatic "gamma" when @@ -301,8 +301,39 @@ def calculate_masks(self): #np.save('my_mask_new.npy',self.mask) self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function - def calculate_energy_and_potts(self): + def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): + # chain_starts and chain_ends should be calculated based on object attributes + # (maybe Structure.chain?) 
but i don't know how to do that, so we'll do this for now if self.potts: + self.potts_model = {'h':None, 'J':None} + if chain_starts is None: + chain_starts = np.array([0]) + if chain_ends is None: + chain_ends = np.array([len(self.seq_index)-1]) + if self.distance_cutoff_contact is None: + contact_max_dist = 12.5 + else: + contact_max_dist = self.distance_cutoff_contact + self.potts_model['h'] = ham.compute_potts_model_h_parallel( + self.min_sequence_separation_rho, + chain_starts, chain_ends, + self.distance_matrix, + self.k_contact, self.burial_gamma) + self.potts_model['J'] = ham.compute_potts_model_J_parallel( + self.electrostatics_screening_length, self.min_sequence_separation_rho, + self.min_sequence_separation_contact, self.min_sequence_separation_electrostatics, + chain_starts, chain_ends, + contact_max_dist, 10*self.electrostatics_screening_length, # maximum distance for contact potential, maximum for electrostatics + self.distance_matrix, + self.k_contact, self.direct_gamma, + self.k_contact, self.protein_gamma, + self.k_contact, self.water_gamma, + self.k_electrostatics, self.electrostatics_gamma) + #breakpoint() + #self.potts_model['J'] = ham.compute_potts_model_J( + # self.distance_matrix, ) + + """ J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) @@ -329,7 +360,7 @@ def calculate_energy_and_potts(self): #self.potts_model['h'][:, 0] = 0 #self.potts_model['J'][:, :, 0, :] = 0 #self.potts_model['J'][:, :, :, 0] = 0 - + """ else: print("""self.potts was False; will not calculate and store potts model. Energies will be computed on the fly as needed for frustration calculations and then discarded. 
From 7eb6610e85dd2574b22e83e0560abe25af2870f9 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Tue, 9 Dec 2025 11:23:41 -0600 Subject: [PATCH 69/76] fixed sign error in numba electrostatic hamiltonian --- frustratometer/numba_util/hamiltonian.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/frustratometer/numba_util/hamiltonian.py b/frustratometer/numba_util/hamiltonian.py index 19f430cb..c45e3ed8 100644 --- a/frustratometer/numba_util/hamiltonian.py +++ b/frustratometer/numba_util/hamiltonian.py @@ -806,7 +806,11 @@ def compute_electrostatic_potential_ij_from_indicator_gamma(electrostatic_indica electrostatic_energy : float Energy of the electrostatic interaction between residues i and j """ - return -lambda_electrostatic * electrostatic_indicator * gamma + # gamma is negative if interaction is favorable and positive if + # unfavorable, and our lambdas and indicators are all positive by convention, + # so we don't precede this equation with a negative sign + #return -lambda_electrostatic * electrostatic_indicator * gamma + return lambda_electrostatic * electrostatic_indicator * gamma @njit(signature_or_function=float64(float64, float64, float64, float64)) def compute_electrostatic_potential_ij_from_distij_gamma(l_D, dist_ij, lambda_electrostatic, gamma): """ @@ -1434,8 +1438,8 @@ def compute_potts_model_J( contact_energy = contact_mask_ij * (direct_energy + protein_energy + water_energy) electrostatic_energy = electrostatic_mask_ij * compute_electrostatic_potential_ij_from_distij_gamma( l_D, dist_ij, lambda_electrostatic, gamma_eij) - # vvv I DON'T KNOW WHERE THIS -1 COMES FROM! - energy = contact_energy + -1*electrostatic_energy + # + energy = contact_energy + electrostatic_energy J[i,j,qi,qj] = energy J[i,j,qj,qi] = energy J = -J # i guess we define it as the negative of the actual potential? 
From 08a6b3c287e1441091dd33bf92f784ed91786e0b Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Wed, 10 Dec 2025 10:48:35 -0600 Subject: [PATCH 70/76] passing tests except for subsequence --- frustratometer/classes/AWSEM.py | 28 ++++- frustratometer/numba_util/hamiltonian.py | 67 ++++++++++- tests/test_awsem_frustratometer.py | 135 +++++++++++++++++++++++ 3 files changed, 222 insertions(+), 8 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 230af11c..13fe9ac7 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -331,8 +331,29 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): self.k_electrostatics, self.electrostatics_gamma) #breakpoint() #self.potts_model['J'] = ham.compute_potts_model_J( - # self.distance_matrix, ) - + # self.distance_matrix, ) + J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) + h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) + + # compute burial and contact energies + old_burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * self.burial_indicator[:, np.newaxis, :] + direct = self.direct_indicator * self.direct_gamma[J_index[2], J_index[3]] + water_mediated = self.water_indicator * self.water_gamma[J_index[2], J_index[3]] + protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] + contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] + + electrostatics_energy = -self.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ + * self.electrostatics_mask[:,:,np.newaxis,np.newaxis] + contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) + old_contact_energy = contact_energy + # 
Compute potts model + old_potts_model = {} + old_potts_model['h'] = old_burial_energy.sum(axis=-1)[:, :] + old_potts_model['J'] = old_contact_energy.sum(axis=0)[:, :, :, :] + diff_h = np.max(np.abs(old_potts_model['h'] - self.potts_model['h'])) + assert diff_h < 3E-4, diff_h + diff_J = np.max(np.abs(old_potts_model['J'] - self.potts_model['J'])) + assert diff_J < 3E-4, diff_J """ J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) @@ -356,11 +377,12 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): assert self.potts_model['h'].shape == (self.N, self.q), self.potts_model['h'].shape self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, :, :]#self.aa_map_awsem_x, self.aa_map_awsem_y] assert self.potts_model['J'].shape == (self.N, self.N, self.q, self.q), self.potts_model['J'].shape + breakpoint() + """ # Set the gap energy to zero #self.potts_model['h'][:, 0] = 0 #self.potts_model['J'][:, :, 0, :] = 0 #self.potts_model['J'][:, :, :, 0] = 0 - """ else: print("""self.potts was False; will not calculate and store potts model. Energies will be computed on the fly as needed for frustration calculations and then discarded. 
diff --git a/frustratometer/numba_util/hamiltonian.py b/frustratometer/numba_util/hamiltonian.py index c45e3ed8..98cecae1 100644 --- a/frustratometer/numba_util/hamiltonian.py +++ b/frustratometer/numba_util/hamiltonian.py @@ -225,7 +225,8 @@ def check_same_chain(i, j, chain_starts, chain_ends): break # this could save us a couple iterations, probably doesn't matter return same_chain # -@njit(signature_or_function=boolean(int64, float64, float64, int64, boolean, float64)) +#@njit(signature_or_function=boolean(int64, float64, float64, int64, boolean, float64)) +@njit(signature_or_function=float64(int64, float64, float64, int64, boolean, float64)) def mask_of_pair(min_seq_sep, seq_sep, min_dist, max_dist, same_chain, dist_ij): """ Get a bool representing whether a pair of residues having @@ -241,10 +242,12 @@ def mask_of_pair(min_seq_sep, seq_sep, min_dist, max_dist, same_chain, dist_ij): mask_bit : bool Whether pair should be considered unmasked (True) or masked (False). """ - if min_dist<=dist_ij<=max_dist and (min_seq_sep<=seq_sep or not same_chain): - mask_bit = True + if (min_dist<=dist_ij) and (dist_ij<=max_dist) and ((min_seq_sep<=seq_sep) or (not same_chain)): + if seq_sep==25 and 9.05 0.0003 # the smallest gamma + #assert protein_energy != 0 + #assert water_energy != 0 + # assert abs(protein_energy + water_energy) >= 0.00001 contact_energy = contact_mask_ij * (direct_energy + protein_energy + water_energy) electrostatic_energy = electrostatic_mask_ij * compute_electrostatic_potential_ij_from_distij_gamma( l_D, dist_ij, lambda_electrostatic, gamma_eij) @@ -1442,7 +1482,24 @@ def compute_potts_model_J( energy = contact_energy + electrostatic_energy J[i,j,qi,qj] = energy J[i,j,qj,qi] = energy + #if i==1 and j==26 and qi==0 and qj==0: + # print(direct_energy) + # print(protein_energy) + # print(water_energy) + # print(contact_mask_ij) + # print(contact_energy) + # if contact_mask_ij is False: + # print('contact_mask_ij is False') + # #min_seq_sep_contact, 
abs(j-i), 0, max_dist_contact, same_chain, dist_ij + # print(f' min_seq_sep_contact: {min_seq_sep_contact}') + # print(f' abs(j-i): {abs(j-i)}') + # print(f' 0.0: {0.0}') + # print(f' max_dist_contact==9.499: {max_dist_contact==9.499}') + # print(f' same_chain: {same_chain}') + # print(f' 9.05 Date: Wed, 10 Dec 2025 12:06:56 -0600 Subject: [PATCH 71/76] restructured AWSEM.py to improve documentation and readability --- frustratometer/classes/AWSEM.py | 233 ++++++++++++++++++++++---------- 1 file changed, 159 insertions(+), 74 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 13fe9ac7..dae70c0d 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -12,7 +12,7 @@ __all__ = ['AWSEM','AWSEMIndicators','DecoyEnsemble', 'AWSEMVariancePotts'] -class Parameters(BaseModel): +class AWSEMHamiltonianParameters(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) """Default parameters for AWSEM energy calculations.""" k_contact: float = Field(4.184, description=""" @@ -57,7 +57,7 @@ class Parameters(BaseModel): # We might not know the order of amino acids in our alphabet at the time of instantiating this class # (this happens the above gammas are Paths), so we'll have to build the electrostatic "gamma" when - # initializing AWSEMBase. Fortunately, we can still specify everything we need to know in this dict. + # initializing _AWSEMBase. Fortunately, we can still specify everything we need to know in this dict. charge_dict : dict = Field({'A':0.0,'C':0.0,'D':-1.0,'E':-1.0, 'F':0.0,'G':0.0,'H':0.0,'I':0.0, 'K':1.0,'L':0.0,'M':0.0,'N':0.0,'P':0.0, @@ -66,7 +66,11 @@ class Parameters(BaseModel): description='charge of each amino acid type that may be used') -class AWSEMBase(Frustratometer): +class _AWSEMBase(Frustratometer): + """ + Base class for potts model and frustration calculations + with the AWSEM Hamiltonian. 
+ """ def __init__(self, sequence: str, @@ -74,7 +78,8 @@ def __init__(self, potts: bool=False, **parameters)->object: """ - Generate AWSEM object + Set attributes that do not depend on the implementations of + the indicator function and potts model setup calculations. Parameters ---------- @@ -85,10 +90,13 @@ def __init__(self, potts: bool Whether to set up the potts model (can be RAM-intensive and time-intensive), which is unnecessary if all you want to get is the indicator functions. - + **parameters: + Used to initialize an AWSEMHamiltonianParameters, which becomes an attribute + of this class and helps us organize the parameters of our AWSEM Hamiltonian + Returns ------- - AWSEM object + _AWSEMBase object """ # set sequence based on argument @@ -105,23 +113,26 @@ def __init__(self, if self.potts and not self.expose_indicator_functions: print(f""" - You requested storing the potts model as an object attribute by using potts=True - but requested NOT storing the indicator functions as object attributes by using - expose_indicator_functions=False. Since the potts model requires far more RAM than - the indicator functions, we will override your indicator function request - and store them anyway. This has no effect on the accuracy of any calculations. - - Setting {self.__class__}.expose_indicator_functions = True""") + You requested storing the potts model as an object attribute by using potts=True + but requested NOT storing the indicator functions as object attributes by using + expose_indicator_functions=False. Since the potts model requires far more RAM than + the indicator functions, we will override your indicator function request + and store them anyway. This will have no effect on the accuracy of any calculations. 
+ + Setting {self.__class__}.expose_indicator_functions = True""") self.expose_indicator_functions = True # parse other arguments - p = Parameters(**parameters) + p = AWSEMHamiltonianParameters(**parameters) if p.min_sequence_separation_contact is None: p.min_sequence_separation_contact = 1 if p.min_sequence_separation_rho is None: p.min_sequence_separation_rho = 1 if p.min_sequence_separation_electrostatics is None: p.min_sequence_separation_electrostatics = 1 + # doing this arguably defeats the purpose of having an + # AWSEMHamilonianParams class; + # we should think about how we can clean up the namespace of this (_AWSEMBase) class for field, value in p: setattr(self, field, value) self.p = p @@ -175,10 +186,11 @@ def __init__(self, try: ordered_charges[counter] = self.charge_dict[gamma.alphabet[counter]] except KeyError as e: - raise Exception(f"""One-letter code {order[counter]} from alphabet {gamma.alphabet} - with unknown charge. If use of this noncanonical AA in intentional, - you must supply a custom charge_dict - so that we know how to calculate the electrostatic potential.""") + raise Exception(f""" + One-letter code {order[counter]} from alphabet {gamma.alphabet} + with unknown charge. 
If use of this noncanonical AA in intentional, + you must supply a custom charge_dict + so that we know how to calculate the electrostatic potential.""") charges2 = ordered_charges[:,np.newaxis] * ordered_charges[np.newaxis,:] if self.p.k_electrostatics != 0: self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) @@ -205,6 +217,9 @@ def __init__(self, self._native_energy = None self._seq_index = None + ################################################################################## + # quantities previously defined as attributes that are calculated based on + # other attributes should be converted to properties @property def electrostatics_gamma(self): # used to be distinct from charges2 but eliminating the distinction return self.charges2 # makes these gammas more analogous to the other gammas @@ -245,17 +260,9 @@ def coefficient_lambda_gamma_array(self): directly is not allowed. Initialize a new instance with a different {self.__class__}.k_contact, {self.__class__}.burial_gamma, {self.__class__}.direct_gamma, {self.__class__}.protein_gamma, or {self.__class__}.water_gamma instead.""") + ################################################################################## - def native_energy(self): - if self.potts: - if not hasattr(self, 'potts_model'): # create potts model if it doesn't already exist - self.calculate_energy_and_potts() - energy = super().native_energy() # method to compute native energy given potts model - else: - energy = 0 # fill in numba function here - #self._native_energy = energy # maybe _native_energy is needed for compatibility with certain things? - return energy - + # methods for subclass initialization def subclass_setup_helper(self): """ This method calls methods to calculate native indicator functions (optional), @@ -263,7 +270,7 @@ def subclass_setup_helper(self): and potts model (optional). 
This method is intended to be called as the last step of __init__ - in each subclass of AWSEMBase. The subclasses may differ in how + in each subclass of _AWSEMBase. The subclasses may differ in how they load in the structural information (the part of __init__ preceding the call to this method) and how they implement the calculate_indicators and calculate_masks methods called @@ -389,6 +396,18 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): If you want to get the energies for your own purposes, set self.potts to True and then call this method again.""") + # not really sure what the point of this is + def native_energy(self): + if self.potts: + if not hasattr(self, 'potts_model'): # create potts model if it doesn't already exist + self.calculate_energy_and_potts() + energy = super().native_energy() # method to compute native energy given potts model + else: + energy = 0 # fill in numba function here + #self._native_energy = energy # maybe _native_energy is needed for compatibility with certain things? + return energy + + # methods to calculate different kinds of frustration def compute_configurational_decoy_statistics(self): raise NotImplementedError("Subclasses must define this method") @@ -402,7 +421,7 @@ def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): def mutational_frustration(self): # This algorithm is defined in the Frustratometer class # because it applies to both AWSEM and DCA frustratometry, - # and both the AWSEMBase and DCA classes inherit from Frustratometer. + # and both the _AWSEMBase and DCA classes inherit from Frustratometer. 
# Our goal here is just to provide an interface that matches that used # for configurational frustration, which has no DCA analog and therefore # is not defined in Frustratometer (although Frustratometer.frustration @@ -415,8 +434,14 @@ def singleresidue_frustration(self): return super().frustration(kind='singleresidue') -class AWSEM(AWSEMBase): +class AWSEM(_AWSEMBase): + """ + The main class that the user will invoke + for potts model and frustration calculations + with the AWSEM Hamiltonian. + However, users may also be interested in AWSEMIndicators. + """ def __init__(self, pdb_structure: object | tuple, # tuple is an object, but this clarifies what we expect sequence: str =None, @@ -424,6 +449,38 @@ def __init__(self, potts: bool=False, alt_sigma_wat: bool=False, **parameters)->object: + """ + Pass parameters to _AWSEMBase to + set attributes that DO NOT depend on the implementations of + the indicator function and potts model setup calculations, + then set attributes that DO depend on the implementations + of the indicator function and potts model setup calculations. + + Parameters + ---------- + pdb_structure: object | tuple + A Structure object or tuple of distance matrices characterizing the conformer + to be used (see self.setup_structure) + sequence: str + The amino acid sequence + expose_indicator_functions: bool + If set to True, indicator functions of the contact and burial energy terms can be accessed by user. + potts: bool + Whether to set up the potts model (can be RAM-intensive and therefore time-intensive), + which is unnecessary if all you want to get is the indicator functions or + perform certain frustration calculations, for which energies can be computed + on the fly instead of saved in memory. 
+ alt_sigma_wat : bool=False + Whether to use alternative functional form for sigma_wat (experimental feature) + **parameters: + Used to initialize an AWSEMHamiltonianParameters, which becomes an attribute + of this class and helps us organize the parameters of our AWSEM Hamiltonian + + Returns + ------- + AWSEM object + """ + # assume the user wanted the sequence from the pdb structure if not given if not sequence: try: @@ -437,13 +494,16 @@ def __init__(self, as a separate argument to this class.""") else: raise + # load structure-independent parameters and methods super().__init__(sequence, expose_indicator_functions, potts, **parameters) self.alt_sigma_wat = alt_sigma_wat + # set up strucure self.setup_structure(pdb_structure) self.subclass_setup_helper() + # methods for structure-dependent stuff def setup_structure(self, pdb_structure): if not isinstance(pdb_structure, tuple): # alt_conf should be our custom Structure object # maybe our type check here should be more restrictive, @@ -489,22 +549,6 @@ def setup_structure(self, pdb_structure): else: raise AssertionError("unexpected else block") - @property - def pdb_structure(self): - return self._pdb_structure - @pdb_structure.setter - def pdb_structure(self,pdb_structure): - # reset structural attributes - self.setup_structure(pdb_structure) - # check that our new structure is compatible with our old one - if self.N != len(self.sequence): - breakpoint() - raise ValueError("The pdb is incomplete. 
Try setting 'repair_pdb=True' when constructing the Structure object.") - self.subclass_setup_helper() - def change_conformation(self,alt_conf): - # this method is an alias for the setter - self.pdb_structure = alt_conf - def calculate_indicators(self): if self.expose_indicator_functions: # Calculate rho @@ -542,11 +586,35 @@ def calculate_indicators(self): electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) self.electrostatics_indicator = electrostatics_indicator else: - print("""self.expose_indicator_functions was False; will not calculate and store indicator functions. - Indicator functions will be computed on the fly as needed for energy calculations and then discarded. - If you want to get the indicator functions for your own purposes, set self.expose_indicator_functions - to True and then call this method again.""") + print(""" + self.expose_indicator_functions was False; + will not calculate and store indicator functions. + Indicator functions will be computed on the fly as needed + for energy calculations and then discarded. + If you want to get the indicator functions for your own purposes, + set expose_indicator_functions to True + and then call self.calculate_indicators().""") + + # make self.pdb_structure into a property so that structure-dependent + # stuff is recalculated automatically when we change the conformation + @property + def pdb_structure(self): + return self._pdb_structure + @pdb_structure.setter + def pdb_structure(self,pdb_structure): + # reset structural attributes + self.setup_structure(pdb_structure) + # check that our new structure is compatible with our old one + if self.N != len(self.sequence): + breakpoint() + raise ValueError("The pdb is incomplete. 
Try setting 'repair_pdb=True' when constructing the Structure object.") + self.subclass_setup_helper() + def change_conformation(self,alt_conf): + # this method is an alias for the setter + self.pdb_structure = alt_conf + # self.masked_indicators is calculated from other attributes, + # so it should be made into a property @property def masked_indicators(self): # store indicators and gammas for our particular sequence as attributes @@ -568,19 +636,7 @@ def masked_indicators(self): {self.__class__}.sequence_mask_contact, or {self.__class__}.electrostatics_mask instead.""") - def calculate_energy_and_potts(self): - super().calculate_energy_and_potts() - # if expose_indicator_functions is off, we should never set the attributes in the first place - #if not self.expose_indicator_functions: - # del self.burial_indicator - # del self.direct_indicator - # del self.water_indicator - # del self.protein_indicator - # if "electrostatics_indicator" in dir(self): - # # won't exist if electrostatics are turned off - # del self.electrostatics_indicator - # del self.indicators - + # implementations of frustration algorithms def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] _AA = self.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' @@ -697,8 +753,14 @@ def compute_configurational_energies(self): return configurational_energies #, pd.DataFrame(decoy_data, columns=decoy_data_columns) -class AWSEMIndicators(AWSEMBase): # PottsEvaluatorFromIndicators or PottsEnergyEvaluatorFromIndicators? - +class AWSEMIndicators(_AWSEMBase): # PottsEvaluatorFromIndicators or PottsEnergyEvaluatorFromIndicators? + """ + This class is intended to be equivalent to AWSEM + but allows initialization from numpy arrays of + indicator functions, rather than calculating + those indicators from a Structure or set of + distance matrices. 
+ """ def __init__(self, burial_indicator: np.ndarray, direct_indicator: np.ndarray, @@ -707,10 +769,15 @@ def __init__(self, electrostatics_indicator: Union[np.ndarray, None], sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, + potts : bool=False, absolute_value_gamma: bool=False, **parameters)->object: """ - A stripped-down version of the AWSEM class that can be initialized from a set of indicator functions + Pass parameters to _AWSEMBase to + set attributes that DO NOT depend on the implementations of + the indicator function and potts model setup calculations, + then set attributes that DO depend on the implementations + of the indicator function and potts model setup calculations. Parameters ---------- @@ -729,6 +796,10 @@ def __init__(self, The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. + potts : bool + Whether to set up the potts model (can be RAM-intensive and therefore time-intensive), + which is unnecessary if all you want is to perform certain frustration calculations, + for which energies can be computed on the fly instead of saved in memory. absolute_value_gamma: bool If True, replace gammas with their absolute values. 
This is helpful for the standard deviation approximation @@ -737,10 +808,7 @@ def __init__(self, AWSEMIndicators object """ - # if we already have our indicator functions, - # our goal is probably to compute the potts model, - # so we'll just hard code a value of True for that argument VVVV - super().__init__(sequence, expose_indicator_functions, potts=True, **parameters) + super().__init__(sequence, expose_indicator_functions, potts, **parameters) self.burial_indicator = burial_indicator self.direct_indicator = direct_indicator self.protein_indicator = protein_indicator @@ -777,7 +845,13 @@ def __init__(self, def calculate_indicators(self): pass # the function was initialized with indicators, so there's nothing to do -class AWSEMVariancePotts(AWSEMBase): +class AWSEMVariancePotts(_AWSEMBase): + """ + EXPERIMENTAL CLASS THAT TRIES TO REPURPOSE OUR CODE TO CREATE + A SPECIAL KIND OF "POTTS MODEL" WHERE THE "ENERGY" IS ACTUALLY + THE VARIANCE OF THE ENERGIES OF A PREDEFINED SET OF DECOY CONFORMERS. + THIS CLASS IS STILL UNDER DEVELOPMENT. + """ def __init__(self, covariance_matrix: np.ndarray, sequence: str, # sequence is optional if we initialize from a Structure but not here @@ -785,6 +859,10 @@ def __init__(self, absolute_value_gamma: bool=False, **parameters)->object: """ + EXPERIMENTAL CLASS THAT TRIES TO REPURPOSE OUR CODE TO CREATE + A SPECIAL KIND OF "POTTS MODEL" WHERE THE "ENERGY" IS ACTUALLY + THE VARIANCE OF THE ENERGIES OF A PREDEFINED SET OF DECOY CONFORMERS. + THIS CLASS IS STILL UNDER DEVELOPMENT. Parameters ---------- @@ -929,7 +1007,7 @@ def calculate_energy_and_potts(self): # ???????????????? why are we multiplying electrostatics by k_contact? # electrostatics_gamma already had the electrostatics weight k_electrostatics multiplied in and k_electrostatics # isn't necessarily equal to k_contact. 
Anyway, i'm now going to factor k_electrostatics out of electrostatics_gamma - # in the AWSEMBase class + # in the _AWSEMBase class # probably should be # elec = elec[elec != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.electrostatics_gamma[np.newaxis,np.newaxis,:,:][J_index[3]]*(-self.k_electrostatics) ############################################################################################################################# @@ -983,6 +1061,13 @@ def calculate_energy_and_potts(self): self._native_energy=None # don't know what this does class DecoyEnsemble(): + """ + EXPERIMENTAL CLASS THAT ITERATIVELY COMPUTES INDICATOR FUNCTIONS + AND STATISTICS FOR A SET OF CONFORMERS, SPECIFIED AS A PYTHON + Generator OF Structure OBJECTS. + THIS CLASS IS STILL UNDER DEVELOPMENT AND MAYBE SHOULD BE + MOVED OUT OF THIS MODULE. + """ def __init__(self, pdb_structures: Generator[object,None,None], @@ -1000,7 +1085,7 @@ def __init__(self, Returns ------- - DecoyEnsemble object, which holds indicator arrays (and gammas???) computed by the AWSEM class. 
+ DecoyEnsemble object """ # the AWSEM class takes care of the indicator calculation (including masking) for us # AWSEM normally accepts an amino acid sequence argument, but we don't need that here From e6093a707acb3fa735f74887cb198421f8d9d092 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 11 Dec 2025 10:04:29 -0600 Subject: [PATCH 72/76] fixing error in numba hamiltonian function signature --- frustratometer/numba_util/hamiltonian.py | 66 ++---------------------- 1 file changed, 4 insertions(+), 62 deletions(-) diff --git a/frustratometer/numba_util/hamiltonian.py b/frustratometer/numba_util/hamiltonian.py index 98cecae1..af65fd14 100644 --- a/frustratometer/numba_util/hamiltonian.py +++ b/frustratometer/numba_util/hamiltonian.py @@ -175,7 +175,7 @@ Parameters to optimize computation efficiency - parallel : bool - Whether to jit-compile the function with parallel=True + Whether to call numba parallelized or not Notes ----- @@ -225,8 +225,7 @@ def check_same_chain(i, j, chain_starts, chain_ends): break # this could save us a couple iterations, probably doesn't matter return same_chain # -#@njit(signature_or_function=boolean(int64, float64, float64, int64, boolean, float64)) -@njit(signature_or_function=float64(int64, float64, float64, int64, boolean, float64)) +@njit(signature_or_function=boolean(int64, int64, float64, float64, boolean, float64)) def mask_of_pair(min_seq_sep, seq_sep, min_dist, max_dist, same_chain, dist_ij): """ Get a bool representing whether a pair of residues having @@ -243,11 +242,9 @@ def mask_of_pair(min_seq_sep, seq_sep, min_dist, max_dist, same_chain, dist_ij): Whether pair should be considered unmasked (True) or masked (False). 
""" if (min_dist<=dist_ij) and (dist_ij<=max_dist) and ((min_seq_sep<=seq_sep) or (not same_chain)): - if seq_sep==25 and 9.05 0.0003 # the smallest gamma - #assert protein_energy != 0 - #assert water_energy != 0 - # assert abs(protein_energy + water_energy) >= 0.00001 contact_energy = contact_mask_ij * (direct_energy + protein_energy + water_energy) electrostatic_energy = electrostatic_mask_ij * compute_electrostatic_potential_ij_from_distij_gamma( l_D, dist_ij, lambda_electrostatic, gamma_eij) @@ -1482,24 +1441,7 @@ def compute_potts_model_J( energy = contact_energy + electrostatic_energy J[i,j,qi,qj] = energy J[i,j,qj,qi] = energy - #if i==1 and j==26 and qi==0 and qj==0: - # print(direct_energy) - # print(protein_energy) - # print(water_energy) - # print(contact_mask_ij) - # print(contact_energy) - # if contact_mask_ij is False: - # print('contact_mask_ij is False') - # #min_seq_sep_contact, abs(j-i), 0, max_dist_contact, same_chain, dist_ij - # print(f' min_seq_sep_contact: {min_seq_sep_contact}') - # print(f' abs(j-i): {abs(j-i)}') - # print(f' 0.0: {0.0}') - # print(f' max_dist_contact==9.499: {max_dist_contact==9.499}') - # print(f' same_chain: {same_chain}') - # print(f' 9.05 Date: Thu, 11 Dec 2025 11:14:20 -0600 Subject: [PATCH 73/76] added warnings to AWSEM and renamed potts to potts_option --- frustratometer/classes/AWSEM.py | 104 +++++++--------------- frustratometer/frustration/frustration.py | 1 + 2 files changed, 32 insertions(+), 73 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index dae70c0d..9ac476f9 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -1,3 +1,4 @@ +import warnings import numpy as np from ..utils import _path from .. 
import frustration @@ -12,7 +13,7 @@ __all__ = ['AWSEM','AWSEMIndicators','DecoyEnsemble', 'AWSEMVariancePotts'] -class AWSEMHamiltonianParameters(BaseModel): +class ParametersAWSEM(BaseModel): model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True) """Default parameters for AWSEM energy calculations.""" k_contact: float = Field(4.184, description=""" @@ -75,7 +76,7 @@ class _AWSEMBase(Frustratometer): def __init__(self, sequence: str, expose_indicator_functions: bool=False, - potts: bool=False, + potts_option: bool=False, **parameters)->object: """ Set attributes that do not depend on the implementations of @@ -87,11 +88,11 @@ def __init__(self, The amino acid sequence expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. - potts: bool + potts_option: bool Whether to set up the potts model (can be RAM-intensive and time-intensive), which is unnecessary if all you want to get is the indicator functions. **parameters: - Used to initialize an AWSEMHamiltonianParameters, which becomes an attribute + Used to initialize an ParametersAWSEM, which becomes an attribute of this class and helps us organize the parameters of our AWSEM Hamiltonian Returns @@ -109,11 +110,11 @@ def __init__(self, # whether to store the potts model as an object attribute, # which requires a lot of ram - self.potts = potts + self.potts_option = potts_option - if self.potts and not self.expose_indicator_functions: - print(f""" - You requested storing the potts model as an object attribute by using potts=True + if self.potts_option and not self.expose_indicator_functions: + warnings.warn(f""" + You requested storing the potts model as an object attribute by using potts_option=True but requested NOT storing the indicator functions as object attributes by using expose_indicator_functions=False. 
Since the potts model requires far more RAM than the indicator functions, we will override your indicator function request @@ -123,7 +124,7 @@ def __init__(self, self.expose_indicator_functions = True # parse other arguments - p = AWSEMHamiltonianParameters(**parameters) + p = ParametersAWSEM(**parameters) if p.min_sequence_separation_contact is None: p.min_sequence_separation_contact = 1 if p.min_sequence_separation_rho is None: @@ -145,26 +146,6 @@ def __init__(self, self.p.gamma = gamma else: raise ValueError("Gamma parameter must be a path or a Gamma object.") - """ - # CARLOS: if you really want to reorder, we can do something like this, - but it shouldn't be necessary--we always have access to the - order in self.gamma.alphabet - ordered_alphabet = ['A','C','D','E','F','G','H','I','K','L', - 'M','N','P','Q','R','S','T','V','W','Y'] - for aa in ordered_alphabet: - assert aa in gamma.alphabet, f'{aa} missing from gamma.alphabet!' - if len(gamma.alphabet) == 20: # alphabet is exactly the canonical AAs - gamma = gamma.reorder(ordered_alphabet) - elif len(gamma.alphabet) > 20: # includes noncanonical AA(s) (or a "gap") - ncAA = [] - for aa in gamma.alphabet: - if aa not in ordered_alphabet: - ncAA.append(AA) - ordered_alphabet = ncAA + ordered_alphabet # insert at the beginning - gamma = gamma.reorder(ordered_alphabet) - else: - raise ValueError(f"gamma file alphabet {gamma.alphabet} was too short") - """ # burial gamma self.q = len(gamma.alphabet) # most likely 20, but could be different gb = gamma['Burial'] @@ -311,7 +292,11 @@ def calculate_masks(self): def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): # chain_starts and chain_ends should be calculated based on object attributes # (maybe Structure.chain?) but i don't know how to do that, so we'll do this for now - if self.potts: + if self.potts_option: + warnings.warn(""" + Constructing full potts model in RAM. 
+ If you don't want to do this, set potts_option=False + """) self.potts_model = {'h':None, 'J':None} if chain_starts is None: chain_starts = np.array([0]) @@ -361,44 +346,16 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): assert diff_h < 3E-4, diff_h diff_J = np.max(np.abs(old_potts_model['J'] - self.potts_model['J'])) assert diff_J < 3E-4, diff_J - """ - J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) - h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) - - # compute burial and contact energies - self.burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * self.burial_indicator[:, np.newaxis, :] - direct = self.direct_indicator * self.direct_gamma[J_index[2], J_index[3]] - water_mediated = self.water_indicator * self.water_gamma[J_index[2], J_index[3]] - protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] - contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - - electrostatics_energy = -self.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ - * self.electrostatics_mask[:,:,np.newaxis,np.newaxis] - contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) - - self.contact_energy = contact_energy - - # Compute potts model - self.potts_model = {} - self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, :]#self.aa_map_awsem_list] - assert self.potts_model['h'].shape == (self.N, self.q), self.potts_model['h'].shape - self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, :, :]#self.aa_map_awsem_x, self.aa_map_awsem_y] - assert self.potts_model['J'].shape == (self.N, self.N, self.q, self.q), self.potts_model['J'].shape - breakpoint() - """ - # Set the gap energy to 
zero - #self.potts_model['h'][:, 0] = 0 - #self.potts_model['J'][:, :, 0, :] = 0 - #self.potts_model['J'][:, :, :, 0] = 0 else: - print("""self.potts was False; will not calculate and store potts model. + warnings.warn(""" + potts_option was False; will not calculate and store potts model. Energies will be computed on the fly as needed for frustration calculations and then discarded. - If you want to get the energies for your own purposes, set self.potts - to True and then call this method again.""") + If you want to get the energies for your own purposes, set self.potts_option=True + and then call calculate_energy_and_potts.""") # not really sure what the point of this is def native_energy(self): - if self.potts: + if self.potts_option: if not hasattr(self, 'potts_model'): # create potts model if it doesn't already exist self.calculate_energy_and_potts() energy = super().native_energy() # method to compute native energy given potts model @@ -446,7 +403,7 @@ def __init__(self, pdb_structure: object | tuple, # tuple is an object, but this clarifies what we expect sequence: str =None, expose_indicator_functions: bool=False, - potts: bool=False, + potts_option: bool=False, alt_sigma_wat: bool=False, **parameters)->object: """ @@ -473,7 +430,7 @@ def __init__(self, alt_sigma_wat : bool=False Whether to use alternative functional form for sigma_wat (experimental feature) **parameters: - Used to initialize an AWSEMHamiltonianParameters, which becomes an attribute + Used to initialize an ParametersAWSEM, which becomes an attribute of this class and helps us organize the parameters of our AWSEM Hamiltonian Returns @@ -496,7 +453,7 @@ def __init__(self, raise # load structure-independent parameters and methods - super().__init__(sequence, expose_indicator_functions, potts, **parameters) + super().__init__(sequence, expose_indicator_functions, potts_option, **parameters) self.alt_sigma_wat = alt_sigma_wat # set up strucure @@ -586,14 +543,14 @@ def 
calculate_indicators(self): electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / self.p.electrostatics_screening_length) self.electrostatics_indicator = electrostatics_indicator else: - print(""" + warnings.warn(""" self.expose_indicator_functions was False; will not calculate and store indicator functions. Indicator functions will be computed on the fly as needed for energy calculations and then discarded. If you want to get the indicator functions for your own purposes, - set expose_indicator_functions to True - and then call self.calculate_indicators().""") + set expose_indicator_functions=True + and then call calculate_indicators().""") # make self.pdb_structure into a property so that structure-dependent # stuff is recalculated automatically when we change the conformation @@ -611,6 +568,7 @@ def pdb_structure(self,pdb_structure): self.subclass_setup_helper() def change_conformation(self,alt_conf): # this method is an alias for the setter + # Keep this method if the setter is too slow self.pdb_structure = alt_conf # self.masked_indicators is calculated from other attributes, @@ -769,7 +727,7 @@ def __init__(self, electrostatics_indicator: Union[np.ndarray, None], sequence: str, # sequence is optional if we initialize from a Structure but not here expose_indicator_functions: bool=False, - potts : bool=False, + potts_option : bool=False, absolute_value_gamma: bool=False, **parameters)->object: """ @@ -796,7 +754,7 @@ def __init__(self, The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. 
- potts : bool + potts_option : bool Whether to set up the potts model (can be RAM-intensive and therefore time-intensive), which is unnecessary if all you want is to perform certain frustration calculations, for which energies can be computed on the fly instead of saved in memory. @@ -808,7 +766,7 @@ def __init__(self, AWSEMIndicators object """ - super().__init__(sequence, expose_indicator_functions, potts, **parameters) + super().__init__(sequence, expose_indicator_functions, potts_option, **parameters) self.burial_indicator = burial_indicator self.direct_indicator = direct_indicator self.protein_indicator = protein_indicator @@ -883,7 +841,7 @@ def __init__(self, # if we already have our indicator functions, # our goal is probably to compute the potts model, # so we'll just hard code a value of True for that argument VVVV - super().__init__(sequence, expose_indicator_functions, potts=True, **parameters) + super().__init__(sequence, expose_indicator_functions, potts_option=True, **parameters) self.covariance_matrix = covariance_matrix self.num_indicators = 3*self.N + 4*(self.N**2-self.N)/2 # low, med, high burial for each N, 4 classes of pair interactions self.subclass_setup_helper() diff --git a/frustratometer/frustration/frustration.py b/frustratometer/frustration/frustration.py index 61adb1b7..93374813 100644 --- a/frustratometer/frustration/frustration.py +++ b/frustratometer/frustration/frustration.py @@ -365,6 +365,7 @@ def compute_singleresidue_decoy_energy_fluctuation(seq: str, pos1, aa1 = np.meshgrid(np.arange(seq_len), np.arange(q), indexing='ij', sparse=True) decoy_energy = np.zeros([seq_len, q]) + # potts_model['h'][pos1, aa1] == potts_model['h'] decoy_energy -= (potts_model['h'][pos1, aa1] - potts_model['h'][pos1, seq_index[pos1]]) # h correction aa1 j_correction = np.zeros([seq_len, seq_len, q]) From ddba96fd2f047d02bea0dd4ce6920cb4270e6e24 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Thu, 11 Dec 2025 13:45:21 -0600 Subject: [PATCH 74/76] 
adding properties and restructuring hierarchy of attributes in AWSEM; still a work in progress. passing all tests except subselection and the usual dca failed ones that call external software --- frustratometer/classes/AWSEM.py | 305 +++++++++++++++++------ frustratometer/classes/Frustratometer.py | 8 +- tests/test_awsem_frustratometer.py | 74 +++--- 3 files changed, 268 insertions(+), 119 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 9ac476f9..94aab91b 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -100,9 +100,8 @@ def __init__(self, _AWSEMBase object """ - # set sequence based on argument - self.N = len(sequence) - self.sequence = sequence + # set sequence based on arguments + self._sequence = sequence # set indicator function exposure based on argument # (not exposing them saves a tiny bit of RAM but it's useful to Ezequiel) @@ -134,8 +133,8 @@ def __init__(self, # doing this arguably defeats the purpose of having an # AWSEMHamilonianParams class; # we should think about how we can clean up the namespace of this (_AWSEMBase) class - for field, value in p: - setattr(self, field, value) + #for field, value in p: + # setattr(self, field, value) self.p = p # set gamma @@ -150,57 +149,68 @@ def __init__(self, self.q = len(gamma.alphabet) # most likely 20, but could be different gb = gamma['Burial'] if gb.shape == (3,self.q): - self.burial_gamma = gb.T + self._burial_gamma = gb.T elif gb.shape == (self.q,3): - self.burial_gamma = gb + self._burial_gamma = gb else: raise ValueError(f"""Don't know how to parse burial gamma with shape {gb.shape}. 
Expected ({self.q},3) or (3,{self.q}).""") # pairwise gamma: squeeze to remove extra axis that is commonly present - self.direct_gamma = np.squeeze(gamma['Direct']) - self.protein_gamma = np.squeeze(gamma['Protein']) - self.water_gamma = np.squeeze(gamma['Water']) - assert self.direct_gamma.shape == self.protein_gamma.shape == self.water_gamma.shape == (self.q,self.q) + self._direct_gamma = np.squeeze(gamma['Direct']) + self._protein_gamma = np.squeeze(gamma['Protein']) + self._water_gamma = np.squeeze(gamma['Water']) + if not self._direct_gamma.shape == self._protein_gamma.shape ==\ + self._water_gamma.shape == (self.q,self.q): + raise ValueError(f""" + direct and/or protein and/or water gammas had unexpected shape. + Expected (q,q) or (1,q,q). Got shapes of: + direct: {self._direct_gamma.shape} + protein: {self._protein_gamma.shape} + water: {self._direct_gamma.shape}""") # electrostatic gamma ordered_charges = np.zeros(self.q) for counter in range(self.q): try: - ordered_charges[counter] = self.charge_dict[gamma.alphabet[counter]] + ordered_charges[counter] = self.p.charge_dict[self.p.gamma.alphabet[counter]] except KeyError as e: raise Exception(f""" - One-letter code {order[counter]} from alphabet {gamma.alphabet} + One-letter code {order[counter]} from alphabet {self.p.gamma.alphabet} with unknown charge. 
If use of this noncanonical AA in intentional, you must supply a custom charge_dict so that we know how to calculate the electrostatic potential.""") charges2 = ordered_charges[:,np.newaxis] * ordered_charges[np.newaxis,:] if self.p.k_electrostatics != 0: - self.sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) - self.distance_cutoff=None # the distance matrix isn't guaranteed to exist in all subclasses, + self._sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) + self._distance_cutoff=None # the distance matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- # it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix else: - self.sequence_cutoff=self.p.min_sequence_separation_contact - self.distance_cutoff=self.p.distance_cutoff_contact # the distance matrix isn't guaranteed to exist in all subclasses, + self._sequence_cutoff=self.p.min_sequence_separation_contact + self._distance_cutoff=self.p.distance_cutoff_contact # the distance matrix isn't guaranteed to exist in all subclasses, # but it doesn't hurt to define the distance_cutoff attribute-- # it's just like any other parameter, such as sequence_cutoff, # that only matters if we need to compute a mask from a distance matrix - self.charges2 = charges2 + self._charges2 = charges2 # helpful ? 
- self.gamma = self.p.gamma + self._gamma = self.p.gamma # set other attributes self.burial_in_context = self.p.burial_in_context - self.aa_freq = frustration.compute_aa_freq(self.sequence, self.gamma.alphabet) - self.contact_freq = frustration.compute_contact_freq(self.sequence, self.gamma.alphabet) + self._aa_freq = frustration.compute_aa_freq(self.sequence, self.p.gamma.alphabet) + self._contact_freq = frustration.compute_contact_freq(self.sequence, self.p.gamma.alphabet) self._decoy_fluctuation = {} # used for mutational calculation, possibly others - self.minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ + self._minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ self._native_energy = None self._seq_index = None ################################################################################## # quantities previously defined as attributes that are calculated based on # other attributes should be converted to properties + @property + def N(self): + return len(self.sequence) + @property def electrostatics_gamma(self): # used to be distinct from charges2 but eliminating the distinction return self.charges2 # makes these gammas more analogous to the other gammas @@ -210,7 +220,7 @@ def electrostatic_gamma(self): @property def alphabet(self): - return self.gamma.alphabet # this allows us to access the alphabet in the same way as for DCA instances + return self.p.gamma.alphabet # this allows us to access the alphabet in the same way as for DCA instances @alphabet.setter # the user might think they can change the alphabet like the conformation (see AWSEM), but that's not supported def alphabet(self): raise AttributeError("Changing the underlying alphabet is prohibited. 
Instead, create a new AWSEM instance from a different Gamma.") @@ -239,8 +249,137 @@ def coefficient_lambda_gamma_array(self): def coefficient_lambda_gamma_array(self): raise AttributeError(f"""Setting {self.__class__}.coefficient_lambda_gamma_array directly is not allowed. Initialize a new instance with a different - {self.__class__}.k_contact, {self.__class__}.burial_gamma, {self.__class__}.direct_gamma, - {self.__class__}.protein_gamma, or {self.__class__}.water_gamma instead.""") + {self.__class__}.p.k_contact, {self.__class__}.burial_gamma, {self.__class__}.direct_gamma, + {self.__class__}.p.gamma.protein_gamma, or {self.__class__}.water_gamma instead.""") + + # other properties for extra protection + @property + def sequence(self): + return self._sequence + @sequence.setter + def sequence(self): + raise NotImplementedError("Modifying the sequence is not permitted. May add support at some point.") + + # making properties into setters + @property + def burial_gamma(self): + return self._burial_gamma + @burial_gamma.setter + def burial_gamma(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.burial_gamma") + + @property + def direct_gamma(self): + return self._direct_gamma + @direct_gamma.setter + def direct_gamma(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.direct_gamma") + + @property + def protein_gamma(self): + return self._protein_gamma + @protein_gamma.setter + def protein_gamma(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.protein_gamma") + + @property + def water_gamma(self): + return self._water_gamma + @water_gamma.setter + def water_gamma(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.water_gamma") + + @property + def sequence_cutoff(self): + return self._sequence_cutoff + @sequence_cutoff.setter + def sequence_cutoff(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.sequence_cutoff") + + 
@property + def distance_cutoff(self): + return self._distance_cutoff + @distance_cutoff.setter + def distance_cutoff(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.distance_cutoff") + + @property + def charges2(self): + return self._charges2 + @charges2.setter + def charges2(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.charges2") + + @property + def gamma(self): + return self._gamma + @gamma.setter + def gamma(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.gamma") + + @property + def aa_freq(self): + return self._aa_freq + @aa_freq.setter + def aa_freq(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.aa_freq") + + @property + def contact_freq(self): + return self._contact_freq + @contact_freq.setter + def contact_freq(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.contact_freq") + + @property + def minimally_frustrated_threshold(self): + return self._minimally_frustrated_threshold + @minimally_frustrated_threshold.setter + def minimally_frustrated_threshold(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.minimally_frustrated_threshold") + + @property + def sequence_mask_rho(self): + return self._sequence_mask_rho + @sequence_mask_rho.setter + def sequence_mask_rho(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.sequence_mask_rho") + + @property + def sequence_mask_contact(self): + return self._sequence_mask_contact + @sequence_mask_contact.setter + def sequence_mask_contact(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.sequence_mask_contact") + + @property + def electrostatics_mask(self): + return self._electrostatics_mask + @electrostatics_mask.setter + def electrostatics_mask(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.electrostatics_mask") + + @property + def 
mask(self): + return self._mask + @mask.setter + def mask(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.mask") + + @property + def selected_matrix(self): + return self._selected_matrix + @selected_matrix.setter + def selected_matrix(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.selected_matrix") + + @property + def potts_model(self): + return self._potts_model + @potts_model.setter + def potts_model(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.potts_model") + ################################################################################## # methods for subclass initialization @@ -270,24 +409,24 @@ def calculate_masks(self): selected_matrix=self.full_pdb_distance_matrix else: selected_matrix=self.distance_matrix - self.sequence_mask_rho = frustration.compute_mask(selected_matrix, + self._sequence_mask_rho = frustration.compute_mask(selected_matrix, maximum_contact_distance=None, minimum_sequence_separation = self.p.min_sequence_separation_rho) - self.sequence_mask_contact = frustration.compute_mask(self.distance_matrix, + self._sequence_mask_contact = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.p.distance_cutoff_contact, minimum_sequence_separation = self.p.min_sequence_separation_contact) - self.electrostatics_mask = frustration.compute_mask(self.distance_matrix, + self._electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=self.p.min_sequence_separation_electrostatics) #with open('my_data.txt','w') as f: # f.write(f"self.distance_cutoff: {self.distance_cutoff}\n") # f.write(f"self.sequence_cutoff: {self.sequence_cutoff}\n") #np.save('my_distance_matrix.npy',self.distance_matrix) - self.mask = frustration.compute_mask(self.distance_matrix, + self._mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.distance_cutoff, 
minimum_sequence_separation = self.sequence_cutoff) #np.save('my_mask_new.npy',self.mask) - self.selected_matrix = selected_matrix # we'll need this in the calculate_indicators function + self._selected_matrix = selected_matrix # we'll need this in the calculate_indicators function def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): # chain_starts and chain_ends should be calculated based on object attributes @@ -297,30 +436,30 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): Constructing full potts model in RAM. If you don't want to do this, set potts_option=False """) - self.potts_model = {'h':None, 'J':None} + self._potts_model = {'h':None, 'J':None} if chain_starts is None: chain_starts = np.array([0]) if chain_ends is None: chain_ends = np.array([len(self.seq_index)-1]) - if self.distance_cutoff_contact is None: + if self.p.distance_cutoff_contact is None: contact_max_dist = 12.5 else: - contact_max_dist = self.distance_cutoff_contact - self.potts_model['h'] = ham.compute_potts_model_h_parallel( - self.min_sequence_separation_rho, + contact_max_dist = self.p.distance_cutoff_contact + self._potts_model['h'] = ham.compute_potts_model_h_parallel( + self.p.min_sequence_separation_rho, chain_starts, chain_ends, self.distance_matrix, - self.k_contact, self.burial_gamma) - self.potts_model['J'] = ham.compute_potts_model_J_parallel( - self.electrostatics_screening_length, self.min_sequence_separation_rho, - self.min_sequence_separation_contact, self.min_sequence_separation_electrostatics, + self.p.k_contact, self.burial_gamma) + self._potts_model['J'] = ham.compute_potts_model_J_parallel( + self.p.electrostatics_screening_length, self.p.min_sequence_separation_rho, + self.p.min_sequence_separation_contact, self.p.min_sequence_separation_electrostatics, chain_starts, chain_ends, - contact_max_dist, 10*self.electrostatics_screening_length, # maximum distance for contact potential, maximum for electrostatics + 
contact_max_dist, 10*self.p.electrostatics_screening_length, # maximum distance for contact potential, maximum for electrostatics self.distance_matrix, - self.k_contact, self.direct_gamma, - self.k_contact, self.protein_gamma, - self.k_contact, self.water_gamma, - self.k_electrostatics, self.electrostatics_gamma) + self.p.k_contact, self.direct_gamma, + self.p.k_contact, self.protein_gamma, + self.p.k_contact, self.water_gamma, + self.p.k_electrostatics, self.electrostatics_gamma) #breakpoint() #self.potts_model['J'] = ham.compute_potts_model_J( # self.distance_matrix, ) @@ -334,7 +473,7 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - electrostatics_energy = -self.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ + electrostatics_energy = -self.p.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ * self.electrostatics_mask[:,:,np.newaxis,np.newaxis] contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) old_contact_energy = contact_energy @@ -356,7 +495,7 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): # not really sure what the point of this is def native_energy(self): if self.potts_option: - if not hasattr(self, 'potts_model'): # create potts model if it doesn't already exist + if not hasattr(self, '_potts_model'): # create potts model if it doesn't already exist self.calculate_energy_and_potts() energy = super().native_energy() # method to compute native energy given potts model else: @@ -422,7 +561,7 @@ def __init__(self, The amino acid sequence 
expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. - potts: bool + potts_option: bool Whether to set up the potts model (can be RAM-intensive and therefore time-intensive), which is unnecessary if all you want to get is the indicator functions or perform certain frustration calculations, for which energies can be computed @@ -466,12 +605,22 @@ def setup_structure(self, pdb_structure): # maybe our type check here should be more restrictive, # but the __init__ only requires pdb_structure to be an object, # so I'll take my cue from that + # check that the sequence of our Structure is consistent with the current sequence + if self.sequence != pdb_structure.sequence: + raise NotImplementedError(f""" + You are attempting to modify the sequence of your {self.__class__} + by passing in a Structure with a different sequence than self.sequence. + This is currently not supported but may be in the future.""") + # check that the length of the sequence of our Structure is consistent + if self.N != len(pdb_structure.sequence): + raise NotImplementedError(f""" + You are attempting to modify the length of the sequence of your + {self.__class__} by passing in a Structure with a different sequence than + self.sequence. This is currently not supported but may be in the future.""") # check structure selection_CB = pdb_structure.structure.select('name CB or (resname GLY IGL and name CA)') resid = selection_CB.getResindices() - N=len(resid) self.resid = resid - self.N = N # set structure-dependent properties self._pdb_structure = pdb_structure self.structure=pdb_structure.structure @@ -503,6 +652,10 @@ def setup_structure(self, pdb_structure): # either remain the same (if this method has been previously called with a Structure) # or go undefined (if we are passing a list of arrays the first time that we are calling # this method) + else: + raise TypeError(""" + Could not parse pdb_structure tuple. 
+ Check the source code or make pdb_structure into a Structure object.""") else: raise AssertionError("unexpected else block") @@ -597,7 +750,7 @@ def masked_indicators(self): # implementations of frustration algorithms def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] - _AA = self.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' + _AA = self.p.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' if aa_freq is None: seq_index = self.seq_index N=self.N @@ -608,22 +761,22 @@ def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): seq_index = np.random.choice(a=len(aa_freq), size=N, p=probabilities) distances = np.triu(self.distance_matrix) - distances = distances[(distances0)] + distances = distances[(distances0)] rho_b = np.expand_dims(self.rho_r, 1) #(n,1) rho1 = np.expand_dims(self.rho_r, 0) #(1,n) rho2 = np.expand_dims(self.rho_r, 1) #(n,1) - sigma_water = 0.25 * (1 - np.tanh(self.eta_sigma * (rho1 - self.rho_0))) * (1 - np.tanh(self.eta_sigma * (rho2 - self.rho_0))) #(n,n) + sigma_water = 0.25 * (1 - np.tanh(self.p.eta_sigma * (rho1 - self.p.rho_0))) * (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0))) #(n,n) sigma_protein = 1 - sigma_water #(n,n) #Calculate theta and indicators - theta = 0.25 * (1 + np.tanh(self.eta * (distances - self.r_min))) * (1 + np.tanh(self.eta * (self.r_max - distances))) # (c,) - thetaII = 0.25 * (1 + np.tanh(self.eta * (distances - self.r_minII))) * (1 + np.tanh(self.eta * (self.r_maxII - distances))) #(c,) - burial_indicator = np.tanh(self.burial_kappa * (rho_b - self.burial_ro_min)) + np.tanh(self.burial_kappa * (self.burial_ro_max - rho_b)) #(n,3) + theta = 0.25 * (1 + np.tanh(self.p.eta * (distances - self.p.r_min))) * (1 + np.tanh(self.p.eta * (self.p.r_max - distances))) # (c,) + thetaII = 0.25 * (1 + np.tanh(self.p.eta * (distances - self.p.r_minII))) * (1 + np.tanh(self.p.eta * 
(self.p.r_maxII - distances))) #(c,) + burial_indicator = np.tanh(self.p.burial_kappa * (rho_b - self.p.burial_ro_min)) + np.tanh(self.p.burial_kappa * (self.p.burial_ro_max - rho_b)) #(n,3) charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) - electrostatics_indicator = np.exp(-distances / self.electrostatics_screening_length) / distances + electrostatics_indicator = np.exp(-distances / self.p.electrostatics_screening_length) / distances decoy_energies=np.zeros(n_decoys) #decoy_data=[None]*n_decoys @@ -638,14 +791,14 @@ def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): q2=seq_index[qi2] - burial_energy1 = (-0.5 * self.k_contact * self.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) - burial_energy2 = (-0.5 * self.k_contact * self.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) + burial_energy1 = (-0.5 * self.p.k_contact * self.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) + burial_energy2 = (-0.5 * self.p.k_contact * self.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) direct = theta[c] * self.direct_gamma[q1, q2] water_mediated = sigma_water[n1,n2] * thetaII[c] * self.water_gamma[q1,q2] protein_mediated = sigma_protein[n1,n2] * thetaII[c] * self.protein_gamma[q1,q2] - contact_energy = -self.k_contact * (direct+water_mediated+protein_mediated) - electrostatics_energy = self.k_electrostatics * electrostatics_indicator[c]*charges[q1]*charges[q2] + contact_energy = -self.p.k_contact * (direct+water_mediated+protein_mediated) + electrostatics_energy = self.p.k_electrostatics * electrostatics_indicator[c]*charges[q1]*charges[q2] decoy_energies[i]=(burial_energy1+burial_energy2+contact_energy+electrostatics_energy) #decoy_data[i]=[i, qi1, qi2, q1, q2, n1, n2, distances[c], self.rho_r[n1], self.rho_r[n2], contact_energy/4.184, burial_energy1/4.184, burial_energy2/4.184, electrostatics_energy/4.184, decoy_energies[i]] @@ -655,15 +808,15 @@ def compute_configurational_decoy_statistics(self, 
n_decoys=4000,aa_freq=None): return mean_decoy_energy, std_decoy_energy def compute_configurational_energies(self): - _AA= self.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' + _AA= self.p.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' seq_index = self.seq_index distances = np.triu(self.distance_matrix) - distances = distances[(distances0)] + distances = distances[(distances0)] n_contacts=len(distances) n = self.distance_matrix.shape[0] # Assuming self.distance_matrix is defined and square tri_upper_indices = np.triu_indices(n, k=1) # k=1 excludes the diagonal - valid_pairs = (self.distance_matrix[tri_upper_indices] < self.distance_cutoff_contact) & \ + valid_pairs = (self.distance_matrix[tri_upper_indices] < self.p.distance_cutoff_contact) & \ (self.distance_matrix[tri_upper_indices] > 0) indices1,indices2 = (tri_upper_indices[0][valid_pairs], tri_upper_indices[1][valid_pairs]) @@ -674,16 +827,16 @@ def compute_configurational_energies(self): rho1 = np.expand_dims(self.rho_r, 0) #(1,n) rho2 = np.expand_dims(self.rho_r, 1) #(n,1) - sigma_water = 0.25 * (1 - np.tanh(self.eta_sigma * (rho1 - self.rho_0))) * (1 - np.tanh(self.eta_sigma * (rho2 - self.rho_0))) #(n,n) + sigma_water = 0.25 * (1 - np.tanh(self.p.eta_sigma * (rho1 - self.p.rho_0))) * (1 - np.tanh(self.p.eta_sigma * (rho2 - self.p.rho_0))) #(n,n) sigma_protein = 1 - sigma_water #(n,n) #Calculate theta and indicators - theta = 0.25 * (1 + np.tanh(self.eta * (distances - self.r_min))) * (1 + np.tanh(self.eta * (self.r_max - distances))) # (c,) - thetaII = 0.25 * (1 + np.tanh(self.eta * (distances - self.r_minII))) * (1 + np.tanh(self.eta * (self.r_maxII - distances))) #(c,) - burial_indicator = np.tanh(self.burial_kappa * (rho_b - self.burial_ro_min)) + np.tanh(self.burial_kappa * (self.burial_ro_max - rho_b)) #(n,3) + theta = 0.25 * (1 + np.tanh(self.p.eta * (distances - self.p.r_min))) * (1 + np.tanh(self.p.eta * (self.p.r_max - distances))) # (c,) + thetaII = 0.25 * (1 + np.tanh(self.p.eta * (distances - self.p.r_minII))) 
* (1 + np.tanh(self.p.eta * (self.p.r_maxII - distances))) #(c,) + burial_indicator = np.tanh(self.p.burial_kappa * (rho_b - self.p.burial_ro_min)) + np.tanh(self.p.burial_kappa * (self.p.burial_ro_max - rho_b)) #(n,3) charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) - electrostatics_indicator = np.exp(-distances / self.electrostatics_screening_length) / distances + electrostatics_indicator = np.exp(-distances / self.p.electrostatics_screening_length) / distances # decoy_data_columns=['decoy_i','i_resno','j_resno','ires_type','jres_type','aa1','aa2','rij','rho_i','rho_j','water_energy','burial_energy_i','burial_energy_j','electrostatic_energy','total_energies'] # decoy_data=[] @@ -694,14 +847,14 @@ def compute_configurational_energies(self): q1=seq_index[n1] q2=seq_index[n2] - burial_energy1 = (-0.5 * self.k_contact * self.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) - burial_energy2 = (-0.5 * self.k_contact * self.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) + burial_energy1 = (-0.5 * self.p.k_contact * self.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) + burial_energy2 = (-0.5 * self.p.k_contact * self.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) direct = theta[c] * self.direct_gamma[q1, q2] water_mediated = sigma_water[n1,n2] * thetaII[c] * self.water_gamma[q1,q2] protein_mediated = sigma_protein[n1,n2] * thetaII[c] * self.protein_gamma[q1,q2] - contact_energy = -self.k_contact * (direct+water_mediated+protein_mediated) - electrostatics_energy = self.k_electrostatics * electrostatics_indicator[c]*charges[q1]*charges[q2] + contact_energy = -self.p.k_contact * (direct+water_mediated+protein_mediated) + electrostatics_energy = self.p.k_electrostatics * electrostatics_indicator[c]*charges[q1]*charges[q2] energy=(burial_energy1+burial_energy2+contact_energy+electrostatics_energy) configurational_energies[n1,n2]=energy @@ -936,7 +1089,7 @@ def calculate_energy_and_potts(self): contact_energy = 
self.p.k_contact * np.array([direct, protein_mediated, water_mediated]) if self.p.k_electrostatics!=0: template[triu_indices] = self.pairwise_variances[3*num_upper:] - electrostatics_energy = -self.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * (template+template.T)[:,:,np.newaxis,np.newaxis]**2 + electrostatics_energy = -self.p.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * (template+template.T)[:,:,np.newaxis,np.newaxis]**2 contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) # for the variance potts model, there is one more kind of two-body interaction: # burial-pairwise covariance when the pairwise energy term involves the residue in the burial term @@ -1009,9 +1162,9 @@ def calculate_energy_and_potts(self): self.contact_energy = contact_energy # Compute potts model - self.potts_model = {} - self.potts_model['h'] = self.burial_energy.sum(axis=-1)[:, :]#self.aa_map_awsem_list] - self.potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, :, :]#self.aa_map_awsem_x, self.aa_map_awsem_y] + self._potts_model = {} + self._potts_model['h'] = self.burial_energy.sum(axis=-1)[:, :]#self.aa_map_awsem_list] + self._potts_model['J'] = self.contact_energy.sum(axis=0)[:, :, :, :]#self.aa_map_awsem_x, self.aa_map_awsem_y] # Set the gap energy to zero #self.potts_model['h'][:, 0] = 0 #self.potts_model['J'][:, :, 0, :] = 0 diff --git a/frustratometer/classes/Frustratometer.py b/frustratometer/classes/Frustratometer.py index 462b1113..ee608706 100644 --- a/frustratometer/classes/Frustratometer.py +++ b/frustratometer/classes/Frustratometer.py @@ -67,10 +67,6 @@ def native_energy(self,sequence:str = None,ignore_couplings_of_gaps:bool=False,i self._native_energy=frustration.compute_native_energy(sequence, self.potts_model, self.mask, self.alphabet, ignore_couplings_of_gaps, ignore_fields_of_gaps) else: - # For the direct children of this Frustratometer class, DCA and AWSEMBase, - # 
changing the alphabet or gammas of an instance is not allowed, so there should never be a case - # where we have a previously defined (not None) but "out-of-date" _native_energy. - # Still, we will check that our code is working as intended new = frustration.compute_native_energy( sequence, self.potts_model, self.mask, self.alphabet, ignore_couplings_of_gaps, ignore_fields_of_gaps) @@ -80,8 +76,8 @@ def native_energy(self,sequence:str = None,ignore_couplings_of_gaps:bool=False,i the native energy of your system is now different from what it was originally computed to be. Our code probably should prevent this from happening, but you can prevent it too by not changing the alphabet - or any other parameters after initializing your DCA or child of - AWSEMBase. + or any other parameters after initializing your DCA or AWSEM-family + class (anything that inherits from _AWSEMBase). Previous value of {self.__class__}._native_energy: {self._native_energy} New value of {self.__class__}._native_energy: {new}""") diff --git a/tests/test_awsem_frustratometer.py b/tests/test_awsem_frustratometer.py index 510be940..8798070c 100644 --- a/tests/test_awsem_frustratometer.py +++ b/tests/test_awsem_frustratometer.py @@ -31,7 +31,7 @@ def test_density_residues(test_data): sequence_separation = 2 if test_data['seqsep'] == 3 else 13 model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, k_electrostatics=0, - expose_indicator_functions=True, potts=True) + expose_indicator_functions=True, potts_option=True) data = pd.read_csv(test_data['singleresidue'], delim_whitespace=True) data['Calculated_density'] = model.rho_r data['Expected_density'] = data['DensityRes'] @@ -50,7 +50,7 @@ def test_single_residue_frustration(test_data): sequence_separation = 2 if test_data['seqsep'] == 3 else 13 model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, 
min_sequence_separation_contact=2, k_electrostatics=test_data['k_electrostatics'] * 4.184, - min_sequence_separation_electrostatics=1, expose_indicator_functions=True, potts=True) + min_sequence_separation_electrostatics=1, expose_indicator_functions=True, potts_option=True) data = pd.read_csv(test_data['singleresidue'], delim_whitespace=True) data['Calculated_frustration'] = model.frustration(kind='singleresidue') data['Expected_frustration'] = data['FrstIndex'] @@ -70,7 +70,7 @@ def test_mutational_frustration(test_data): return model = frustratometer.AWSEM(structure, distance_cutoff_contact=9.5, min_sequence_separation_rho=sequence_separation, min_sequence_separation_contact=0, k_electrostatics=test_data['k_electrostatics'] * 4.184, - min_sequence_separation_electrostatics=1, expose_indicator_functions=True, potts=True) + min_sequence_separation_electrostatics=1, expose_indicator_functions=True, potts_option=True) data = pd.read_csv(test_data['mutational'], delim_whitespace=True) if test_data['pdb']!="ijge": @@ -112,7 +112,7 @@ def test_configurational_frustration(test_data): min_sequence_separation_contact=0, k_electrostatics=test_data['k_electrostatics'] * 4.184, min_sequence_separation_electrostatics=1, - expose_indicator_functions=True, potts=True) + expose_indicator_functions=True, potts_option=True) data = pd.read_csv(test_data['configurational'], delim_whitespace=True) @@ -151,13 +151,13 @@ def test_residue_density_calculation(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=2, - expose_indicator_functions=True, potts=True) + expose_indicator_functions=True, potts_option=True) assert np.round(model.rho_r,2).all()==np.round(expected_rho_values,2).all() def test_AWSEM_native_energy(): structure=frustratometer.Structure(test_data_path/f'1l63.pdb',"A") model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact 
= 10, - distance_cutoff_contact = None, expose_indicator_functions=True, potts=True) + distance_cutoff_contact = None, expose_indicator_functions=True, potts_option=True) e = model.native_energy() print(e) assert np.round(e, 0) == -915 @@ -165,7 +165,7 @@ def test_AWSEM_native_energy(): def test_AWSEM_fields_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, - distance_cutoff_contact = None, expose_indicator_functions=True, potts=True) + distance_cutoff_contact = None, expose_indicator_functions=True, potts_option=True) e = model.fields_energy() print(e) assert np.round(e, 0) == -555 @@ -173,14 +173,14 @@ def test_AWSEM_fields_energy(): def test_AWSEM_couplings_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,k_electrostatics=0, min_sequence_separation_contact = 10, distance_cutoff_contact = None, - expose_indicator_functions=True, potts=True) + expose_indicator_functions=True, potts_option=True) e = model.couplings_energy() print(e) assert np.round(e, 0) == -362 def test_fields_couplings_AWSEM_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") - model = frustratometer.AWSEM(structure, expose_indicator_functions=True, potts=True) + model = frustratometer.AWSEM(structure, expose_indicator_functions=True, potts_option=True) assert model.fields_energy() + model.couplings_energy() - model.native_energy() < 1E-6 def test_single_residue_AWSEM_energy(): @@ -191,7 +191,7 @@ def test_single_residue_AWSEM_energy(): model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=2, k_electrostatics=0, - expose_indicator_functions=True, potts=True) + expose_indicator_functions=True, potts_option=True) #Calculate fields seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) @@ -236,7 +236,7 @@ def 
test_numba_potts_construction(): protein_mediated = model.protein_indicator * model.protein_gamma[J_index[2], J_index[3]] contact_energy = model.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * model.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - electrostatics_energy = -model.k_electrostatics * model.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * model.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ + electrostatics_energy = -model.p.k_electrostatics * model.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * model.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ * model.electrostatics_mask[:,:,np.newaxis,np.newaxis] contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) old_contact_energy = contact_energy @@ -251,25 +251,25 @@ def test_numba_potts_construction(): new_potts_model = {'h':None, 'J':None} chain_starts = np.array([0]) chain_ends = np.array([len(model.seq_index)-1]) - if model.distance_cutoff_contact is None: + if model.p.distance_cutoff_contact is None: contact_max_dist = 12.5 else: - contact_max_dist = model.distance_cutoff_contact + contact_max_dist = model.p.distance_cutoff_contact new_potts_model['h'] = ham.compute_potts_model_h_parallel( - model.min_sequence_separation_rho, + model.p.min_sequence_separation_rho, chain_starts, chain_ends, model.distance_matrix, - model.k_contact, model.burial_gamma) + model.p.k_contact, model.burial_gamma) new_potts_model['J'] = ham.compute_potts_model_J_parallel( - model.electrostatics_screening_length, model.min_sequence_separation_rho, - model.min_sequence_separation_contact, model.min_sequence_separation_electrostatics, + model.p.electrostatics_screening_length, model.p.min_sequence_separation_rho, + model.p.min_sequence_separation_contact, model.p.min_sequence_separation_electrostatics, chain_starts, chain_ends, - contact_max_dist, 10*model.electrostatics_screening_length, # maximum distance for contact potential, maximum 
for electrostatics + contact_max_dist, 10*model.p.electrostatics_screening_length, # maximum distance for contact potential, maximum for electrostatics model.distance_matrix, - model.k_contact, model.direct_gamma, - model.k_contact, model.protein_gamma, - model.k_contact, model.water_gamma, - model.k_electrostatics, model.electrostatics_gamma) + model.p.k_contact, model.direct_gamma, + model.p.k_contact, model.protein_gamma, + model.p.k_contact, model.water_gamma, + model.p.k_electrostatics, model.electrostatics_gamma) #np.save('new_way_h.npy',new_potts_model['h']) #np.save('new_way_J.npy',new_potts_model['J']) assert np.max(np.abs(old_potts_model['h'] - new_potts_model['h'])) < 1E-5 # 10^-5 kJ/mol error is acceptable @@ -287,7 +287,7 @@ def test_contact_pair_AWSEM_energy(): structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=0, - k_electrostatics=0, expose_indicator_functions=True, potts=True) + k_electrostatics=0, expose_indicator_functions=True, potts_option=True) #Calculate fields seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) seq_len = len(seq_index) @@ -307,13 +307,13 @@ def test_contact_pair_AWSEM_energy(): def test_selected_subsequence_AWSEM_contact_energy_matrix(): structure=frustratometer.Structure(test_data_path/f'4wnc.pdb',"A",seq_selection="resnum 3to26") - model=frustratometer.AWSEM(structure, expose_indicator_functions=True, potts=True) + model=frustratometer.AWSEM(structure, expose_indicator_functions=True, potts_option=True) q = len(model.gamma.alphabet) assert model.potts_model['h'].shape==(24,q) def test_selected_subsequence_AWSEM_burial_energy_matrix(): structure=frustratometer.Structure(test_data_path/f'4wnc.pdb',"A",seq_selection="resnum 150to315") - model=frustratometer.AWSEM(structure, expose_indicator_functions=True, potts=True) + model=frustratometer.AWSEM(structure, 
expose_indicator_functions=True, potts_option=True) q = len(model.gamma.alphabet) assert model.potts_model['J'].shape==(166,166,q,q) @@ -325,13 +325,13 @@ def test_selected_subsequence_AWSEM_rho_calculations(): #Substructure object substructure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts_option=True) model_1_init_index=model_1.init_index_shift; model_1_fin_index=model_1.fin_index_shift #Full structure object structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A") model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts_option=True) #Check if shape and entries of rho matrices are identical assert model_1.rho_r.shape==model_2.rho_r[model_1_init_index:model_1_fin_index].shape @@ -341,13 +341,13 @@ def test_selected_subsequence_AWSEM_burial_energy(): #Substructure object substructure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts_option=True) model_1_init_index=model_1.init_index_shift; model_1_fin_index=model_1.fin_index_shift #Full structure object structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A") model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) + 
distance_cutoff_contact=10.0, expose_indicator_functions=True, potts_option=True) #Check if burial energies are identical assert model_1.burial_energy.shape==model_2.burial_energy[model_1_init_index:model_1_fin_index].shape @@ -357,13 +357,13 @@ def test_selected_subsequence_AWSEM_contact_energy(): #Substructure object substructure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model_1=frustratometer.AWSEM(substructure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts_option=True) model_1_init_index=model_1.init_index_shift; model_1_fin_index=model_1.fin_index_shift #Full structure object structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A") model_2=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0, expose_indicator_functions=True, potts=True) + distance_cutoff_contact=10.0, expose_indicator_functions=True, potts_option=True) #Check if contact energies are identical assert model_1.contact_energy.shape==model_2.contact_energy[:,model_1_init_index:model_1_fin_index,model_1_init_index:model_1_fin_index,:,:].shape @@ -372,7 +372,7 @@ def test_selected_subsequence_AWSEM_contact_energy(): def test_selected_subsequence_AWSEM_burial_energy_without_protein_context(): structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True, potts=True) + distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True, potts_option=True) selected_region_burial=model.fields_energy() # Energy units are in kJ/mol assert np.round(selected_region_burial, 2) == -377.95 @@ 
-380,7 +380,7 @@ def test_selected_subsequence_AWSEM_burial_energy_without_protein_context(): def test_selected_subsequence_AWSEM_contact_energy_without_protein_context(): structure=frustratometer.Structure(test_data_path/f'1MBA_A.pdb',"A",seq_selection="resnum 39to146") model=frustratometer.AWSEM(structure, k_electrostatics=0.0,min_sequence_separation_contact=10, - distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True, potts=True) + distance_cutoff_contact=10.0,burial_in_context=False, expose_indicator_functions=True, potts_option=True) selected_region_contact=model.couplings_energy() # Energy units are in kJ/mol assert np.round(selected_region_contact, 2) == -148.92 @@ -391,7 +391,7 @@ def test_single_residue_decoy_AWSEM_energy_statistics(): ### structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.499, min_sequence_separation_contact=2, k_electrostatics=0, - expose_indicator_functions=True, potts=True) + expose_indicator_functions=True, potts_option=True) #Calculate fields seq_index = np.array([model.alphabet.index(aa) for aa in structure.sequence]) seq_len = len(seq_index) @@ -422,7 +422,7 @@ def test_contact_pair_decoy_AWSEM_energy_statistics(): ### structure=frustratometer.Structure(test_data_path/f'6u5e.pdb',"A") model=frustratometer.AWSEM(structure,distance_cutoff_contact=9.5, min_sequence_separation_contact=None, k_electrostatics=0, - expose_indicator_functions=True, potts=True) + expose_indicator_functions=True, potts_option=True) q = len(model.alphabet) #Calculate fields @@ -467,7 +467,7 @@ def test_expose_indicators(structure, k_electrostatics, min_sequence_separation_ """ Check that the AWSEM indicators exposed can reproduce the native energy, where E_native = -sum_{i} h_i - sum_{i,j} J_ij = sum_{i} gamma_i * I_i """ model=frustratometer.AWSEM(structure,k_electrostatics=k_electrostatics, min_sequence_separation_contact = 
min_sequence_separation_contact, - distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True, potts=True) + distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True, potts_option=True) q = len(model.alphabet) model_seq_index=np.array([model.alphabet.index(aa) for aa in model.sequence]) indicators1D=np.array(model.masked_indicators[0:3]) @@ -497,7 +497,7 @@ def test_expose_indicators(structure, k_electrostatics, min_sequence_separation_ # compute the potential accurately.""" # model=frustratometer.AWSEM(structure,k_electrostatics=k_electrostatics, # min_sequence_separation_contact = min_sequence_separation_contact, -# distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True, potts=True) +# distance_cutoff_contact = distance_cutoff_contact, expose_indicator_functions=True, potts_option=True) # ######################################## # # old way -- note that these are all negatives of the actual energy: # # burial, direct, protein, water don't have a factor of -1 when the should, From a8ede326dab1299c3922070ecde10401ead354ef Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Fri, 12 Dec 2025 11:05:31 -0600 Subject: [PATCH 75/76] refactored attributes and properties of _AWSEMBase --- frustratometer/classes/AWSEM.py | 498 +++++++++++++++-------------- frustratometer/classes/Gamma.py | 4 + tests/test_awsem_frustratometer.py | 24 +- 3 files changed, 275 insertions(+), 251 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 94aab91b..a451f6c1 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -14,6 +14,10 @@ __all__ = ['AWSEM','AWSEMIndicators','DecoyEnsemble', 'AWSEMVariancePotts'] class ParametersAWSEM(BaseModel): + # due to the presence of the pydantic BaseModel, + # these variables that look like class attributes + # are actually instance attributes + model_config = ConfigDict(extra='ignore', 
arbitrary_types_allowed=True) """Default parameters for AWSEM energy calculations.""" k_contact: float = Field(4.184, description=""" @@ -56,9 +60,9 @@ class ParametersAWSEM(BaseModel): k_electrostatics: float = Field(17.3636, description="Coefficient for electrostatic interactions. (kJ/mol)") electrostatics_screening_length: float = Field(10.0, description="Screening length for electrostatic interactions. (Angstrom)") - # We might not know the order of amino acids in our alphabet at the time of instantiating this class - # (this happens the above gammas are Paths), so we'll have to build the electrostatic "gamma" when - # initializing _AWSEMBase. Fortunately, we can still specify everything we need to know in this dict. + # We might not know the order of amino acids in our alphabet at the time of initialization + # (this happens the above gammas are Paths), so we'll have to build the electrostatic "gamma" + # later (see self.model_post_init) charge_dict : dict = Field({'A':0.0,'C':0.0,'D':-1.0,'E':-1.0, 'F':0.0,'G':0.0,'H':0.0,'I':0.0, 'K':1.0,'L':0.0,'M':0.0,'N':0.0,'P':0.0, @@ -66,6 +70,130 @@ class ParametersAWSEM(BaseModel): 'V':0.0,'W':0.0,'Y':0.0}, description='charge of each amino acid type that may be used') + def model_post_init(self, __context__): + """Pydantic v2 hook called after model initialization. + + The signature must accept a single positional argument named + ``__context__`` (per pydantic v2). If ``gamma`` was provided + as a path, convert it to a ``Gamma`` instance here. + """ + if isinstance(self.gamma, Path): + self.gamma = Gamma(self.gamma) + + # make properties that reshape gammas stored inside the Gamma object (self.gamma) + @property + def burial_gamma(self): + # check shape of gamma + gb = self.gamma['Burial'] + if gb.shape == (3,self.gamma.q): + return gb.T + elif gb.shape == (self.gamma.q,3): + return gb + else: + raise ValueError(f""" + Don't know how to parse burial gamma with shape {gb.shape}. 
+ Expected ({self.gamma.q},3) or (3,{self.gamma.q}).""") + @burial_gamma.setter + def burial_gamma(self, _): + raise AttributeError(""" + Modifying burial_gamma directly is not allowed. + Instead, modify the underlying Gamma object, accessible at self.gamma.""") + @property + def direct_gamma(self): + gd = np.squeeze(self.gamma['Direct']) # gammas commonly formatted as (1,q,q) + # check shape of gamma + if gd.shape != (self.gamma.q,self.gamma.q): + raise ValueError(f""" + Don't know how to parse direct gamma with shape {gd.shape}. + Expected ({self.gamma.q}, {self.gamma.q})""") + else: + return gd + @direct_gamma.setter + def direct_gamma(self, _): + raise AttributeError(""" + Modifying direct_gamma directly is not allowed. + Instead, modify the underlying Gamma object, accessible at self.gamma.""") + @property + def protein_gamma(self): + gp = np.squeeze(self.gamma['Protein']) # gammas commonly formatted as (1,q,q) + # check shape of gamma + if gp.shape != (self.gamma.q,self.gamma.q): + raise ValueError(f""" + Don't know how to parse protein gamma with shape {gp.shape}. + Expected ({self.gamma.q}, {self.gamma.q})""") + else: + return gp + @protein_gamma.setter + def protein_gamma(self, _): + raise AttributeError(""" + Modifying protein_gamma directly is not allowed. + Instead, modify the underlying Gamma object, accessible at self.gamma.""") + @property + def water_gamma(self): + gw = np.squeeze(self.gamma['Water']) # gammas commonly formatted as (1,q,q) + # check shape of gamma + if gw.shape != (self.gamma.q,self.gamma.q): + raise ValueError(f""" + Don't know how to parse water gamma with shape {gw.shape}. + Expected ({self.gamma.q}, {self.gamma.q})""") + else: + return gw + @direct_gamma.setter + def direct_gamma(self, _): + raise AttributeError(""" + Modifying direct_gamma directly is not allowed. 
+ Instead, modify the underlying Gamma object, accessible at self.gamma.""") + @property + def electrostatic_gamma(self): + charges = [] + for oneletter in self.gamma.alphabet: + if oneletter not in self.charge_dict.keys(): + raise ValueError(f""" + One letter code {oneletter} found in Gamma.alphabet + is not known to the electrostatic potential. + Provide your ParametersAWSEM object with a complete + charge_dict specifying the electric charge, in fundamental + units, of {oneletter} and all other amino acids + in your alphabet so that an electrostatic "gamma" array + of the same shape and amino acid order as your direct, + protein, and water gammas can be created.""") + else: + charges.append(self.charge_dict[oneletter]) + assert len(charges) == self.gamma.q + return np.outer(charges, charges) # our electrostatic "gamma" + @electrostatic_gamma.setter + def electrostatic_gamma(self, _): + raise AttributeError(""" + Electrostatic_gamma is a property computed from the + ParametersAWSEM.charge_dict and ParametersAWSEM.Gamma.alphabet. + To set the electrostatic gamma, change the charge_dict + of your parameters object.""") + @property + def electrostatics_gamma(self): + return self.electrostatic_gamma + @electrostatics_gamma.setter + def electrostatics_gamma(self, _): + self.electrostatic_gamma = _ + + # if we're going to make gamma arrays from self.gamma available as + # properties of this class, we should also make the alphabet and q from + # self.gamma available as a property of this class + @property + def alphabet(self): + return self.gamma.alphabet + @alphabet.setter + def alphabet(self, new_alphabet): + # we might need new_alphabet to be either a list or str, but not the other + self.gamma.reorder(alphabet=new_alphabet) + #raise AttributeError(""" + #Resetting the alphabet must be done using self.gamma.reorder() + #(self.gamma is an instance of the Gamma class). 
+ #Changes made to the underlying Gamma object will then + #propagate upward.""") + @property + def q(self): + return self.gamma.q + # gamma.q is itself a property that calculates seq len and returns it class _AWSEMBase(Frustratometer): """ @@ -111,6 +239,7 @@ def __init__(self, # which requires a lot of ram self.potts_option = potts_option + # check consistency of potts_option and expose_indicator_functions arguments if self.potts_option and not self.expose_indicator_functions: warnings.warn(f""" You requested storing the potts model as an object attribute by using potts_option=True @@ -123,221 +252,110 @@ def __init__(self, self.expose_indicator_functions = True # parse other arguments - p = ParametersAWSEM(**parameters) - if p.min_sequence_separation_contact is None: - p.min_sequence_separation_contact = 1 - if p.min_sequence_separation_rho is None: - p.min_sequence_separation_rho = 1 - if p.min_sequence_separation_electrostatics is None: - p.min_sequence_separation_electrostatics = 1 - # doing this arguably defeats the purpose of having an - # AWSEMHamilonianParams class; - # we should think about how we can clean up the namespace of this (_AWSEMBase) class - #for field, value in p: - # setattr(self, field, value) - self.p = p - - # set gamma - if isinstance(self.p.gamma, Gamma): - gamma = self.p.gamma - elif isinstance(self.p.gamma, Path): - gamma = Gamma(self.p.gamma) - self.p.gamma = gamma - else: - raise ValueError("Gamma parameter must be a path or a Gamma object.") - # burial gamma - self.q = len(gamma.alphabet) # most likely 20, but could be different - gb = gamma['Burial'] - if gb.shape == (3,self.q): - self._burial_gamma = gb.T - elif gb.shape == (self.q,3): - self._burial_gamma = gb - else: - raise ValueError(f"""Don't know how to parse burial gamma with shape {gb.shape}. 
- Expected ({self.q},3) or (3,{self.q}).""") - # pairwise gamma: squeeze to remove extra axis that is commonly present - self._direct_gamma = np.squeeze(gamma['Direct']) - self._protein_gamma = np.squeeze(gamma['Protein']) - self._water_gamma = np.squeeze(gamma['Water']) - if not self._direct_gamma.shape == self._protein_gamma.shape ==\ - self._water_gamma.shape == (self.q,self.q): - raise ValueError(f""" - direct and/or protein and/or water gammas had unexpected shape. - Expected (q,q) or (1,q,q). Got shapes of: - direct: {self._direct_gamma.shape} - protein: {self._protein_gamma.shape} - water: {self._direct_gamma.shape}""") - # electrostatic gamma - ordered_charges = np.zeros(self.q) - for counter in range(self.q): - try: - ordered_charges[counter] = self.p.charge_dict[self.p.gamma.alphabet[counter]] - except KeyError as e: - raise Exception(f""" - One-letter code {order[counter]} from alphabet {self.p.gamma.alphabet} - with unknown charge. If use of this noncanonical AA in intentional, - you must supply a custom charge_dict - so that we know how to calculate the electrostatic potential.""") - charges2 = ordered_charges[:,np.newaxis] * ordered_charges[np.newaxis,:] - if self.p.k_electrostatics != 0: - self._sequence_cutoff=min(self.p.min_sequence_separation_electrostatics, self.p.min_sequence_separation_contact) - self._distance_cutoff=None # the distance matrix isn't guaranteed to exist in all subclasses, - # but it doesn't hurt to define the distance_cutoff attribute-- - # it's just like any other parameter, such as sequence_cutoff, - # that only matters if we need to compute a mask from a distance matrix - else: - self._sequence_cutoff=self.p.min_sequence_separation_contact - self._distance_cutoff=self.p.distance_cutoff_contact # the distance matrix isn't guaranteed to exist in all subclasses, - # but it doesn't hurt to define the distance_cutoff attribute-- - # it's just like any other parameter, such as sequence_cutoff, - # that only matters if we need to 
compute a mask from a distance matrix - self._charges2 = charges2 - # helpful ? - self._gamma = self.p.gamma + self.p = ParametersAWSEM(**parameters) + + # i don't know why these aren't the defaults in ParametersAWSEM + # if we're going to override them anyway + if self.p.min_sequence_separation_contact is None: + self.p.min_sequence_separation_contact = 1 + if self.p.min_sequence_separation_rho is None: + self.p.min_sequence_separation_rho = 1 + if self.p.min_sequence_separation_electrostatics is None: + self.p.min_sequence_separation_electrostatics = 1 # set other attributes - self.burial_in_context = self.p.burial_in_context - self._aa_freq = frustration.compute_aa_freq(self.sequence, self.p.gamma.alphabet) - self._contact_freq = frustration.compute_contact_freq(self.sequence, self.p.gamma.alphabet) - self._decoy_fluctuation = {} # used for mutational calculation, possibly others + self.burial_in_context = self.p.burial_in_context #i'd prefer to move this out of ParametersAWSEM completely + self._decoy_fluctuation = {} # used for non-configurational frustration calculations self._minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ self._native_energy = None - self._seq_index = None - - ################################################################################## - # quantities previously defined as attributes that are calculated based on - # other attributes should be converted to properties - @property - def N(self): - return len(self.sequence) - @property - def electrostatics_gamma(self): # used to be distinct from charges2 but eliminating the distinction - return self.charges2 # makes these gammas more analogous to the other gammas + # although the alphabet is really an attribute of the AWSEM + # Hamiltonian, and therefore belongs in the ParametersAWSEM instance, + # at least one method in the Frustratometer class requires + # it to be accessible in this namespace, so we make it a property @property - def 
electrostatic_gamma(self): - return self.electrostatics_gamma - - @property def alphabet(self): - return self.p.gamma.alphabet # this allows us to access the alphabet in the same way as for DCA instances - @alphabet.setter # the user might think they can change the alphabet like the conformation (see AWSEM), but that's not supported - def alphabet(self): - raise AttributeError("Changing the underlying alphabet is prohibited. Instead, create a new AWSEM instance from a different Gamma.") - - @property # emphasizes that seq_index is computed from the alphabet - def seq_index(self): - if self._seq_index is None: # so we only have to compute it once - self._seq_index = np.array([self.alphabet.index(aa) for aa in self.sequence]) - return self._seq_index - - # carlos wanted to have gamma_array with gammas multiplied by lambda and coefficients - @property - def coefficient_lambda_gamma_array(self): - _coefficient_lambda_gamma_array = [] - _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.burial_gamma[:,0]) - _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.burial_gamma[:,1]) - _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.burial_gamma[:,2]) - _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.direct_gamma) - _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.protein_gamma) - _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.water_gamma) - _coefficient_lambda_gamma_array.append(0.5 * self.p.k_electrostatics * self.charges2) - # not a typo, supposed to be positive ^^^ - # charges2 is our electrostatic "gamma" - return _coefficient_lambda_gamma_array - @coefficient_lambda_gamma_array.setter # clarifies that this is derived from more fundamental quantities - def coefficient_lambda_gamma_array(self): - raise AttributeError(f"""Setting {self.__class__}.coefficient_lambda_gamma_array - directly is not allowed. 
Initialize a new instance with a different - {self.__class__}.p.k_contact, {self.__class__}.burial_gamma, {self.__class__}.direct_gamma, - {self.__class__}.p.gamma.protein_gamma, or {self.__class__}.water_gamma instead.""") - - # other properties for extra protection + return self.p.alphabet + @alphabet.setter + def alphabet(self, new_alphabet): + # we might need new_alphabet to be either a list or str, but not the other + self.p.alphabet = new_alphabet + + # we make these attributes into properties for protection; + # may write setters at some point to update everything appropriately + # and allow modification of initialized objects @property def sequence(self): return self._sequence @sequence.setter - def sequence(self): + def sequence(self, _): raise NotImplementedError("Modifying the sequence is not permitted. May add support at some point.") - - # making properties into setters - @property - def burial_gamma(self): - return self._burial_gamma - @burial_gamma.setter - def burial_gamma(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.burial_gamma") - - @property - def direct_gamma(self): - return self._direct_gamma - @direct_gamma.setter - def direct_gamma(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.direct_gamma") - @property - def protein_gamma(self): - return self._protein_gamma - @protein_gamma.setter - def protein_gamma(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.protein_gamma") - - @property - def water_gamma(self): - return self._water_gamma - @water_gamma.setter - def water_gamma(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.water_gamma") + def minimally_frustrated_threshold(self): + return self._minimally_frustrated_threshold + @minimally_frustrated_threshold.setter + def minimally_frustrated_threshold(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.minimally_frustrated_threshold. 
May add support in the future") + # these attributes are computed from other attributes, + # so we make them into properties @property def sequence_cutoff(self): - return self._sequence_cutoff - @sequence_cutoff.setter - def sequence_cutoff(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.sequence_cutoff") - + if self.p.k_electrostatics == 0: + return self.p.min_sequence_separation_contact + else: + return min(self.p.min_sequence_separation_contact, + self.p.min_sequence_separation_electrostatics) @property def distance_cutoff(self): - return self._distance_cutoff - @distance_cutoff.setter - def distance_cutoff(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.distance_cutoff") - - @property - def charges2(self): - return self._charges2 - @charges2.setter - def charges2(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.charges2") - - @property - def gamma(self): - return self._gamma - @gamma.setter - def gamma(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.gamma") - + # the distance cutoff might not exist in all subclasses, + # but they just won't use it. 
there's no harm in defining it + if self.p.k_electrostatics == 0: + return self.p.distance_cutoff_contact + else: + return None + @property + def N(self): + return len(self.sequence) + @property # emphasizes that seq_index is computed from the alphabet + def seq_index(self): + self._seq_index = np.array([self.p.alphabet.index(aa) for aa in self.sequence]) + return self._seq_index @property def aa_freq(self): - return self._aa_freq - @aa_freq.setter - def aa_freq(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.aa_freq") - + return frustration.compute_aa_freq(self.sequence, self.p.alphabet) @property def contact_freq(self): - return self._contact_freq - @contact_freq.setter - def contact_freq(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.contact_freq") - + return frustration.compute_contact_freq(self.sequence, self.p.alphabet) + + # this format is a little bit unusual but is useful for the optimization code @property - def minimally_frustrated_threshold(self): - return self._minimally_frustrated_threshold - @minimally_frustrated_threshold.setter - def minimally_frustrated_threshold(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.minimally_frustrated_threshold") + def coefficient_lambda_gamma_array(self): + _coefficient_lambda_gamma_array = [] + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.p.burial_gamma[:,0]) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.p.burial_gamma[:,1]) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.p.burial_gamma[:,2]) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.p.direct_gamma) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.p.protein_gamma) + _coefficient_lambda_gamma_array.append(-0.5 * self.p.k_contact * self.p.water_gamma) + _coefficient_lambda_gamma_array.append(0.5 * self.p.k_electrostatics * 
self.p.electrostatic_gamma) + # not a typo, supposed to be positive ^^^ + # charges2 is our electrostatic "gamma" + return _coefficient_lambda_gamma_array + @coefficient_lambda_gamma_array.setter # clarifies that this is derived from more fundamental quantities + def coefficient_lambda_gamma_array(self, _): + raise AttributeError(f"""Setting {self.__class__}.coefficient_lambda_gamma_array + directly is not allowed. Initialize a new instance with a different + {self.__class__}.p.k_contact, {self.__class__}.burial_gamma, {self.__class__}.direct_gamma, + {self.__class__}.p.gamma.protein_gamma, or {self.__class__}.water_gamma instead.""") + + + + + + + @property def sequence_mask_rho(self): return self._sequence_mask_rho @@ -449,31 +467,31 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): self.p.min_sequence_separation_rho, chain_starts, chain_ends, self.distance_matrix, - self.p.k_contact, self.burial_gamma) + self.p.k_contact, self.p.burial_gamma) self._potts_model['J'] = ham.compute_potts_model_J_parallel( self.p.electrostatics_screening_length, self.p.min_sequence_separation_rho, self.p.min_sequence_separation_contact, self.p.min_sequence_separation_electrostatics, chain_starts, chain_ends, contact_max_dist, 10*self.p.electrostatics_screening_length, # maximum distance for contact potential, maximum for electrostatics self.distance_matrix, - self.p.k_contact, self.direct_gamma, - self.p.k_contact, self.protein_gamma, - self.p.k_contact, self.water_gamma, - self.p.k_electrostatics, self.electrostatics_gamma) + self.p.k_contact, self.p.direct_gamma, + self.p.k_contact, self.p.protein_gamma, + self.p.k_contact, self.p.water_gamma, + self.p.k_electrostatics, self.p.electrostatic_gamma) #breakpoint() #self.potts_model['J'] = ham.compute_potts_model_J( # self.distance_matrix, ) - J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) - h_index = np.meshgrid(range(self.N), range(self.q), 
indexing='ij', sparse=False) + J_index = np.meshgrid(range(self.N), range(self.N), range(self.p.q), range(self.p.q), indexing='ij', sparse=False) + h_index = np.meshgrid(range(self.N), range(self.p.q), indexing='ij', sparse=False) # compute burial and contact energies - old_burial_energy = 0.5 * self.p.k_contact * self.burial_gamma[h_index[1]] * self.burial_indicator[:, np.newaxis, :] - direct = self.direct_indicator * self.direct_gamma[J_index[2], J_index[3]] - water_mediated = self.water_indicator * self.water_gamma[J_index[2], J_index[3]] - protein_mediated = self.protein_indicator * self.protein_gamma[J_index[2], J_index[3]] + old_burial_energy = 0.5 * self.p.k_contact * self.p.burial_gamma[h_index[1]] * self.burial_indicator[:, np.newaxis, :] + direct = self.direct_indicator * self.p.direct_gamma[J_index[2], J_index[3]] + water_mediated = self.water_indicator * self.p.water_gamma[J_index[2], J_index[3]] + protein_mediated = self.protein_indicator * self.p.protein_gamma[J_index[2], J_index[3]] contact_energy = self.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * self.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - electrostatics_energy = -self.p.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ + electrostatics_energy = -self.p.k_electrostatics * self.p.electrostatic_gamma[np.newaxis,np.newaxis,:,:] * self.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ * self.electrostatics_mask[:,:,np.newaxis,np.newaxis] contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) old_contact_energy = contact_energy @@ -482,6 +500,8 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): old_potts_model['h'] = old_burial_energy.sum(axis=-1)[:, :] old_potts_model['J'] = old_contact_energy.sum(axis=0)[:, :, :, :] diff_h = np.max(np.abs(old_potts_model['h'] - self.potts_model['h'])) + assert 
self.distance_matrix.shape == (self.N, self.N) + assert self.distance_matrix.shape == (len(self.sequence), len(self.sequence)) assert diff_h < 3E-4, diff_h diff_J = np.max(np.abs(old_potts_model['J'] - self.potts_model['J'])) assert diff_J < 3E-4, diff_J @@ -750,7 +770,7 @@ def masked_indicators(self): # implementations of frustration algorithms def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] - _AA = self.p.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' + _AA = self.p.alphabet #'ARNDCQEGHILKMFPSTWYV' if aa_freq is None: seq_index = self.seq_index N=self.N @@ -791,12 +811,12 @@ def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): q2=seq_index[qi2] - burial_energy1 = (-0.5 * self.p.k_contact * self.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) - burial_energy2 = (-0.5 * self.p.k_contact * self.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) + burial_energy1 = (-0.5 * self.p.k_contact * self.p.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) + burial_energy2 = (-0.5 * self.p.k_contact * self.p.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) - direct = theta[c] * self.direct_gamma[q1, q2] - water_mediated = sigma_water[n1,n2] * thetaII[c] * self.water_gamma[q1,q2] - protein_mediated = sigma_protein[n1,n2] * thetaII[c] * self.protein_gamma[q1,q2] + direct = theta[c] * self.p.direct_gamma[q1, q2] + water_mediated = sigma_water[n1,n2] * thetaII[c] * self.p.water_gamma[q1,q2] + protein_mediated = sigma_protein[n1,n2] * thetaII[c] * self.p.protein_gamma[q1,q2] contact_energy = -self.p.k_contact * (direct+water_mediated+protein_mediated) electrostatics_energy = self.p.k_electrostatics * electrostatics_indicator[c]*charges[q1]*charges[q2] @@ -808,7 +828,7 @@ def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): return mean_decoy_energy, std_decoy_energy def 
compute_configurational_energies(self): - _AA= self.p.gamma.alphabet #'ARNDCQEGHILKMFPSTWYV' + _AA= self.p.alphabet #'ARNDCQEGHILKMFPSTWYV' seq_index = self.seq_index distances = np.triu(self.distance_matrix) distances = distances[(distances0)] @@ -847,12 +867,12 @@ def compute_configurational_energies(self): q1=seq_index[n1] q2=seq_index[n2] - burial_energy1 = (-0.5 * self.p.k_contact * self.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) - burial_energy2 = (-0.5 * self.p.k_contact * self.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) + burial_energy1 = (-0.5 * self.p.k_contact * self.p.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) + burial_energy2 = (-0.5 * self.p.k_contact * self.p.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) - direct = theta[c] * self.direct_gamma[q1, q2] - water_mediated = sigma_water[n1,n2] * thetaII[c] * self.water_gamma[q1,q2] - protein_mediated = sigma_protein[n1,n2] * thetaII[c] * self.protein_gamma[q1,q2] + direct = theta[c] * self.p.direct_gamma[q1, q2] + water_mediated = sigma_water[n1,n2] * thetaII[c] * self.p.water_gamma[q1,q2] + protein_mediated = sigma_protein[n1,n2] * thetaII[c] * self.p.protein_gamma[q1,q2] contact_energy = -self.p.k_contact * (direct+water_mediated+protein_mediated) electrostatics_energy = self.p.k_electrostatics * electrostatics_indicator[c]*charges[q1]*charges[q2] @@ -938,13 +958,13 @@ def __init__(self, self.mask = frustration.compute_mask(np.zeros((self.N,self.N)), maximum_contact_distance=self.distance_cutoff, minimum_sequence_separation = self.sequence_cutoff) - if absolute_value_gamma: - self.burial_gamma = np.abs(self.burial_gamma) - self.direct_gamma = np.abs(self.direct_gamma) - self.protein_gamma = np.abs(self.protein_gamma) - self.water_gamma = np.abs(self.water_gamma) - self.electrostatics_gamma = np.abs(self.electrostatics_gamma) - self.absolute_value_gamma = absolute_value_gamma + #if absolute_value_gamma: + # self.burial_gamma = np.abs(self.burial_gamma) + # 
self.direct_gamma = np.abs(self.direct_gamma) + # self.protein_gamma = np.abs(self.protein_gamma) + # self.water_gamma = np.abs(self.water_gamma) + # self.electrostatics_gamma = np.abs(self.electrostatics_gamma) + #self.absolute_value_gamma = absolute_value_gamma #np.save('absolute_value_gamma_1.npy',absolute_value_gamma) #np.save('burial_indicator_1.npy',burial_indicator) #np.save('direct_indicator_1.npy', direct_indicator) @@ -1074,31 +1094,31 @@ def calculate_energy_and_potts(self): # compute burial and contact energies # the "energy" of our potts model representing the covariance, not a physical energy # this "burial energy" is the sum of variances of the burial indicators (the one-body part of the model) - self.burial_energy = (0.5*self.p.k_contact*self.burial_gamma[h_index[1]])**2 * self.burial_variances.reshape((self.N,1,3)) + self.burial_energy = (0.5*self.p.k_contact*self.p.burial_gamma[h_index[1]])**2 * self.burial_variances.reshape((self.N,1,3)) # the "contact energy" is ordinarily the sum of all two-body components of the model # (direct, protein, water, electrostatics), so we do the analogous thing here template = np.zeros((self.N,self.N)) num_upper = int((self.N**2-self.N)/2) triu_indices = np.triu_indices(self.N,k=1) template[triu_indices] = self.pairwise_variances[:num_upper] - direct = (template+template.T)[:,:,np.newaxis,np.newaxis] * self.direct_gamma[J_index[2], J_index[3]]**2 + direct = (template+template.T)[:,:,np.newaxis,np.newaxis] * self.p.direct_gamma[J_index[2], J_index[3]]**2 template[triu_indices] = self.pairwise_variances[num_upper:2*num_upper] - protein_mediated = (template+template.T)[:,:,np.newaxis,np.newaxis] * self.protein_gamma[J_index[2], J_index[3]]**2 + protein_mediated = (template+template.T)[:,:,np.newaxis,np.newaxis] * self.p.protein_gamma[J_index[2], J_index[3]]**2 template[triu_indices] = self.pairwise_variances[2*num_upper:3*num_upper] - water_mediated = (template+template.T)[:,:,np.newaxis,np.newaxis] * 
self.water_gamma[J_index[2], J_index[3]]**2 + water_mediated = (template+template.T)[:,:,np.newaxis,np.newaxis] * self.p.water_gamma[J_index[2], J_index[3]]**2 contact_energy = self.p.k_contact * np.array([direct, protein_mediated, water_mediated]) if self.p.k_electrostatics!=0: template[triu_indices] = self.pairwise_variances[3*num_upper:] - electrostatics_energy = -self.p.k_electrostatics * self.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * (template+template.T)[:,:,np.newaxis,np.newaxis]**2 + electrostatics_energy = -self.p.k_electrostatics * self.p.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * (template+template.T)[:,:,np.newaxis,np.newaxis]**2 contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) # for the variance potts model, there is one more kind of two-body interaction: # burial-pairwise covariance when the pairwise energy term involves the residue in the burial term # self.burial_pairwise_covariances_2 has shape (3N, 4(N^2-N)/2) # we first multiply each row by the appropriate burial energy temp = self.burial_pairwise_covariances_2 - low = temp[::3,:,np.newaxis]*0.5*self.p.k_contact*self.burial_gamma[h_index[1],0] - med = temp[1::3,:,:]*0.5*self.p.k_contact*self.burial_gamma[h_index[1],1] - high = temp[2::3,:,:]*0.5*self.p.k_contact*self.burial_gamma[h_index[1],2] + low = temp[::3,:,np.newaxis]*0.5*self.p.k_contact*self.p.burial_gamma[h_index[1],0] + med = temp[1::3,:,:]*0.5*self.p.k_contact*self.p.burial_gamma[h_index[1],1] + high = temp[2::3,:,:]*0.5*self.p.k_contact*self.p.burial_gamma[h_index[1],2] # we can now collapse our 3 burial indicator types temp = np.sum(np.concatenate((low[None,...], med[None,...], high[None,...]), axis=0), axis=0) assert temp.shape == (self.N, 4*((self.N**2-self.N)/2)), temp.shape @@ -1110,11 +1130,11 @@ def calculate_energy_and_potts(self): # but only N of them include the same residue from the burial indicator; # others have a value of 0, which we can easily eliminate) # we 
also need to multiply by our pairwise gammas - direct = direct[direct != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.direct_gamma[J_index[3]]*self.p.k_contact - prot = prot[prot != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.protein_gamma[J_index[3]]*self.p.k_contact - wat = wat[wat != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.water_gamma[J_index[3]]*self.p.k_contact + direct = direct[direct != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.p.direct_gamma[J_index[3]]*self.p.k_contact + prot = prot[prot != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.p.protein_gamma[J_index[3]]*self.p.k_contact + wat = wat[wat != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.p.water_gamma[J_index[3]]*self.p.k_contact ############################################################################################################################ - elec = elec[elec != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.electrostatics_gamma[np.newaxis,np.newaxis,:,:][J_index[3]]*self.p.k_contact + elec = elec[elec != 0].reshape((self.N,self.N,self.q))[...,np.newaxis]*self.p.electrostatics_gamma[np.newaxis,np.newaxis,:,:][J_index[3]]*self.p.k_contact # ???????????????? why are we multiplying electrostatics by k_contact? # electrostatics_gamma already had the electrostatics weight k_electrostatics multiplied in and k_electrostatics # isn't necessarily equal to k_contact. 
Anyway, i'm now going to factor k_electrostatics out of electrostatics_gamma @@ -1146,9 +1166,9 @@ def calculate_energy_and_potts(self): for counter,row in enumerate(self.burial_pairwise_covariances_2): direct_indicators = row[:len(row)//4] direct_energy = direct_indicators[direct_indicators>0].reshape((-1,1))\ - *0.5*self.p.k_contact*self.burial_gamma[:,counter%3]\ + *0.5*self.p.k_contact*self.p.burial_gamma[:,counter%3]\ *self.p.k_contact*self.direct_gamma[] - direct = row[row!=0] * 0.5*self.p.k_contact*self.burial_gamma[] + direct = row[row!=0] * 0.5*self.p.k_contact*self.p.burial_gamma[] template[counter] = row[row==1] # where the residue corresponding to the burial row is involved in the pairwise indicator burial_pairwise_2 = self.burial_pairwise_covariances_2[] """ diff --git a/frustratometer/classes/Gamma.py b/frustratometer/classes/Gamma.py index a31c361b..d7103f04 100644 --- a/frustratometer/classes/Gamma.py +++ b/frustratometer/classes/Gamma.py @@ -34,6 +34,10 @@ def __init__(self, data, segment_definition=None, description=None, alphabet=Non self._validate_segments() + @property + def q(self): + return len(self.alphabet) + def _init_from_array(self, gamma_array): self.gamma_array = gamma_array diff --git a/tests/test_awsem_frustratometer.py b/tests/test_awsem_frustratometer.py index 8798070c..f186ccd9 100644 --- a/tests/test_awsem_frustratometer.py +++ b/tests/test_awsem_frustratometer.py @@ -226,17 +226,17 @@ def test_numba_potts_construction(): # burial, direct, protein, water don't have a factor of -1 when the should, # and electrostatics has a factor of -1 when it shouldn't. # For some reason, this is just how we do the potts model. 
- J_index = np.meshgrid(range(model.N), range(model.N), range(model.q), range(model.q), indexing='ij', sparse=False) - h_index = np.meshgrid(range(model.N), range(model.q), indexing='ij', sparse=False) + J_index = np.meshgrid(range(model.N), range(model.N), range(model.p.q), range(model.p.q), indexing='ij', sparse=False) + h_index = np.meshgrid(range(model.N), range(model.p.q), indexing='ij', sparse=False) # compute burial and contact energies - old_burial_energy = 0.5 * model.p.k_contact * model.burial_gamma[h_index[1]] * model.burial_indicator[:, np.newaxis, :] - direct = model.direct_indicator * model.direct_gamma[J_index[2], J_index[3]] - water_mediated = model.water_indicator * model.water_gamma[J_index[2], J_index[3]] - protein_mediated = model.protein_indicator * model.protein_gamma[J_index[2], J_index[3]] + old_burial_energy = 0.5 * model.p.k_contact * model.p.burial_gamma[h_index[1]] * model.burial_indicator[:, np.newaxis, :] + direct = model.direct_indicator * model.p.direct_gamma[J_index[2], J_index[3]] + water_mediated = model.water_indicator * model.p.water_gamma[J_index[2], J_index[3]] + protein_mediated = model.protein_indicator * model.p.protein_gamma[J_index[2], J_index[3]] contact_energy = model.p.k_contact * np.array([direct, water_mediated, protein_mediated]) * model.sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] - electrostatics_energy = -model.p.k_electrostatics * model.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * model.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ + electrostatics_energy = -model.p.k_electrostatics * model.p.electrostatics_gamma[np.newaxis,np.newaxis,:,:] * model.electrostatics_indicator[:,:,np.newaxis,np.newaxis]\ * model.electrostatics_mask[:,:,np.newaxis,np.newaxis] contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) old_contact_energy = contact_energy @@ -259,17 +259,17 @@ def test_numba_potts_construction(): model.p.min_sequence_separation_rho, 
chain_starts, chain_ends, model.distance_matrix, - model.p.k_contact, model.burial_gamma) + model.p.k_contact, model.p.burial_gamma) new_potts_model['J'] = ham.compute_potts_model_J_parallel( model.p.electrostatics_screening_length, model.p.min_sequence_separation_rho, model.p.min_sequence_separation_contact, model.p.min_sequence_separation_electrostatics, chain_starts, chain_ends, contact_max_dist, 10*model.p.electrostatics_screening_length, # maximum distance for contact potential, maximum for electrostatics model.distance_matrix, - model.p.k_contact, model.direct_gamma, - model.p.k_contact, model.protein_gamma, - model.p.k_contact, model.water_gamma, - model.p.k_electrostatics, model.electrostatics_gamma) + model.p.k_contact, model.p.direct_gamma, + model.p.k_contact, model.p.protein_gamma, + model.p.k_contact, model.p.water_gamma, + model.p.k_electrostatics, model.p.electrostatics_gamma) #np.save('new_way_h.npy',new_potts_model['h']) #np.save('new_way_J.npy',new_potts_model['J']) assert np.max(np.abs(old_potts_model['h'] - new_potts_model['h'])) < 1E-5 # 10^-5 kJ/mol error is acceptable From 5d8a30a9354251378d67a8c8f6493c4fd0e8fb22 Mon Sep 17 00:00:00 2001 From: Finley Clark Date: Mon, 22 Dec 2025 13:34:15 -0600 Subject: [PATCH 76/76] saving changes --- frustratometer/classes/AWSEM.py | 67 +++-- frustratometer/numba_util/hamiltonian.py | 325 ++++++++++++++--------- 2 files changed, 247 insertions(+), 145 deletions(-) diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index a451f6c1..ea0f86ca 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -268,6 +268,7 @@ def __init__(self, self._decoy_fluctuation = {} # used for non-configurational frustration calculations self._minimally_frustrated_threshold=.78 # this should be a class variable or an argument to __init__ self._native_energy = None + self._potts_model = None # although the alphabet is really an attribute of the AWSEM # Hamiltonian, and 
therefore belongs in the ParametersAWSEM instance, @@ -327,6 +328,55 @@ def aa_freq(self): @property def contact_freq(self): return frustration.compute_contact_freq(self.sequence, self.p.alphabet) + @property + def potts_model(self): + if self._potts_model is None: + if self.potts_option: + #raise AssertionError(f""" + #The user requested potts model calculation but apparently + #it wasn't done upon initialization of {self.__class__}. + #This is likely an issue with the _AWSEMBase class.""") + # + # the user may have changed their mind after initializing + # the object (see the else block of this conditional), + # so the above assertion is inappropriate + self.calculate_energy_and_potts() + return self._potts_model + else: + warnings.warn(f""" + Attempting to access (N,N,q,q)-shaped numpy array of potts model, + but {self.__class__}.potts_option evaluated to False. + Will not return potts model. + To get the potts model, set self.potts_option=True and try again.""") + return None + else: + return self._potts_model + @potts_model.setter + def potts_model(self, value): + raise NotImplementedError(f"Cannot directly set {self.__class__}.potts_model") + + # this is like derived properties (see above), but certain functions + # require native_energy to be a callable method instead of a property + def native_energy(self): + if self._potts_model is not None: + energy = super().native_energy() # method to compute native energy given potts model + else: + l_D = float(self.p.electrostatics_screening_length) + min_seq_sep_rho = self.p.min_sequence_separation_rho + min_seq_sep_contact = self.p.min_sequence_separation_contact + min_seq_sep_electrostatic = self.p.min_sequence_separation_electrostatics + energy = ham.compute_potential_total( + l_D, min_seq_sep_rho, min_seq_sep_contact, min_seq_sep_electrostatic, + chain_starts, chain_ends, + dist_mat=self.distance_matrix, + lambda_direct=self.p.k_contact, direct_gamma=self.p.direct_gamma, + lambda_protein=self.p.k_contact, 
protein_gamma=self.p.protein_gamma, + lambda_water=self.p.k_contact, water_gamma=self.p.water_gamma, + lambda_burial=self.p.k_contact, burial_gamma=self.p.burial_gamma, + lambda_electrostatic=self.p.k_electrostatics, electrostatic_gamma=self.p.electrostatic_gamma, + seq_index=self.seq_index, parallel=True) # can set to False if having numba issues + #self._native_energy = energy # maybe _native_energy is needed for compatibility with certain things? + return energy # this format is a little bit unusual but is useful for the optimization code @property @@ -391,12 +441,6 @@ def selected_matrix(self): def selected_matrix(self, value): raise NotImplementedError(f"Cannot directly set {self.__class__}.selected_matrix") - @property - def potts_model(self): - return self._potts_model - @potts_model.setter - def potts_model(self, value): - raise NotImplementedError(f"Cannot directly set {self.__class__}.potts_model") ################################################################################## @@ -512,16 +556,7 @@ def calculate_energy_and_potts(self, chain_starts=None, chain_ends=None): If you want to get the energies for your own purposes, set self.potts_option=True and then call calculate_energy_and_potts.""") - # not really sure what the point of this is - def native_energy(self): - if self.potts_option: - if not hasattr(self, '_potts_model'): # create potts model if it doesn't already exist - self.calculate_energy_and_potts() - energy = super().native_energy() # method to compute native energy given potts model - else: - energy = 0 # fill in numba function here - #self._native_energy = energy # maybe _native_energy is needed for compatibility with certain things? 
- return energy + # methods to calculate different kinds of frustration def compute_configurational_decoy_statistics(self): diff --git a/frustratometer/numba_util/hamiltonian.py b/frustratometer/numba_util/hamiltonian.py index af65fd14..10fb58d2 100644 --- a/frustratometer/numba_util/hamiltonian.py +++ b/frustratometer/numba_util/hamiltonian.py @@ -401,9 +401,10 @@ def compute_electrostatic_indicator(l_D, dist_ij): # THESE FUNCTIONS **DON'T** CHECK MASK CONDITIONS! # # BURIAL POTENTIAL +""" @njit(signature_or_function=float64(float64[:], float64, float64[:])) def compute_burial_potential_i_from_indicator_gamma(burial_indicator, lambda_burial, gamma): - """ + """""" Compute the burial energy for residue i based on its local density. Note that this function computes and sums the 3 types of burial energies: low-density, medium-density, and high-density. @@ -416,7 +417,7 @@ def compute_burial_potential_i_from_indicator_gamma(burial_indicator, lambda_bur ------- burial_energy : float Total burial energy for residue i, sum across all three burial wells. - """ + """""" # Caution: the burial indicator functions range from 0 to 2, # not 0 to 1, like the other indicator functions. # This is why we have a coefficient of 0.5 in the energy expression. @@ -429,6 +430,7 @@ def compute_burial_potential_i_from_indicator_gamma(burial_indicator, lambda_bur burial_energy = -0.5*lambda_burial *\ (low_indicator*low_gamma+medium_indicator*medium_gamma+high_indicator*high_gamma) return burial_energy +""" @njit(signature_or_function=float64(float64, float64, float64[:])) def compute_burial_potential_i_from_rho_gamma(rho_i, lambda_burial, gamma): """ @@ -446,11 +448,20 @@ def compute_burial_potential_i_from_rho_gamma(rho_i, lambda_burial, gamma): Total burial energy for residue i, sum across all three burial wells. 
""" burial_indicator = compute_burial_indicator_i(rho_i) - burial_energy = compute_burial_potential_i_from_indicator_gamma(burial_indicator, lambda_burial, gamma) + #burial_energy = compute_burial_potential_i_from_indicator_gamma(burial_indicator, lambda_burial, gamma) + low_indicator = burial_indicator[0] + low_gamma = gamma[0] + medium_indicator = burial_indicator[1] + medium_gamma = gamma[1] + high_indicator = burial_indicator[2] + high_gamma = gamma[2] + burial_energy = -0.5*lambda_burial *\ + (low_indicator*low_gamma+medium_indicator*medium_gamma+high_indicator*high_gamma) return burial_energy +""" @njit(signature_or_function=float64(int64, float64, float64, float64[:,:], int64[:])) def compute_burial_potential_i_from_rho(i, rho_i, lambda_burial, burial_gamma, seq_index): - """ + """""" Compute the burial energy for residue i based on its local density. Note that this function computes and sums the 3 types of burial energies: low-density, medium-density, and high-density. @@ -463,10 +474,11 @@ def compute_burial_potential_i_from_rho(i, rho_i, lambda_burial, burial_gamma, s ------- burial_energy : float Total burial energy for residue i, sum across all three burial wells. 
- """ + """""" gamma = burial_gamma[seq_index[i]] burial_energy = compute_burial_potential_i_from_rho_gamma(rho_i, lambda_burial, gamma) return burial_energy +""" @njit(signature_or_function=float64(int64, int64, int64[:], int64[:], float64[:,:], float64, float64[:])) def compute_burial_potential_i_from_gamma(i, min_seq_sep_rho, chain_starts, chain_ends, dist_mat, lambda_burial, gamma): """ @@ -508,22 +520,22 @@ def compute_burial_potential_i(i, min_seq_sep_rho, chain_starts, chain_ends, dis # feel free to add more functions with different signatures for greater flexibility of use # # DIRECT POTENTIAL -@njit(signature_or_function=float64(float64, float64, float64)) -def compute_direct_potential_ij_from_thetaI_gamma(thetaI, lambda_direct, gamma): - """ - Compute the direct interaction potential for a pair of residues. - - Parameters - ---------- - See module-level docstring. - - Returns - ------- - direct_energy : float - Energy of the direct contact term for the pair (i,j), - set to 0 if the pair is masked. - """ - return -lambda_direct * thetaI * gamma +#@njit(signature_or_function=float64(float64, float64, float64)) +#def compute_direct_potential_ij_from_thetaI_gamma(thetaI, lambda_direct, gamma): +# """ +# Compute the direct interaction potential for a pair of residues. +# +# Parameters +# ---------- +# See module-level docstring. +# +# Returns +# ------- +# direct_energy : float +# Energy of the direct contact term for the pair (i,j), +# set to 0 if the pair is masked. 
+# """ +# return -lambda_direct * thetaI * gamma @njit(signature_or_function=float64(float64, float64, float64)) def compute_direct_potential_ij_from_distij_gamma(dist_ij, lambda_direct, gamma): """ @@ -542,7 +554,7 @@ def compute_direct_potential_ij_from_distij_gamma(dist_ij, lambda_direct, gamma) # get indicator thetaI = compute_thetaI(dist_ij) # put it all together - direct_energy = compute_direct_potential_ij_from_thetaI_gamma(thetaI, lambda_direct, gamma) + direct_energy = -lambda_direct * thetaI * gamma return direct_energy @njit(signature_or_function=float64(int64, int64, float64[:,:], float64, float64)) def compute_direct_potential_ij_from_gamma(i, j, dist_mat, lambda_direct, gamma): @@ -580,61 +592,62 @@ def compute_direct_potential_ij(i, j, dist_mat, lambda_direct, direct_gamma, seq return compute_direct_potential_ij_from_gamma(i, j, dist_mat, lambda_direct, gamma) # # LONG RANGE (protein-mediated and water-mediated) CONTACT POTENTIALS -@njit(signature_or_function=numba.types.UniTuple(float64,2)( - float64, float64, float64, float64, float64, float64)) -def compute_long_potentials_ij_from_sigmawater_thetaII_gamma(thetaII, sigma_water, - lambda_protein, gamma_p, lambda_water, gamma_w): - """ - Compute the protein-mediated and water-mediated (long-range) potentials - for a pair of residues. - - Parameters - ---------- - See module-level docstring. - - Returns - ------- - protein_energy : float - Energy of the protein-mediated contact term for the pair (i,j), - set to 0 if the pair is masked. - water_energy : float - Energy of the water-mediated contact term for the pair (i,j), - set to 0 if the pair is masked. - """ - # this function is defined so that we have the details of the - # calculation in one place and don't have to type the equation - # in several different places. 
probably not a big deal, - # but just trying to follow best practices - sigma_protein = 1.0 - sigma_water - protein_energy = -lambda_protein * thetaII * sigma_protein * gamma_p - water_energy = -lambda_water * thetaII * sigma_water * gamma_w - return protein_energy, water_energy -@njit(signature_or_function=numba.types.UniTuple(float64,2)(float64, float64, float64, float64, float64, float64)) -def compute_long_potentials_ij_from_sigmawater_distij_gamma(dist_ij, sigma_water, - lambda_protein, gamma_p, lambda_water, gamma_w): - """ - Compute the protein-mediated and water-mediated (long-range) potentials - for a pair of residues. - - Parameters - ---------- - See module-level docstring. - - Returns - ------- - protein_energy : float - Energy of the protein-mediated contact term for the pair (i,j), - set to 0 if the pair is masked. - water_energy : float - Energy of the water-mediated contact term for the pair (i,j), - set to 0 if the pair is masked. - """ - # get indicators and sigma values - thetaII = compute_thetaII(dist_ij) - # compute energies - protein_energy, water_energy = compute_long_potentials_ij_from_sigmawater_thetaII_gamma( - thetaII, sigma_water, lambda_protein, gamma_p, lambda_water, gamma_w) - return protein_energy, water_energy +#@njit(signature_or_function=numba.types.UniTuple(float64,2)( +# float64, float64, float64, float64, float64, float64)) +#def compute_long_potentials_ij_from_sigmawater_thetaII_gamma(thetaII, sigma_water, +# lambda_protein, gamma_p, lambda_water, gamma_w): +# """ +# Compute the protein-mediated and water-mediated (long-range) potentials +# for a pair of residues. +# +# Parameters +# ---------- +# See module-level docstring. +# +# Returns +# ------- +# protein_energy : float +# Energy of the protein-mediated contact term for the pair (i,j), +# set to 0 if the pair is masked. +# water_energy : float +# Energy of the water-mediated contact term for the pair (i,j), +# set to 0 if the pair is masked. 
+# """ +# # this function is defined so that we have the details of the +# # calculation in one place and don't have to type the equation +# # in several different places. probably not a big deal, +# # but just trying to follow best practices +# sigma_protein = 1.0 - sigma_water +# protein_energy = -lambda_protein * thetaII * sigma_protein * gamma_p +# water_energy = -lambda_water * thetaII * sigma_water * gamma_w +# return protein_energy, water_energy +#@njit(signature_or_function=numba.types.UniTuple(float64,2)(float64, float64, float64, float64, float64, float64)) +#def compute_long_potentials_ij_from_sigmawater_distij_gamma(dist_ij, sigma_water, +# lambda_protein, gamma_p, lambda_water, gamma_w): +# """ +# Compute the protein-mediated and water-mediated (long-range) potentials +# for a pair of residues. +# +# Parameters +# ---------- +# See module-level docstring. +# +# Returns +# ------- +# protein_energy : float +# Energy of the protein-mediated contact term for the pair (i,j), +# set to 0 if the pair is masked. +# water_energy : float +# Energy of the water-mediated contact term for the pair (i,j), +# set to 0 if the pair is masked. 
+# """ +# # get indicators and sigma values +# thetaII = compute_thetaII(dist_ij) +# # compute energies +# sigma_protein = 1.0 - sigma_water +# protein_energy = -lambda_protein * thetaII * sigma_protein * gamma_p +# water_energy = -lambda_water * thetaII * sigma_water * gamma_w +# return protein_energy, water_energy @njit(signature_or_function=numba.types.UniTuple(float64,2)(int64, int64, float64, float64, float64, float64[:,:], float64, float64[:,:], int64[:])) def compute_long_potentials_ij_from_sigmawater_distij(i, j, dist_ij, sigma_water, @@ -688,31 +701,31 @@ def compute_long_potentials_ij_from_sigmawater(i, j, dist_mat, sigma_water, protein_energy, water_energy = compute_long_potentials_ij_from_sigmawater_distij(i, j, dist_ij, sigma_water, lambda_protein, protein_gamma, lambda_water, water_gamma, seq_index) return protein_energy, water_energy -@njit(signature_or_function=numba.types.UniTuple(float64,2)( - float64, float64, float64, float64, float64, float64, float64)) -def compute_long_potentials_ij_from_rho_thetaII_gamma(rho_i, rho_j, thetaII, - lambda_protein, gamma_p, lambda_water, gamma_w): - """ - Compute the protein-mediated and water-mediated (long-range) potentials - for a pair of residues. - - Parameters - ---------- - See module-level docstring. - - Returns - ------- - protein_energy : float - Energy of the protein-mediated contact term for the pair (i,j), - set to 0 if the pair is masked. - water_energy : float - Energy of the water-mediated contact term for the pair (i,j), - set to 0 if the pair is masked. 
- """ - sigma_water = compute_sigma_water(rho_i, rho_j) - protein_energy, water_energy = compute_long_potentials_ij_from_sigmawater_thetaII_gamma( - thetaII, sigma_water, lambda_protein, gamma_p, lambda_water, gamma_w) - return protein_energy, water_energy +#@njit(signature_or_function=numba.types.UniTuple(float64,2)( +# float64, float64, float64, float64, float64, float64, float64)) +#def compute_long_potentials_ij_from_rho_thetaII_gamma(rho_i, rho_j, thetaII, +# lambda_protein, gamma_p, lambda_water, gamma_w): +# """ +# Compute the protein-mediated and water-mediated (long-range) potentials +# for a pair of residues. +# +# Parameters +# ---------- +# See module-level docstring. +# +# Returns +# ------- +# protein_energy : float +# Energy of the protein-mediated contact term for the pair (i,j), +# set to 0 if the pair is masked. +# water_energy : float +# Energy of the water-mediated contact term for the pair (i,j), +# set to 0 if the pair is masked. +# """ +# sigma_water = compute_sigma_water(rho_i, rho_j) +# protein_energy, water_energy = compute_long_potentials_ij_from_sigmawater_thetaII_gamma( +# thetaII, sigma_water, lambda_protein, gamma_p, lambda_water, gamma_w) +# return protein_energy, water_energy @njit(signature_or_function=numba.types.UniTuple(float64,2)( float64, float64, float64, float64, float64, float64, float64)) def compute_long_potentials_ij_from_rho_distij_gamma(dist_ij, rho_i, rho_j, @@ -736,8 +749,14 @@ def compute_long_potentials_ij_from_rho_distij_gamma(dist_ij, rho_i, rho_j, """ sigma_water = compute_sigma_water(rho_i, rho_j) #assert 0 < sigma_water < 1, f'rho_i: {repr(rho_i)}, rho_j: {repr(rho_j)}, sigma_water: {repr(sigma_water)}' - protein_energy, water_energy = compute_long_potentials_ij_from_sigmawater_distij_gamma( - dist_ij, sigma_water, lambda_protein, gamma_p, lambda_water, gamma_w) + #protein_energy, water_energy = compute_long_potentials_ij_from_sigmawater_distij_gamma( + # dist_ij, sigma_water, lambda_protein, gamma_p, 
lambda_water, gamma_w) + + thetaII = compute_thetaII(dist_ij) + # compute energies + sigma_protein = 1.0 - sigma_water + protein_energy = -lambda_protein * thetaII * sigma_protein * gamma_p + water_energy = -lambda_water * thetaII * sigma_water * gamma_w return protein_energy, water_energy @njit(signature_or_function=numba.types.UniTuple(float64,2)( int64, int64, int64, int64[:], int64[:], float64[:,:], float64, float64, float64, float64)) @@ -792,8 +811,28 @@ def compute_long_potentials_ij(i, j, min_seq_sep_rho, chain_starts, chain_ends, lambda_protein, lambda_water, gamma_p, gamma_w) # feel free to add more functions with different signatures for greater flexibility of use # -@njit(signature_or_function=float64(float64, float64, float64)) -def compute_electrostatic_potential_ij_from_indicator_gamma(electrostatic_indicator, lambda_electrostatic, gamma): +#@njit(signature_or_function=float64(float64, float64, float64)) +#def compute_electrostatic_potential_ij_from_indicator_gamma(electrostatic_indicator, lambda_electrostatic, gamma): +# """ +# Compute the solvation-averaged electrostatic potential +# for a pair of residues. +# +# Parameters +# ---------- +# See module-level docstring. +# +# Returns +# ------- +# electrostatic_energy : float +# Energy of the electrostatic interaction between residues i and j +# """ +# # gamma is negative if interaction is favorable and positive if +# # unfavorable, and our lambdas and indicators are all positive by convention, +# # so we don't precede this equation with a negative sign +# #return -lambda_electrostatic * electrostatic_indicator * gamma +# return lambda_electrostatic * electrostatic_indicator * gamma +@njit(signature_or_function=float64(float64, float64, float64, float64)) +def compute_electrostatic_potential_ij_from_distij_gamma(l_D, dist_ij, lambda_electrostatic, gamma): """ Compute the solvation-averaged electrostatic potential for a pair of residues. 
@@ -807,33 +846,20 @@ def compute_electrostatic_potential_ij_from_indicator_gamma(electrostatic_indica electrostatic_energy : float Energy of the electrostatic interaction between residues i and j """ + indicator = compute_electrostatic_indicator(l_D, dist_ij) + #electrostatic_energy = compute_electrostatic_potential_ij_from_indicator_gamma( + # indicator, lambda_electrostatic, gamma) + # gamma is negative if interaction is favorable and positive if # unfavorable, and our lambdas and indicators are all positive by convention, # so we don't precede this equation with a negative sign #return -lambda_electrostatic * electrostatic_indicator * gamma return lambda_electrostatic * electrostatic_indicator * gamma -@njit(signature_or_function=float64(float64, float64, float64, float64)) -def compute_electrostatic_potential_ij_from_distij_gamma(l_D, dist_ij, lambda_electrostatic, gamma): - """ - Compute the solvation-averaged electrostatic potential - for a pair of residues. - - Parameters - ---------- - See module-level docstring. - - Returns - ------- - electrostatic_energy : float - Energy of the electrostatic interaction between residues i and j - """ - indicator = compute_electrostatic_indicator(l_D, dist_ij) - electrostatic_energy = compute_electrostatic_potential_ij_from_indicator_gamma( - indicator, lambda_electrostatic, gamma) return electrostatic_energy +""" @njit(signature_or_function=float64(int64, int64, float64, float64[:,:], float64, float64)) def compute_electrostatic_potential_ij_from_gamma(i, j, l_D, dist_mat, lambda_electrostatic, gamma): - """ + """""" Compute the solvation-averaged electrostatic potential for a pair of residues. 
@@ -845,10 +871,11 @@ def compute_electrostatic_potential_ij_from_gamma(i, j, l_D, dist_mat, lambda_el ------- electrostatic_energy : float Energy of the electrostatic interaction between residues i and j - """ + """""" dist_ij = dist_mat[i,j] electrostatic_energy = compute_electrostatic_potential_ij_from_distij_gamma(l_D, dist_ij, lambda_electrostatic, gamma) return electrostatic_energy +""" @njit(signature_or_function=float64(int64, int64, float64, float64[:,:], float64, float64[:,:], int64[:])) def compute_electrostatic_potential_ij(i, j, l_D, dist_mat, lambda_electrostatic, electrostatic_gamma, seq_index): """ @@ -1131,6 +1158,7 @@ def compute_potential_total(l_D, min_seq_sep_rho, min_seq_sep_contact, min_seq_s # PAIR ENERGY: burial(i)+burial(j)+direct(i,j)+protein(i,j)+water(i,j)+electrostatic(i,j) # important: total energy is NOT sum over all pairs ij of pair_energy(i,j) # these functions DO NOT check mask conditions +""" @njit(signature_or_function=float64( float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, @@ -1150,6 +1178,7 @@ def compute_pair_energy_ij_useful( pair_energy = burial_energy_i + burial_energy_j + direct_energy +\ protein_energy + water_energy + electrostatic_energy return pair_energy +""" @njit(signature_or_function=float64( int64, int64, float64, int64, int64[:], int64[:], float64[:,:], @@ -1186,6 +1215,7 @@ def compute_pair_energy_ij_from_gamma( protein_energy + water_energy + electrostatic_energy return pair_energy # +""" @njit(signature_or_function=float64( int64, int64, float64, float64[:,:], @@ -1227,7 +1257,9 @@ def compute_pair_energy_ij_from_rho_sigmawater( pair_energy = burial_energy_i + burial_energy_j + direct_energy +\ protein_energy + water_energy + electrostatic_energy return pair_energy +""" # +""" @njit(signature_or_function=float64(int64, int64, float64, float64[:,:], float64, float64, @@ -1246,7 +1278,7 @@ def compute_pair_energy_ij_from_rho(i, j, l_D, lambda_burial, 
burial_gamma, lambda_electrostatic, electrostatic_gamma, seq_index): - """ + """""" Compute the "pair energy" for residues i and j, defined as the sum of: - Direct contact energy - Protein-mediated contact energy @@ -1265,7 +1297,7 @@ def compute_pair_energy_ij_from_rho(i, j, l_D, ------- pair_energy : float Total "pair energy" of residues i and j - """ + """""" sigma_water = compute_sigma_water(rho_i, rho_j) pair_energy = compute_pair_energy_ij_from_rho_sigmawater( i, j, l_D, @@ -1278,6 +1310,7 @@ def compute_pair_energy_ij_from_rho(i, j, l_D, lambda_electrostatic, electrostatic_gamma, seq_index) return pair_energy +""" # @njit(signature_or_function=float64(int64, int64, float64, int64, int64[:], int64[:], float64[:,:], @@ -1384,6 +1417,34 @@ def compute_potts_model_h( return h compute_potts_model_h_parallel = njit(signature_or_function=signature, parallel=True)(compute_potts_model_h) compute_potts_model_h = njit(signature_or_function=signature)(compute_potts_model_h) + + +def potts_model_functions( l_D, min_seq_sep_rho, min_seq_sep_contact, min_seq_sep_electrostatic, + chain_starts, chain_ends, max_dist_contact, max_dist_electrostatic): + + signature = float64[:,:,:,:]( + float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:], + float64, float64[:,:]) + def compute_potts_model_J(dist_mat, lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_electrostatic, electrostatic_gamma): + return J + return compute_potts_model_J + +compute_potts_model_J_function = potts_model_functions(l_D, min_seq_sep_rho, min_seq_sep_contact, min_seq_sep_electrostatic, + chain_starts, chain_ends, max_dist_contact, max_dist_electrostatic) + +compute_potts_model_J_function(dist_mat, lambda_direct, direct_gamma, + lambda_protein, protein_gamma, + lambda_water, water_gamma, + lambda_electrostatic, electrostatic_gamma) + + + # signature = float64[:,:,:,:]( float64, int64, int64, int64, @@ -1517,4 +1578,10 @@ def 
compute_pair_energy_matrix(l_D, return pair_energy_matrix compute_pair_energy_matrix_parallel = njit(signature_or_function=signature, parallel=True)(compute_pair_energy_matrix) compute_pair_energy_matrix = njit(signature_or_function=signature)(compute_pair_energy_matrix) -""" \ No newline at end of file +""" + +#def get_args_for_numba(self, target): +# name_dict = {'l_D' : self.electrostatics_screening_length, +# # etc. } +# needed_args = inspect.get_args(target) +# return name_dict[needed_args] \ No newline at end of file