-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathgenerate_data.py
More file actions
154 lines (121 loc) · 4.61 KB
/
generate_data.py
File metadata and controls
154 lines (121 loc) · 4.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
'''
generate_data.py
Updated: 3/29/18
This script is used to generate pairwise distance matrices used for
convolutional neural network training. The script will store representations
in npz files within a /pairwise_data/ subdirectory. This script is used specifically to
generate data used for CASP experiments.
'''
import os
import numpy as np
from mpi4py import MPI
from scipy.ndimage.filters import gaussian_filter
from scipy.spatial.distance import pdist
from itertools import combinations
# Data generation parameters
data_folder = '../../../data/T0/'  # Path to data folder
pairwise_distance_bins = list(range(0, 50, 5))  # Histogram bin edges: 0, 5, ..., 45

################################################################################

# Static Parameters
chain = 'A'  # Chain Id might need to be changed for PDBs missing identifier
seed = 458762  # For random distribution of tasks using MPI

# Three-letter residue names; their order fixes the row/column order of the
# binned pairwise grid.
residues = [
    'ALA', 'ARG', 'ASN', 'ASP',
    'ASX', 'CYS', 'GLN', 'GLU',
    'GLX', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE',
    'PRO', 'SER', 'THR', 'TRP',
    'TYR', 'UNK', 'VAL',
]
def parse_pdb(path, chain):
    '''
    Method parses atomic coordinate data from PDB.

    Only ATOM records whose chain identifier (column 22) equals `chain` are
    read. Parsing stops at the first TER record that follows at least one
    matching ATOM record.

    Params:
        path - str; PDB file path
        chain - str; chain identifier

    Returns:
        data - np.array; 1D object array with one entry per residue, in file
               order. Each entry is a list of atom records of the form
               [residue name, atom name, x, y, z], where every field is the
               string sliced from the fixed-width PDB columns (coordinates
               are NOT converted to float here).

    '''
    # Parse residue, atom type and atomic coordinates
    per_residue = []
    current_key = None
    chain_seen = False

    with open(path, 'r') as f:
        for row in f:
            # Length guard keeps truncated ATOM lines from raising IndexError.
            if row[:4] == 'ATOM' and len(row) > 21 and row[21] == chain:
                chain_seen = True

                # A residue is identified by its name plus sequence number and
                # insertion code (columns 23-27). Comparing names only would
                # merge neighbouring residues of the same type.
                key = (row[17:20], row[22:27])
                if key != current_key:
                    per_residue.append([])
                    current_key = key

                # Atoms are appended straight to the newest residue, so the
                # final residue of the chain is kept as well.
                # z occupies columns 47-54, i.e. row[46:54].
                per_residue[-1].append([row[17:20], row[12:16].strip(),
                                        row[30:38], row[38:46], row[46:54]])

            if row[:3] == 'TER' and chain_seen:
                break

    # Residues have different atom counts, so fill an object array explicitly;
    # np.array() on ragged nested lists raises on recent NumPy releases.
    data = np.empty(len(per_residue), dtype=object)
    for i, atoms in enumerate(per_residue):
        data[i] = atoms

    return data
def bin_pairwise_distances(protein_data, pairwise_distance_bins, residue_names=None):
    '''
    Method bins pairwise distances of residue alpha carbons into 2D data grids.

    For every unordered pair of residues the distance between their 'CA' atoms
    is computed. Each pair is filed under [name of the residue that comes first
    in protein_data][name of the one that comes later], so the grid is not
    symmetric. Each cell holds a histogram of those distances, smoothed with a
    Gaussian filter (sigma 0.5).

    Params:
        protein_data - np.array; per-residue lists of atom records
                       [residue name, atom name, x, y, z] as returned by
                       parse_pdb
        pairwise_distance_bins - list; list of bins used to bin pairwise distances
        residue_names - list or None; residue names defining the grid rows and
                        columns. Defaults to the module-level `residues` list.

    Returns:
        binned_pairwise - np.array; float array of shape
                          (len(residue_names), len(residue_names),
                          len(pairwise_distance_bins) - 1)

    Raises:
        IndexError - if a residue has no 'CA' atom record.

    '''
    if residue_names is None:
        residue_names = residues

    # Get alpha carbons
    alpha_carbons = []
    for atoms in protein_data:
        residue = np.array(atoms)
        ca_rows = residue[np.where(residue[:, 1] == 'CA')]
        alpha_carbons.append(ca_rows[0])
    alpha_carbons = np.array(alpha_carbons)

    # Pairwise distances. Coordinates arrive as fixed-width strings, so convert
    # them explicitly instead of relying on pdist to coerce a string array.
    coords = alpha_carbons[:, 2:].astype(float)
    dist = pdist(coords)

    # combinations() yields pairs in the same order pdist emits distances.
    # dtype=str keeps the comparison below well-defined with zero pairs.
    labels = np.array([a + b for a, b in combinations(alpha_carbons[:, 0], 2)],
                      dtype=str)

    # Bin pairwise distances
    bin_x = []
    for r1 in residue_names:
        bin_y = []
        for r2 in residue_names:
            selected = dist[np.where(labels == r1 + r2)]
            hist, _ = np.histogram(selected, bins=pairwise_distance_bins)
            # gaussian_filter returns the input dtype; on the raw integer
            # counts the smoothed values would be truncated back to integers
            # (a single count becomes 0), so smooth a float copy.
            bin_y.append(gaussian_filter(hist.astype(float), 0.5))
        bin_x.append(bin_y)
    binned_pairwise = np.array(bin_x)

    return binned_pairwise
if __name__ == '__main__':
    # Script entry point: rank 0 lists the PDB files under data_folder/pdbs,
    # the list is broadcast to every MPI rank, and each rank writes one .npz
    # of binned pairwise distances per file in its share.

    # Set paths relative to this file
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # MPI init
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cores = comm.Get_size()

    # Fall back to chain 'A' when no chain id is configured. (The original
    # line used '==' here, which compared instead of assigning.)
    if chain is None:
        chain = 'A'

    # MPI task distribution
    if rank == 0:
        tasks = []
        os.makedirs(data_folder + 'pairwise_data', exist_ok=True)

        # Search for data directories
        for data_path in sorted(os.listdir(data_folder + 'pdbs')):
            if data_path.endswith('.pdb'):
                tasks.append(data_folder + 'pdbs/' + data_path)

        # Shuffle for random distribution
        np.random.seed(seed)
        np.random.shuffle(tasks)
    else:
        tasks = None

    # Broadcast tasks to all nodes and select tasks according to rank
    tasks = comm.bcast(tasks, root=0)
    tasks = np.array_split(tasks, cores)[rank]

    for t in tasks:
        path = str(t)
        parts = path.split('/')
        # <data_folder>/pdbs/<name>.pdb -> <data_folder>/pairwise_data/<name>.npz
        save_path = '/'.join(parts[:-2]) + '/pairwise_data/' + parts[-1][:-3] + 'npz'

        try:
            # Parse PDB (inside the try so one unreadable file does not stop
            # this rank from processing the rest of its tasks)
            protein_data = parse_pdb(path, chain)

            # Bin pairwise distances
            binned_pairwise_distances = bin_pairwise_distances(protein_data, pairwise_distance_bins)

            # Save data
            np.savez(save_path, binned_pairwise_distances)
            print("Generated:", '/'.join(save_path.split('/')[-3:]))
        except Exception as err:
            # Catch Exception rather than everything so Ctrl-C / SystemExit
            # still stop the run; report which file failed and why.
            print("Error generating data...", path, repr(err))

    print("Data Generation Complete.")