-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDNAToolkit.py
More file actions
103 lines (92 loc) · 3.36 KB
/
Copy pathDNAToolkit.py
File metadata and controls
103 lines (92 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import collections
from sequences import *
#function for validating if DNA or not
def validateSeq(dna_seq):
tmpseq = dna_seq.upper()
for nuc in tmpseq:
if nuc not in Nucleotides:
return False
return tmpseq
#function for counting DNA seq length
def countNucFrequency(seq):
tmpFreqDict = {"A":0, "C":0, "T":0, "G":0}
for nuc in seq:
tmpFreqDict[nuc] += 1
return tmpFreqDict
#return dict(collections.Counter(seq))
#DNA -> RNA
def transcription(seq):
"""DNA -> RNA Transcription. Replacing Thymine(T) with Uracil(U)"""
return seq.replace("T", "U")
#reverse complimentation:
def reverse_complement(seq):
"""Reverse Complement converts a DNA sequence into its reverse"""
return ''.join([DNA_ReverseComplement[nuc] for nuc in seq])[::-1]
def gc_content(seq):
""" Calculating GC Content in DNA/RNA sequence"""
return round((seq.count('C') + seq.count('G')) / len(seq) * 100, 6)
def gc_content_subsec(seq, k=20):
"""GC content in DNA/RNA sub-sequence length k"""
res = []
for i in range(0, len(seq) -k + 1, k):
subseq = seq[i:i +k]
res.append(gc_content(subseq))
return res
def translate_seq(seq, init_pos=0):
"""Translates a DNA sequence into an Aminoacid sequence"""
return [DNA_Codons[seq[pos:pos +3]] for pos in range(init_pos, len(seq) - 2, 3)]
def codon_usage(seq, aminoacid):
"""provides the frequency of each codon encoding a given aminoacid in a DNA sequnce"""
tmpList = []
for i in range(0, len(seq) - 2, 3):
if DNA_Codons[seq[i:i +3]] == aminoacid:
tmpList.append(seq[i:i + 3])
freqDict = dict(Counter(tmpList))
totalWeight = sum(freqDict.values())
for seq in freqDict:
freqDict[seq] = round(freqDict[seq] / totalWeight, 2)
return freqDict
def gen_reading_frames(seq):
"""Generate the six reading frames of a DNA sequence"""
frames = []
frames.append(translate_seq(seq, 0))
frames.append(translate_seq(seq, 1))
frames.append(translate_seq(seq, 2))
frames.append(translate_seq(reverse_complement(seq), 0))
frames.append(translate_seq(reverse_complement(seq), 1))
frames.append(translate_seq(reverse_complement(seq), 2))
return frames
def protein_from_rf(aa_seq):
"""Compute all possible proteins in an Aminoacid sequence and return a list of possible proteins"""
current_prot = []
proteins = []
for aa in aa_seq:
if aa == '_':
#STOP accumulating aminoacids if _ -STOP was found
if current_prot:
for p in current_prot:
proteins.append(p)
current_prot = []
else:
#START accumulating aminoacids if M -START was found
if aa == 'M':
current_prot.append("")
for i in range(len(current_prot)):
current_prot[i] += aa
return proteins
#Generate RFs
#Extract all proteins
#Return a list of proteins (un)sorted
def all_proteins_from_orfs(seq, startReadPos=0, endReadPos=0, ordered=False):
if endReadPos > startReadPos:
rfs = gen_reading_frames(seq[startReadPos:endReadPos])
else:
rfs = gen_reading_frames(seq)
res = []
for rf in rfs:
prots = protein_from_rf(rf)
for p in prots:
res.append(p)
if ordered:
return sorted(res, key=len, reverse=True)
return res