-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathprepareMicroarrayProbes.py
More file actions
140 lines (123 loc) · 4.73 KB
/
prepareMicroarrayProbes.py
File metadata and controls
140 lines (123 loc) · 4.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import subprocess
import sys
usage = """
python prepareMicroarrayProbes.py MODE ...
there are four modes:
BUILD:
builds a bowtie index and requires two additional arguments:
(i) the cDNA sequence fasta file
(ii) the bowtie index name/path
example:
python prepareMicroarrayProbes.py BUILD myCDNA.fasta myBowtieIndex
TABTOFASTA:
reformats a table of probe names and sequences into a fasta file.
Requires five additional arguments:
(i) file with the tabular probe name and sequence data
(ii) tells if there is a header in the table (1 for yes, 0 for no)
(iii) column number (starting with 1) holding the probe names
(iv) column number (starting with 1) holding the sequences
(v) fasta file where the sequences will be written
ALIGN:
aligns the probes to the cDNA and requires four additional arguments:
(i) the bowtie index name/path (see BUILD above)
(ii) a fasta file with the probe sequences
(iii) a file where the unaligned sequences will be stored
(iv) a file where the aligned sequences will be stored
example:
python prepareMicroarrayProbes.py BUILD myCDNA.fasta myBowtieIndex
EXTRACT:
extracts the mappings from probe names to locus IDs and vice versa.
Requires three additional arguments:
(i) file with the aligned sequences (see ALIGN above)
(ii) file with the probe name to locus ID mappings
(iii) file with the locus ID to probe name mappings
"""
if len(sys.argv) < 2:
sys.exit(usage)
mode = sys.argv[1]
# build bowtie index
def buildIndex(fastaFile, indexPath):
command = "bowtie-build -f -q -o 0 %s %s" % (fastaFile, indexPath)
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(stdout_data, stderr_data) = process.communicate()
if stderr_data is not None:
print >> sys.stderr, "errors raised by bowtie-build: " + stderr_data
if stdout_data is not None:
print >> sys.stdout, "bowtie-build message: " + stdout_data
print >> sys.stderr, "build bowtie index"
# convert the sequence table with probeName, sequence, spot_id to a fasta file
def convertSeqTabToFasta(seqTab, hasHeader, probeColumn, sequenceColumn, fastaFile):
with open(seqTab, 'r') as infile, open(fastaFile, 'w') as outfile:
if hasHeader:
header = infile.readline()
print >> sys.stderr, "removed this header:", header[:-1]
for line in infile:
fields = line[:-1].split('\t')
pn = fields[probeColumn]
sequence = fields[sequenceColumn]
print >> outfile, '>'+pn
print >> outfile, sequence
print >> sys.stderr, "converted sequence table to fasta"
# align sequences
def alignSequences(bowtieIndex, sequenceFile, unalignedFile, alignedFile):
command = "bowtie -v 3 -m 10 -a --best --strata -p 4 -t %s -f %s --un %s %s" % (bowtieIndex, sequenceFile, unalignedFile, alignedFile)
# ask the user if he wants to run the command
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(stdout_data, stderr_data) = process.communicate()
if stderr_data is not None:
print >> sys.stderr, "errors raised by bowtie: " + stderr_data
if stdout_data is not None:
print >> sys.stdout, "bowtie message: " + stdout_data
print >> sys.stderr, "aligned sequences"
# get mapping tables
def getMapping(alignedFile, PNtoLOCUSfile, LOCUStoPNfile):
LOCUStoPN = {}
PNtoLOCUS = {}
with open(alignedFile, 'r') as infile:
for line in infile:
pn, strand, modelID, pos, sequence, quality, mm, conversions = line[:-1].split('\t')
geneID = modelID.split('.')[0]
try:
if pn not in LOCUStoPN[geneID]:
LOCUStoPN[geneID].append(pn)
except KeyError:
LOCUStoPN[geneID] = [pn]
try:
if geneID not in PNtoLOCUS[pn]:
PNtoLOCUS[pn].append(geneID)
except KeyError:
PNtoLOCUS[pn] = [geneID]
print >> sys.stderr, "extracted mappings"
with open(PNtoLOCUSfile, 'w') as outfile:
for pn, locusList in PNtoLOCUS.items():
print >> outfile, pn + '\t' + ';'.join(locusList)
with open(LOCUStoPNfile, 'w') as outfile:
for locus, pnList in LOCUStoPN.items():
print >> outfile, locus + '\t' + ';'.join(pnList)
#
mode = sys.argv[1]
if mode == "BUILD":
fastaFile = sys.argv[2]
indexPath = sys.argv[3]
buildIndex(fastaFile, indexPath)
elif mode == "TABTOFASTA":
seqTab = sys.argv[2]
hasHeader = bool(int(sys.argv[3]))
probeColumn = int(sys.argv[4])-1
sequenceColumn = int(sys.argv[5])-1
fastaFile = sys.argv[6]
convertSeqTabToFasta(seqTab, hasHeader, probeColumn, sequenceColumn, fastaFile)
elif mode == "ALIGN":
bowtieIndex = sys.argv[2]
sequenceFile = sys.argv[3]
unalignedFile = sys.argv[4]
alignedFile = sys.argv[5]
alignSequences(bowtieIndex, sequenceFile, unalignedFile, alignedFile)
elif mode == "EXTRACT":
alignedFile = sys.argv[2]
PNtoLOCUSfile = sys.argv[3]
LOCUStoPNfile = sys.argv[4]
getMapping(alignedFile, PNtoLOCUSfile, LOCUStoPNfile)
else:
print >> sys.stderr, "unkown mode"
sys.exit(usage)