-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathsubsampling.py
More file actions
100 lines (86 loc) · 4.25 KB
/
subsampling.py
File metadata and controls
100 lines (86 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!usr/bin/env python
import numpy as np
import os
import hicstraw
import pandas as pd
import glob
import argparse
# Using the argparse library to take in user inputs and making it user friendly
parser = argparse.ArgumentParser()
parser.add_argument("-hic", "--hicfile", dest="hic", help="input .hic file\n\n", metavar="HiC File", required=True)
parser.add_argument("-res", dest="res", help="input the resolutions to process the HiC file, can be comma spearated list\n\n", metavar="LIST", required=True)
parser.add_argument("-ratio", dest="ratio", help="input the ratio for subsampling\n\n", metavar="FLOAT", default="1.0", required=True)
parser.add_argument("-o", dest="output", help="path to output files\n\n", metavar="OUTPUT PATH", required=True)
parser.add_argument("-juicer", dest="juicer", help="path to juicer tools\n\n", metavar="JUICER_TOOLS PATH", required=True)
parser.add_argument("-sizes", dest="sizes", help="chromosome size file with two columns corresponding to chromosome and size respectively\n\n", metavar="SIZE FILE")
parser.add_argument("-cpu", dest="cpu", help="number of Gigs of RAM to use; default is 4GB\n\n", metavar="INT", default=4)
args = parser.parse_args()
# Creating an ouput directory
if not os.path.exists(args.output):
os.mkdir(args.output)
# List of chromosomes
chrom = [str(i) for i in range(1,22)]
chrom += ["X", "Y"]
# Calculating the highest resolution from the set of resolutions entered by the user
inputres = args.res
myres = inputres.split(",")
myres = list(map(int, myres))
highres = min(myres)
# Parsing through each chromosome from the chromosome list
for curchrom in chrom:
# Using the straw library to dump the intra chromosomal raw reads from HiC file
rowidxs = []
colidxs = []
scores = []
dumped = hicstraw.straw("observed","NONE", args.hic, curchrom, curchrom, "BP", highres)
for i in range(0,len(dumped)):
rowidxs.append(dumped[i].binX)
colidxs.append(dumped[i].binY)
scores.append(dumped[i].counts)
#rowidxs = np.array(dumped[:].binX)
#colidxs = np.array(dumped[:].binY)
#scores = np.array(dumped[:].counts)
#rowidxs = np.array(dumped[0][:])
#colidxs = np.array(dumped[1][:])
#scores = np.array(dumped[2][:])
length = len(scores)
str1 = list(); frag1=list(); frag2=list(); chr_ls = list()
# Subsampling the HiC file using the random.random_sample function
# Adding values to the str1, frag1, frag2, chr_ls lists to create short score fomrat file
for idx in range(length):
rand_reads = np.random.random_sample(int(scores[idx]))
read_count = len(np.where(rand_reads <= float(args.ratio))[0])
scores[idx] = read_count
str1.append(0)
frag1.append(1)
frag2.append(2)
chr_ls.append(curchrom)
# Creating a dataframe of the subsampled HiC file in short score format with values
data = {"rowidxs":rowidxs, "colidxs":colidxs, "scores":scores}
df = pd.DataFrame(data)
# inserting the columns in the dataframe in such way to represent the short score with values format
df.insert(0, "str1", str1, True)
df.insert(1, "chr1", chr_ls, True)
df.insert(3, "frag1", frag1, True)
df.insert(4, "str2", str1, True)
df.insert(5, "chr2", chr_ls, True)
df.insert(7, "frag2", frag2, True)
output_path = os.path.join(args.output, curchrom+".ssfv")
# creating a space separated values files which represents the short score format
df.to_csv(output_path, sep=" ", header=False, index=False)
# Merging all the short score format files of each chromosome to a single file
read_files = glob.glob(os.path.join(args.output,"*.ssfv"))
output_path = os.path.join(args.output, "subsampled")
with open(output_path, "wb") as outfile:
for f in read_files:
with open(f, "rb") as infile:
outfile.write(infile.read())
# Specifying the path for creation of the subsampled HiC file
hicoutpath = output_path + ".hic"
del_files = os.path.join(args.output, "*.ssfv")
my_cmd = "rm " + del_files
os.system(my_cmd)
cpu = "-Xmx"+str(args.cpu)+"g"
# Using juicer tools to create the HiC map with the arguments specified by the user
mycmd = 'java ' + cpu + ' -jar ' + args.juicer + ' pre ' + '-r ' + str(args.res) + ' ' + output_path + ' ' + hicoutpath + ' ' + args.sizes
os.system(mycmd)