Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
__pycache__/
.coverage
decombinator.egg-info
.pytest_cache
.decombinatorenv
Expand Down
149 changes: 96 additions & 53 deletions src/decombinator/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,11 +351,11 @@ def check_umi_quality(qualstring: tuple[str], parameters: list[int]) -> bool:
return number_below_min > parameters[1] or average_quality < parameters[2]


def are_seqs_equivalent(seq1, seq2, lev_percent_threshold):
def are_seqs_equivalent(seq1: str, seq2: str, lev_threshold_fraction: float):
# Returns True if seqs can be considered the same, False otherwise
# Definition of equivalent:
# levenshtein distance as a percentage of the shorter of the two seqs is <= threshold
threshold = len(min(seq1, seq2, key=len)) * lev_percent_threshold
# levenshtein distance as a fraction of the shorter of the two seqs is <= threshold
threshold = len(min(seq1, seq2, key=len)) * lev_threshold_fraction
return polyleven.levenshtein(seq1, seq2) <= threshold


Expand Down Expand Up @@ -482,7 +482,7 @@ def read_in_data(
data,
inputargs,
barcode_quality_parameters,
lev_threshold,
lev_threshold_fraction,
dont_count,
opener,
):
Expand Down Expand Up @@ -566,15 +566,17 @@ def read_in_data(
if not inputargs["sampling_analysis"]:
dcretc = "|".join([str(dcr), seq, seq_qualstring, seq_id])
else:
pre_collapse_barcode = line[8]
full_barcode_region = line[8]
v_tail = line[10]
dcretc = "|".join(
[
str(dcr),
seq,
seq_qualstring,
seq_id,
pre_collapse_barcode,
barcode,
barcode_qualstring,
full_barcode_region,
v_tail,
]
)
Expand All @@ -590,54 +592,80 @@ def read_in_data(

if barcode in barcode_lookup:

for index in barcode_lookup[barcode]:
if are_seqs_equivalent(index[1], seq, lev_threshold):
for barcode_index, barcode_protoseq in barcode_lookup[barcode]:
if are_seqs_equivalent(
barcode_protoseq, seq, lev_threshold_fraction
):
barcode_dcretc[
barcode + "|" + str(index[0]) + "|" + index[1]
barcode
+ "|"
+ str(barcode_index)
+ "|"
+ barcode_protoseq
].append(dcretc)
protodcretc_list = barcode_dcretc[
barcode + "|" + str(index[0]) + "|" + index[1]
barcode
+ "|"
+ str(barcode_index)
+ "|"
+ barcode_protoseq
]
seq_counter = coll.Counter(
map(lambda x: x.split("|")[1], protodcretc_list)
)
protoseq = seq_counter.most_common(1)[0][
new_protoseq = seq_counter.most_common(1)[0][
0
] # find most common sequence in group

if not index[1] == protoseq:
if barcode_protoseq != new_protoseq:
# if there is a new protoseq, replace record with old protoseq
# with identical record with updated protoseq
barcode_dcretc[
barcode + "|" + str(index[0]) + "|" + protoseq
barcode
+ "|"
+ str(barcode_index)
+ "|"
+ new_protoseq
] = barcode_dcretc[
barcode + "|" + str(index[0]) + "|" + index[1]
barcode
+ "|"
+ str(barcode_index)
+ "|"
+ new_protoseq
]
del barcode_dcretc[
barcode + "|" + str(index[0]) + "|" + index[1]
barcode
+ "|"
+ str(barcode_index)
+ "|"
+ new_protoseq
]

barcode_lookup[barcode][index[0]] = [
index[0],
protoseq,
barcode_lookup[barcode][barcode_index] = [
barcode_index,
new_protoseq,
]

group_assigned = True
# if assigned to a group, stop and move onto next read
break
group_assigned = True
# if assigned to a group, stop and move onto next read
break

if not group_assigned:
# if no appropriate group found, create new group with correctly incremented index
barcode_lookup[barcode].append([index[0] + 1, seq])
new_index = len(barcode_lookup[barcode])
barcode_lookup[barcode].append([new_index, seq])
barcode_dcretc[
"|".join([barcode, str(index[0] + 1), seq])
"|".join([barcode, str(new_index), seq])
].append(dcretc)
group_assigned = True

else:
# if no identical barcode found, create new barcode group with index zero
barcode_lookup[barcode].append([0, seq])
barcode_dcretc["|".join([barcode, "0", seq])].append(dcretc)
first_index = 0
barcode_lookup[barcode].append([first_index, seq])
barcode_dcretc["|".join([barcode, str(first_index), seq])].append(
dcretc
)
group_assigned = True

counts["readdata_barcode_dcretc_keys"] = len(barcode_dcretc.keys())
Expand Down Expand Up @@ -712,7 +740,7 @@ def make_merge_groups(
def make_clusters(
merge_groups: sparse.coo_matrix,
barcode_dcretc_list: list[tuple[str, list[str]]],
seq_threshold: int,
lev_threshold_fraction: float,
) -> coll.defaultdict[str, list[str]]:
# Considers clusters as an undirected graph composed of disconnected subgraphs.
# The nodes of the graph are the initial groups of barcode/protosequences. Edges between nodes
Expand All @@ -728,15 +756,13 @@ def make_clusters(
# initialise empty graph
G = nx.Graph()

percent_seq_threshold = seq_threshold / 100.0

for i, j in zip(merge_groups.row, merge_groups.col):
protoseqs = [
barcode_dcretc_list[i][0].split("|")[2],
barcode_dcretc_list[j][0].split("|")[2],
]
if are_seqs_equivalent(
protoseqs[0], protoseqs[1], percent_seq_threshold
protoseqs[0], protoseqs[1], lev_threshold_fraction
):
G.add_edge(i, j)
n_merged_UMIs += 1
Expand Down Expand Up @@ -769,33 +795,48 @@ def make_clusters(
return clusters


def write_clusters(clusters):
# create directory to store cluster data without overwriting exiting directories
dirname = "clusters"
def write_clusters(
clusters: coll.defaultdict[str, list[str]],
inputargs: dict[str, typing.Union[str, bool, int]],
) -> None:
# create file to store cluster data without overwriting exiting files
chain: str = inputargs["chain"]
filename = "clusters_" + chain
ftype = ".psv.gz"
count = 1
while os.path.isdir(dirname):
dirname = "clusters" + str(count)
while os.path.isfile(filename + ftype):
filename = filename + str(count)
count += 1
os.mkdir(dirname)

filename = filename + ftype
print(
" Writing clusters to directory: ", os.path.abspath(dirname), "..."
" Writing clusters to directory: ", os.path.abspath(filename), "..."
)
# write data of each cluster to a separate file and store in clusters directory
for k in clusters:
with open(
dirname + os.sep + "|".join(k.split("|")[:2]) + ".txt", "w"
) as ofile:
for j in clusters[k]:
print(j, file=ofile)
return 1
header_names = [
"umi_id",
"dcr",
"inter_tag",
"inter_tag_qual",
"read_id",
"umi",
"umi_qual",
"full_oligo",
"v_tail",
]
# write data of each cluster to file
with gzip.open(filename, "wt") as cluster_file:
cluster_file.write("|".join(header_names) + "\n")
for cluster_id, dcretc_list in clusters.items():
cluster_name = ":".join(cluster_id.split("|")[:2])
for dcretc in dcretc_list:
cluster_file.write(cluster_name + "|" + dcretc + "\n")


def cluster_UMIs(
barcode_dcretc: coll.defaultdict[str, list[str]],
inputargs: dict[str, typing.Union[str, bool, int]],
barcode_threshold: int,
seq_threshold: int,
lev_threshold_fraction: float,
dont_count: bool,
) -> coll.defaultdict[str, list[str]]:
# input data of form: {'barcode1|index|protoseq': [dcretc1, dcretc2,...], 'barcode2|index|protoseq|: [dcretc1, dcretc2,...], ...}
Expand All @@ -818,7 +859,9 @@ def cluster_UMIs(
)

print(" ", "comparing TCR sequences of similar UMIs...")
clusters = make_clusters(matches, barcode_dcretc_list, seq_threshold)
clusters = make_clusters(
matches, barcode_dcretc_list, lev_threshold_fraction
)

print(
" ",
Expand All @@ -832,7 +875,7 @@ def cluster_UMIs(

# dump clusters to separate files if desired
if inputargs["writeclusters"]:
write_clusters(clusters)
write_clusters(clusters, inputargs)

return clusters

Expand All @@ -841,7 +884,7 @@ def collapsinate(
data,
inputargs,
barcode_quality_parameters,
lev_threshold,
lev_threshold_fraction,
barcode_distance_threshold,
outpath,
file_id,
Expand All @@ -854,7 +897,7 @@ def collapsinate(
data,
inputargs,
barcode_quality_parameters,
lev_threshold,
lev_threshold_fraction,
dont_count,
opener,
)
Expand All @@ -864,7 +907,7 @@ def collapsinate(
barcode_dcretc,
inputargs,
barcode_distance_threshold,
lev_threshold,
lev_threshold_fraction,
dont_count,
)

Expand Down Expand Up @@ -942,8 +985,8 @@ def collapsinator(inputargs: dict, data: list = None) -> list:
inputargs["avgQthreshold"],
]

## this is the percentage lev distance that is allowed to determine whether two sequences are equivalent
lev_threshold = inputargs["percentlevdist"]
## this is the fractional lev distance that is allowed to determine whether two sequences are equivalent
lev_threshold_fraction = inputargs["percentlevdist"] / 100

## this is the number of barcode edits that are allowed to call two barcodes equivalent
barcode_distance_threshold = inputargs["bcthreshold"]
Expand All @@ -963,7 +1006,7 @@ def collapsinator(inputargs: dict, data: list = None) -> list:
data,
inputargs,
barcode_quality_parameters,
lev_threshold,
lev_threshold_fraction,
barcode_distance_threshold,
outpath,
file_id,
Expand Down
3 changes: 2 additions & 1 deletion tests/resources/dcr_TINY_1_beta.freq
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
15, 10, 4, 1, CTACCCCCGCGGGGAC, 2, 3
15, 10, 4, 1, CTACCCCCGCGGGGAC, 2, 2
15, 10, 4, 1, CTACCCCCGCAAAGAC, 1, 1
15, 10, 3, 1, CGGGGACCTACCCCC, 1, 1
43, 0, 5, 6, GGAGGGACAG, 1, 2
15, 6, 3, 9, CCTAGCGGAATACTCCTACAC, 1, 1
9, 6, 4, 0, CTCACGGGGGGTT, 1, 1
Expand Down
Loading