innate2adaptive · mvcowley · Mar 20, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 __pycache__/
+.coverage
 decombinator.egg-info
 .pytest_cache
 .decombinatorenv

diff --git a/src/decombinator/collapse.py b/src/decombinator/collapse.py
@@ -351,11 +351,11 @@ def check_umi_quality(qualstring: tuple[str], parameters: list[int]) -> bool:
     return number_below_min > parameters[1] or average_quality < parameters[2]
 
 
-def are_seqs_equivalent(seq1, seq2, lev_percent_threshold):
+def are_seqs_equivalent(seq1: str, seq2: str, lev_threshold_fraction: float):
     # Returns True if seqs can be considered the same, False otherwise
     # Definition of equivalent:
-    #   levenshtein distance as a percentage of the shorter of the two seqs is <= threshold
-    threshold = len(min(seq1, seq2, key=len)) * lev_percent_threshold
+    #   levenshtein distance as a fraction of the shorter of the two seqs is <= threshold
+    threshold = len(min(seq1, seq2, key=len)) * lev_threshold_fraction
     return polyleven.levenshtein(seq1, seq2) <= threshold
 
 
@@ -482,7 +482,7 @@ def read_in_data(
     data,
     inputargs,
     barcode_quality_parameters,
-    lev_threshold,
+    lev_threshold_fraction,
     dont_count,
     opener,
 ):
@@ -566,15 +566,17 @@ def read_in_data(
         if not inputargs["sampling_analysis"]:
             dcretc = "|".join([str(dcr), seq, seq_qualstring, seq_id])
         else:
-            pre_collapse_barcode = line[8]
+            full_barcode_region = line[8]
             v_tail = line[10]
             dcretc = "|".join(
                 [
                     str(dcr),
                     seq,
                     seq_qualstring,
                     seq_id,
-                    pre_collapse_barcode,
+                    barcode,
+                    barcode_qualstring,
+                    full_barcode_region,
                     v_tail,
                 ]
             )
@@ -590,54 +592,80 @@ def read_in_data(
 
         if barcode in barcode_lookup:
 
-            for index in barcode_lookup[barcode]:
-                if are_seqs_equivalent(index[1], seq, lev_threshold):
+            for barcode_index, barcode_protoseq in barcode_lookup[barcode]:
+                if are_seqs_equivalent(
+                    barcode_protoseq, seq, lev_threshold_fraction
+                ):
                     barcode_dcretc[
-                        barcode + "|" + str(index[0]) + "|" + index[1]
+                        barcode
+                        + "|"
+                        + str(barcode_index)
+                        + "|"
+                        + barcode_protoseq
                     ].append(dcretc)
                     protodcretc_list = barcode_dcretc[
-                        barcode + "|" + str(index[0]) + "|" + index[1]
+                        barcode
+                        + "|"
+                        + str(barcode_index)
+                        + "|"
+                        + barcode_protoseq
                     ]
                     seq_counter = coll.Counter(
                         map(lambda x: x.split("|")[1], protodcretc_list)
                     )
-                    protoseq = seq_counter.most_common(1)[0][
+                    new_protoseq = seq_counter.most_common(1)[0][
                         0
                     ]  # find most common sequence in group
 
-                    if not index[1] == protoseq:
+                    if barcode_protoseq != new_protoseq:
                         # if there is a new protoseq, replace record with old protoseq
                         # with identical record with updated  protoseq
                         barcode_dcretc[
-                            barcode + "|" + str(index[0]) + "|" + protoseq
+                            barcode
+                            + "|"
+                            + str(barcode_index)
+                            + "|"
+                            + new_protoseq
                         ] = barcode_dcretc[
-                            barcode + "|" + str(index[0]) + "|" + index[1]
+                            barcode
+                            + "|"
+                            + str(barcode_index)
+                            + "|"
+                            + new_protoseq
                         ]
                         del barcode_dcretc[
-                            barcode + "|" + str(index[0]) + "|" + index[1]
+                            barcode
+                            + "|"
+                            + str(barcode_index)
+                            + "|"
+                            + new_protoseq
                         ]
 
-                        barcode_lookup[barcode][index[0]] = [
-                            index[0],
-                            protoseq,
+                        barcode_lookup[barcode][barcode_index] = [
+                            barcode_index,
+                            new_protoseq,
                         ]
 
-                group_assigned = True
-                # if assigned to a group, stop and move onto next read
-                break
+                    group_assigned = True
+                    # if assigned to a group, stop and move onto next read
+                    break
 
             if not group_assigned:
                 # if no appropriate group found, create new group with correctly incremented index
-                barcode_lookup[barcode].append([index[0] + 1, seq])
+                new_index = len(barcode_lookup[barcode])
+                barcode_lookup[barcode].append([new_index, seq])
                 barcode_dcretc[
-                    "|".join([barcode, str(index[0] + 1), seq])
+                    "|".join([barcode, str(new_index), seq])
                 ].append(dcretc)
                 group_assigned = True
 
         else:
             # if no identical barcode found, create new barcode group with index zero
-            barcode_lookup[barcode].append([0, seq])
-            barcode_dcretc["|".join([barcode, "0", seq])].append(dcretc)
+            first_index = 0
+            barcode_lookup[barcode].append([first_index, seq])
+            barcode_dcretc["|".join([barcode, str(first_index), seq])].append(
+                dcretc
+            )
             group_assigned = True
 
     counts["readdata_barcode_dcretc_keys"] = len(barcode_dcretc.keys())
@@ -712,7 +740,7 @@ def make_merge_groups(
 def make_clusters(
     merge_groups: sparse.coo_matrix,
     barcode_dcretc_list: list[tuple[str, list[str]]],
-    seq_threshold: int,
+    lev_threshold_fraction: float,
 ) -> coll.defaultdict[str, list[str]]:
     # Considers clusters as an undirected graph composed of disconnected subgraphs.
     # The nodes of the graph are the initial groups of barcode/protosequences. Edges between nodes
@@ -728,15 +756,13 @@ def make_clusters(
     # initialise empty graph
     G = nx.Graph()
 
-    percent_seq_threshold = seq_threshold / 100.0
-
     for i, j in zip(merge_groups.row, merge_groups.col):
         protoseqs = [
             barcode_dcretc_list[i][0].split("|")[2],
             barcode_dcretc_list[j][0].split("|")[2],
         ]
         if are_seqs_equivalent(
-            protoseqs[0], protoseqs[1], percent_seq_threshold
+            protoseqs[0], protoseqs[1], lev_threshold_fraction
         ):
             G.add_edge(i, j)
             n_merged_UMIs += 1
@@ -769,33 +795,48 @@ def make_clusters(
     return clusters
 
 
-def write_clusters(clusters):
-    # create directory to store cluster data without overwriting exiting directories
-    dirname = "clusters"
+def write_clusters(
+    clusters: coll.defaultdict[str, list[str]],
+    inputargs: dict[str, typing.Union[str, bool, int]],
+) -> None:
+    # create file to store cluster data without overwriting exiting files
+    chain: str = inputargs["chain"]
+    filename = "clusters_" + chain
+    ftype = ".psv.gz"
     count = 1
-    while os.path.isdir(dirname):
-        dirname = "clusters" + str(count)
+    while os.path.isfile(filename + ftype):
+        filename = filename + str(count)
         count += 1
-    os.mkdir(dirname)
 
+    filename = filename + ftype
     print(
-        "   Writing clusters to directory: ", os.path.abspath(dirname), "..."
+        "   Writing clusters to directory: ", os.path.abspath(filename), "..."
     )
-    # write data of each cluster to a separate file and store in clusters directory
-    for k in clusters:
-        with open(
-            dirname + os.sep + "|".join(k.split("|")[:2]) + ".txt", "w"
-        ) as ofile:
-            for j in clusters[k]:
-                print(j, file=ofile)
-    return 1
+    header_names = [
+        "umi_id",
+        "dcr",
+        "inter_tag",
+        "inter_tag_qual",
+        "read_id",
+        "umi",
+        "umi_qual",
+        "full_oligo",
+        "v_tail",
+    ]
+    # write data of each cluster to file
+    with gzip.open(filename, "wt") as cluster_file:
+        cluster_file.write("|".join(header_names) + "\n")
+        for cluster_id, dcretc_list in clusters.items():
+            cluster_name = ":".join(cluster_id.split("|")[:2])
+            for dcretc in dcretc_list:
+                cluster_file.write(cluster_name + "|" + dcretc + "\n")
 
 
 def cluster_UMIs(
     barcode_dcretc: coll.defaultdict[str, list[str]],
     inputargs: dict[str, typing.Union[str, bool, int]],
     barcode_threshold: int,
-    seq_threshold: int,
+    lev_threshold_fraction: float,
     dont_count: bool,
 ) -> coll.defaultdict[str, list[str]]:
     # input data of form: {'barcode1|index|protoseq': [dcretc1, dcretc2,...], 'barcode2|index|protoseq|: [dcretc1, dcretc2,...], ...}
@@ -818,7 +859,9 @@ def cluster_UMIs(
     )
 
     print("  ", "comparing TCR sequences of similar UMIs...")
-    clusters = make_clusters(matches, barcode_dcretc_list, seq_threshold)
+    clusters = make_clusters(
+        matches, barcode_dcretc_list, lev_threshold_fraction
+    )
 
     print(
         "  ",
@@ -832,7 +875,7 @@ def cluster_UMIs(
 
     # dump clusters to separate files if desired
     if inputargs["writeclusters"]:
-        write_clusters(clusters)
+        write_clusters(clusters, inputargs)
 
     return clusters
 
@@ -841,7 +884,7 @@ def collapsinate(
     data,
     inputargs,
     barcode_quality_parameters,
-    lev_threshold,
+    lev_threshold_fraction,
     barcode_distance_threshold,
     outpath,
     file_id,
@@ -854,7 +897,7 @@ def collapsinate(
         data,
         inputargs,
         barcode_quality_parameters,
-        lev_threshold,
+        lev_threshold_fraction,
         dont_count,
         opener,
     )
@@ -864,7 +907,7 @@ def collapsinate(
         barcode_dcretc,
         inputargs,
         barcode_distance_threshold,
-        lev_threshold,
+        lev_threshold_fraction,
         dont_count,
     )
 
@@ -942,8 +985,8 @@ def collapsinator(inputargs: dict, data: list = None) -> list:
         inputargs["avgQthreshold"],
     ]
 
-    ## this is the percentage lev distance that is allowed to determine whether two sequences are equivalent
-    lev_threshold = inputargs["percentlevdist"]
+    ## this is the fractional lev distance that is allowed to determine whether two sequences are equivalent
+    lev_threshold_fraction = inputargs["percentlevdist"] / 100
 
     ## this is the number of barcode edits that are allowed to call two barcodes equivalent
     barcode_distance_threshold = inputargs["bcthreshold"]
@@ -963,7 +1006,7 @@ def collapsinator(inputargs: dict, data: list = None) -> list:
         data,
         inputargs,
         barcode_quality_parameters,
-        lev_threshold,
+        lev_threshold_fraction,
         barcode_distance_threshold,
         outpath,
         file_id,

diff --git a/tests/resources/dcr_TINY_1_beta.freq b/tests/resources/dcr_TINY_1_beta.freq
@@ -1,5 +1,6 @@
-15, 10, 4, 1, CTACCCCCGCGGGGAC, 2, 3
+15, 10, 4, 1, CTACCCCCGCGGGGAC, 2, 2
 15, 10, 4, 1, CTACCCCCGCAAAGAC, 1, 1
+15, 10, 3, 1, CGGGGACCTACCCCC, 1, 1
 43, 0, 5, 6, GGAGGGACAG, 1, 2
 15, 6, 3, 9, CCTAGCGGAATACTCCTACAC, 1, 1
 9, 6, 4, 0, CTCACGGGGGGTT, 1, 1