From 004ef499302bb1f60eab9a095af067e816d62433 Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Mon, 18 May 2026 11:34:33 +0100 Subject: [PATCH 1/5] feat(collapse): drop UMIs with multiple TCRs --- src/decombinator/collapse.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/decombinator/collapse.py b/src/decombinator/collapse.py index 9be7f01..7aa85c3 100644 --- a/src/decombinator/collapse.py +++ b/src/decombinator/collapse.py @@ -513,6 +513,7 @@ def read_in_data( t0 = time.time() barcode_dcretc = coll.defaultdict(list) barcode_lookup = coll.defaultdict(list) + barcode_multi_tcr = set() input_dcr_counts = coll.Counter() ratio = 1 @@ -590,6 +591,11 @@ def read_in_data( # where index counts upwards from zero to help disinguish identical barcodes in different groups, # protoseq is the most common sequence present in the group, and dcretc are the input reads + if barcode in barcode_multi_tcr: + # if barcode already marked as multi-tcr barcode, drop read + counts["number_multi_tcr_barcode_reads"] += 1 + continue + if barcode in barcode_lookup: for barcode_index, barcode_protoseq in barcode_lookup[barcode]: @@ -631,14 +637,14 @@ def read_in_data( + "|" + str(barcode_index) + "|" - + new_protoseq + + barcode_protoseq ] del barcode_dcretc[ barcode + "|" + str(barcode_index) + "|" - + new_protoseq + + barcode_protoseq ] barcode_lookup[barcode][barcode_index] = [ @@ -651,13 +657,20 @@ def read_in_data( break if not group_assigned: - # if no appropriate group found, create new group with correctly incremented index - new_index = len(barcode_lookup[barcode]) - barcode_lookup[barcode].append([new_index, seq]) - barcode_dcretc[ - "|".join([barcode, str(new_index), seq]) - ].append(dcretc) - group_assigned = True + # If barcode already assigned to non-equivalent TCR, register barcode as multi- + # tcr UMI and remove from analysis + counts["number_multi_tcr_barcode_reads"] += 1 + if barcode in barcode_multi_tcr: + raise ValueError( + "Barcode cannot be added twice to multi_tcr list" + ) + barcode_multi_tcr.add(barcode) + del barcode_lookup[barcode] + del barcode_dcretc[ + barcode + "|" + str(barcode_index) + "|" + barcode_protoseq + ] + counts["multi_tcr_barcodes"] += 1 + continue else: # if no identical barcode found, create new barcode group with index zero From b2bc48e0bf59633781236a4bc83e8869db3da0c4 Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Mon, 18 May 2026 11:35:12 +0100 Subject: [PATCH 2/5] test(collapse): update read in func tests --- tests/test_collapse.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/test_collapse.py b/tests/test_collapse.py index 5f66b10..07f994d 100644 --- a/tests/test_collapse.py +++ b/tests/test_collapse.py @@ -289,15 +289,7 @@ def test_barcode_collision( dont_count=False, opener=open, ) - assert barcode_dcretc == { - "CACCCGCTGACT|0|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGCCCCCAGGGGGCTCCAGCCCCAGCATTTTGGTGATGGGACTCGACTC": [ - "['15', '4', '1', '7', 'CCCCCAGGGGGCTC']|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGCCCCCAGGGGGCTCCAGCCCCAGCATTTTGGTGATGGGACTCGACTC|IIIIIIIIIIIIIII-II-IIIIIIIIIIIIIIIIIIIIIIIIIIIII-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII|LH00409:259:22JJCFLT4:8:1149:38410:17375", - "['15', '4', '1', '7', 'CCCCCAGGGGGCTG']|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGCCCCCAGGGGGCTGCAGCCCCAGCATTTTGGTGATGGGACTCGACTC|IIIIIIIIIIIIIII-II-IIIIIIIIIIIIIIIIIIIIIIIIIIIII-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII|LH00409:259:22JJCFLT4:8:1149:38410:17375", - ], - "CACCCGCTGACT|1|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGAAAAAAAAAAAAAACAGCCCCAGCATTTTGGTGATGGGACTCGACTC": [ - "['15', '4', '1', '7', 'AAAAAAAAAAAAAA']|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGAAAAAAAAAAAAAACAGCCCCAGCATTTTGGTGATGGGACTCGACTC|IIIIIIIIIIIIIII-II-IIIIIIIIIIIIIIIIIIIIIIIIIIIII-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII|LH00409:259:22JJCFLT4:8:1149:38410:17375" - ], - } + assert barcode_dcretc == {} def test_barcode_collision_no_tcr_check( self, From be7b05bfd360cca293092811ed66d6cd184098ff Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Mon, 18 May 2026 11:55:05 +0100 Subject: [PATCH 3/5] test(collapse): update freq reference for beta chain tests --- tests/resources/dcr_TINY_1_beta.freq | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/resources/dcr_TINY_1_beta.freq b/tests/resources/dcr_TINY_1_beta.freq index 1803482..59e185f 100644 --- a/tests/resources/dcr_TINY_1_beta.freq +++ b/tests/resources/dcr_TINY_1_beta.freq @@ -1,6 +1,6 @@ -15, 10, 4, 1, CTACCCCCGCGGGGAC, 2, 2 +15, 10, 4, 1, CTACCCCCGCGGAGAC, 1, 2 15, 10, 4, 1, CTACCCCCGCAAAGAC, 1, 1 -15, 10, 3, 1, CGGGGACCTACCCCC, 1, 1 +15, 10, 4, 1, CTACCCCCGCGGGGAC, 1, 1 43, 0, 5, 6, GGAGGGACAG, 1, 2 15, 6, 3, 9, CCTAGCGGAATACTCCTACAC, 1, 1 9, 6, 4, 0, CTCACGGGGGGTT, 1, 1 From 7e0d05705a4b4d0114837ddb92963bf525045c03 Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Mon, 18 May 2026 12:02:45 +0100 Subject: [PATCH 4/5] test(translate): update translate test reference --- tests/resources/dcr_TINY_1_beta.tsv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/resources/dcr_TINY_1_beta.tsv b/tests/resources/dcr_TINY_1_beta.tsv index bb6181b..3091ae8 100644 --- a/tests/resources/dcr_TINY_1_beta.tsv +++ b/tests/resources/dcr_TINY_1_beta.tsv @@ -1,7 +1,7 @@ sequence_id v_call d_call j_call junction_aa duplicate_count sequence junction decombinator_id rev_comp productive sequence_aa cdr1_aa cdr2_aa vj_in_frame stop_codon conserved_c conserved_f sequence_alignment germline_alignment v_cigar d_cigar j_cigar av_UMI_cluster_size -1 TRBV20-1 TRBJ2-5 CSATTPAGTQETQYF 2 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCGGGGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAGTQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 2 +1 TRBV20-1 TRBJ2-5 CSATTPAETQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCGGAGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCGGAGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCGGAGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAETQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 2 2 TRBV20-1 TRBJ2-5 CSATTPAKTQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCAAAGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCAAAGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCAAAGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAKTQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 1 -3 TRBV20-1 TRBJ2-5 CSASGDLPPQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGCGGGGACCTACCCCCCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTAGCGGGGACCTACCCCCCCAAGAGACCCAGTACTTC 15, 10, 3, 1, CGGGGACCTACCCCC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASGDLPPQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 1 +3 TRBV20-1 TRBJ2-5 CSATTPAGTQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCGGGGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAGTQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 1 4 TRBV7-9 TRBJ1-1 CASSGGTAEAFF 1 GATACTGGAGTCTCCCAGAACCCCAGACACAAGATCACAAAGAGGGGACAGAATGTAACTTTCAGGTGTGATCCAATTTCTGAACACAACCGCCTTTATTGGTACCGACAGACCCTGGGGCAGGGCCCAGAGTTTCTGACTTACTTCCAGAATGAAGCTCAACTAGAAAAATCAAGGCTGCTCAGTGATCGGTTCTCTGCAGAGAGGCCTAAGGGATCTTTCTCCACCTTGGAGATCCAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCGGAGGGACAGCTGAAGCTTTCTTTGGACAAGGCACCAGACTCACAGTTGTAG TGTGCCAGCAGCGGAGGGACAGCTGAAGCTTTCTTT 43, 0, 5, 6, GGAGGGACAG F T DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSGGTAEAFFGQGTRLTVV SEHNR FQNEAQ T F T T 2 5 TRBV20-1 TRBJ2-1 CSASLAEYSYTEQFF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGCCTAGCGGAATACTCCTACACTGAGCAGTTCTTCGGGCCAGGGACACGGCTCACCGTGCTAG TGCAGTGCTAGCCTAGCGGAATACTCCTACACTGAGCAGTTCTTC 15, 6, 3, 9, CCTAGCGGAATACTCCTACAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASLAEYSYTEQFFGPGTRLTVL DFQATT SNEGSKA T F T T 1 6 TRBV14 TRBJ2-1 CASSPHGGFSYNEQFF 1 GAAGCTGGAGTTACTCAGTTCCCCAGCCACAGCGTAATAGAGAAGGGCCAGACTGTGACTCTGAGATGTGACCCAATTTCTGGACATGATAATCTTTATTGGTATCGACGTGTTATGGGAAAAGAAATAAAATTTCTGTTACATTTTGTGAAAGAGTCTAAACAGGATGAGTCCGGTATGCCCAACAATCGATTCTTAGCTGAAAGGACTGGAGGGACGTATTCTACTCTGAAGGTGCAGCCTGCAGAACTGGAGGATTCTGGAGTTTATTTCTGTGCCAGCAGCCCTCACGGGGGGTTCTCCTACAATGAGCAGTTCTTCGGGCCAGGGACACGGCTCACCGTGCTAG TGTGCCAGCAGCCCTCACGGGGGGTTCTCCTACAATGAGCAGTTCTTC 9, 6, 4, 0, CTCACGGGGGGTT F T EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSPHGGFSYNEQFFGPGTRLTVL SGHDN FVKESK T F T T 1 From 29866b99b39bfe1dd8cddbc4782ba1cb957ccbbc Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Mon, 18 May 2026 13:15:48 +0100 Subject: [PATCH 5/5] feat(collapse): log multi-tcr barcodes and reads --- src/decombinator/collapse.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/decombinator/collapse.py b/src/decombinator/collapse.py index 7aa85c3..d81d336 100644 --- a/src/decombinator/collapse.py +++ b/src/decombinator/collapse.py @@ -593,7 +593,7 @@ def read_in_data( if barcode in barcode_multi_tcr: # if barcode already marked as multi-tcr barcode, drop read - counts["number_multi_tcr_barcode_reads"] += 1 + counts["multi_tcr_barcode_reads"] += 1 continue if barcode in barcode_lookup: @@ -659,7 +659,7 @@ def read_in_data( if not group_assigned: # If barcode already assigned to non-equivalent TCR, register barcode as multi- # tcr UMI and remove from analysis - counts["number_multi_tcr_barcode_reads"] += 1 + counts["multi_tcr_barcode_reads"] += 1 if barcode in barcode_multi_tcr: raise ValueError( "Barcode cannot be added twice to multi_tcr list" @@ -669,7 +669,6 @@ def read_in_data( del barcode_dcretc[ barcode + "|" + str(barcode_index) + "|" + barcode_protoseq ] - counts["multi_tcr_barcodes"] += 1 continue else: @@ -684,6 +683,7 @@ def read_in_data( counts["readdata_barcode_dcretc_keys"] = len(barcode_dcretc.keys()) counts["number_input_unique_dcrs"] = len(input_dcr_counts.keys()) counts["number_input_total_dcrs"] = sum(input_dcr_counts.values()) + counts["multi_tcr_barcodes"] = len(barcode_multi_tcr) t1 = time.time() print(" Read in total of", lcount + 1, "lines") @@ -1170,6 +1170,10 @@ def collapsinator(inputargs: dict, data: list = None) -> list: + str(counts["readdata_fail_no_bclocs"]) + "\nBarcodeFail_LowQuality," + str(counts["readdata_fail_low_barcode_quality"]) + + "\nMultiTCRBarcodes," + + str(counts["multi_tcr_barcodes"]) + + "\nMultiTCRBarcodeReads," + + str(counts["multi_tcr_barcode_reads"]) ) print(summstr, file=summaryfile)