diff --git a/src/decombinator/collapse.py b/src/decombinator/collapse.py index 9be7f01..d81d336 100644 --- a/src/decombinator/collapse.py +++ b/src/decombinator/collapse.py @@ -513,6 +513,7 @@ def read_in_data( t0 = time.time() barcode_dcretc = coll.defaultdict(list) barcode_lookup = coll.defaultdict(list) + barcode_multi_tcr = set() input_dcr_counts = coll.Counter() ratio = 1 @@ -590,6 +591,11 @@ def read_in_data( # where index counts upwards from zero to help disinguish identical barcodes in different groups, # protoseq is the most common sequence present in the group, and dcretc are the input reads + if barcode in barcode_multi_tcr: + # if barcode already marked as multi-tcr barcode, drop read + counts["multi_tcr_barcode_reads"] += 1 + continue + if barcode in barcode_lookup: for barcode_index, barcode_protoseq in barcode_lookup[barcode]: @@ -631,14 +637,14 @@ def read_in_data( + "|" + str(barcode_index) + "|" - + new_protoseq + + barcode_protoseq ] del barcode_dcretc[ barcode + "|" + str(barcode_index) + "|" - + new_protoseq + + barcode_protoseq ] barcode_lookup[barcode][barcode_index] = [ @@ -651,13 +657,19 @@ def read_in_data( break if not group_assigned: - # if no appropriate group found, create new group with correctly incremented index - new_index = len(barcode_lookup[barcode]) - barcode_lookup[barcode].append([new_index, seq]) - barcode_dcretc[ - "|".join([barcode, str(new_index), seq]) - ].append(dcretc) - group_assigned = True + # If barcode already assigned to non-equivalent TCR, register barcode as multi- + # tcr UMI and remove from analysis + counts["multi_tcr_barcode_reads"] += 1 + if barcode in barcode_multi_tcr: + raise ValueError( + "Barcode cannot be added twice to multi_tcr list" + ) + barcode_multi_tcr.add(barcode) + del barcode_lookup[barcode] + del barcode_dcretc[ + barcode + "|" + str(barcode_index) + "|" + barcode_protoseq + ] + continue else: # if no identical barcode found, create new barcode group with index zero @@ -671,6 +683,7 @@ def read_in_data( counts["readdata_barcode_dcretc_keys"] = len(barcode_dcretc.keys()) counts["number_input_unique_dcrs"] = len(input_dcr_counts.keys()) counts["number_input_total_dcrs"] = sum(input_dcr_counts.values()) + counts["multi_tcr_barcodes"] = len(barcode_multi_tcr) t1 = time.time() print(" Read in total of", lcount + 1, "lines") @@ -1157,6 +1170,10 @@ def collapsinator(inputargs: dict, data: list = None) -> list: + str(counts["readdata_fail_no_bclocs"]) + "\nBarcodeFail_LowQuality," + str(counts["readdata_fail_low_barcode_quality"]) + + "\nMultiTCRBarcodes," + + str(counts["multi_tcr_barcodes"]) + + "\nMultiTCRBarcodeReads," + + str(counts["multi_tcr_barcode_reads"]) ) print(summstr, file=summaryfile) diff --git a/tests/resources/dcr_TINY_1_beta.freq b/tests/resources/dcr_TINY_1_beta.freq index 1803482..59e185f 100644 --- a/tests/resources/dcr_TINY_1_beta.freq +++ b/tests/resources/dcr_TINY_1_beta.freq @@ -1,6 +1,6 @@ -15, 10, 4, 1, CTACCCCCGCGGGGAC, 2, 2 +15, 10, 4, 1, CTACCCCCGCGGAGAC, 1, 2 15, 10, 4, 1, CTACCCCCGCAAAGAC, 1, 1 -15, 10, 3, 1, CGGGGACCTACCCCC, 1, 1 +15, 10, 4, 1, CTACCCCCGCGGGGAC, 1, 1 43, 0, 5, 6, GGAGGGACAG, 1, 2 15, 6, 3, 9, CCTAGCGGAATACTCCTACAC, 1, 1 9, 6, 4, 0, CTCACGGGGGGTT, 1, 1 diff --git a/tests/resources/dcr_TINY_1_beta.tsv b/tests/resources/dcr_TINY_1_beta.tsv index bb6181b..3091ae8 100644 --- a/tests/resources/dcr_TINY_1_beta.tsv +++ b/tests/resources/dcr_TINY_1_beta.tsv @@ -1,7 +1,7 @@ sequence_id v_call d_call j_call junction_aa duplicate_count sequence junction decombinator_id rev_comp productive sequence_aa cdr1_aa cdr2_aa vj_in_frame stop_codon conserved_c conserved_f sequence_alignment germline_alignment v_cigar d_cigar j_cigar av_UMI_cluster_size -1 TRBV20-1 TRBJ2-5 CSATTPAGTQETQYF 2 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCGGGGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAGTQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 2 +1 TRBV20-1 TRBJ2-5 CSATTPAETQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCGGAGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCGGAGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCGGAGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAETQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 2 2 TRBV20-1 TRBJ2-5 CSATTPAKTQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCAAAGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCAAAGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCAAAGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAKTQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 1 -3 TRBV20-1 TRBJ2-5 CSASGDLPPQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGCGGGGACCTACCCCCCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTAGCGGGGACCTACCCCCCCAAGAGACCCAGTACTTC 15, 10, 3, 1, CGGGGACCTACCCCC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASGDLPPQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 1 +3 TRBV20-1 TRBJ2-5 CSATTPAGTQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCGGGGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAGTQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 1 4 TRBV7-9 TRBJ1-1 CASSGGTAEAFF 1 GATACTGGAGTCTCCCAGAACCCCAGACACAAGATCACAAAGAGGGGACAGAATGTAACTTTCAGGTGTGATCCAATTTCTGAACACAACCGCCTTTATTGGTACCGACAGACCCTGGGGCAGGGCCCAGAGTTTCTGACTTACTTCCAGAATGAAGCTCAACTAGAAAAATCAAGGCTGCTCAGTGATCGGTTCTCTGCAGAGAGGCCTAAGGGATCTTTCTCCACCTTGGAGATCCAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCGGAGGGACAGCTGAAGCTTTCTTTGGACAAGGCACCAGACTCACAGTTGTAG TGTGCCAGCAGCGGAGGGACAGCTGAAGCTTTCTTT 43, 0, 5, 6, GGAGGGACAG F T DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSGGTAEAFFGQGTRLTVV SEHNR FQNEAQ T F T T 2 5 TRBV20-1 TRBJ2-1 CSASLAEYSYTEQFF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGCCTAGCGGAATACTCCTACACTGAGCAGTTCTTCGGGCCAGGGACACGGCTCACCGTGCTAG TGCAGTGCTAGCCTAGCGGAATACTCCTACACTGAGCAGTTCTTC 15, 6, 3, 9, CCTAGCGGAATACTCCTACAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASLAEYSYTEQFFGPGTRLTVL DFQATT SNEGSKA T F T T 1 6 TRBV14 TRBJ2-1 CASSPHGGFSYNEQFF 1 GAAGCTGGAGTTACTCAGTTCCCCAGCCACAGCGTAATAGAGAAGGGCCAGACTGTGACTCTGAGATGTGACCCAATTTCTGGACATGATAATCTTTATTGGTATCGACGTGTTATGGGAAAAGAAATAAAATTTCTGTTACATTTTGTGAAAGAGTCTAAACAGGATGAGTCCGGTATGCCCAACAATCGATTCTTAGCTGAAAGGACTGGAGGGACGTATTCTACTCTGAAGGTGCAGCCTGCAGAACTGGAGGATTCTGGAGTTTATTTCTGTGCCAGCAGCCCTCACGGGGGGTTCTCCTACAATGAGCAGTTCTTCGGGCCAGGGACACGGCTCACCGTGCTAG TGTGCCAGCAGCCCTCACGGGGGGTTCTCCTACAATGAGCAGTTCTTC 9, 6, 4, 0, CTCACGGGGGGTT F T EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSPHGGFSYNEQFFGPGTRLTVL SGHDN FVKESK T F T T 1 diff --git a/tests/test_collapse.py b/tests/test_collapse.py index 5f66b10..07f994d 100644 --- a/tests/test_collapse.py +++ b/tests/test_collapse.py @@ -289,15 +289,7 @@ def test_barcode_collision( dont_count=False, opener=open, ) - assert barcode_dcretc == { - "CACCCGCTGACT|0|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGCCCCCAGGGGGCTCCAGCCCCAGCATTTTGGTGATGGGACTCGACTC": [ - "['15', '4', '1', '7', 'CCCCCAGGGGGCTC']|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGCCCCCAGGGGGCTCCAGCCCCAGCATTTTGGTGATGGGACTCGACTC|IIIIIIIIIIIIIII-II-IIIIIIIIIIIIIIIIIIIIIIIIIIIII-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII|LH00409:259:22JJCFLT4:8:1149:38410:17375", - "['15', '4', '1', '7', 'CCCCCAGGGGGCTG']|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGCCCCCAGGGGGCTGCAGCCCCAGCATTTTGGTGATGGGACTCGACTC|IIIIIIIIIIIIIII-II-IIIIIIIIIIIIIIIIIIIIIIIIIIIII-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII|LH00409:259:22JJCFLT4:8:1149:38410:17375", - ], - "CACCCGCTGACT|1|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGAAAAAAAAAAAAAACAGCCCCAGCATTTTGGTGATGGGACTCGACTC": [ - "['15', '4', '1', '7', 'AAAAAAAAAAAAAA']|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGAAAAAAAAAAAAAACAGCCCCAGCATTTTGGTGATGGGACTCGACTC|IIIIIIIIIIIIIII-II-IIIIIIIIIIIIIIIIIIIIIIIIIIIII-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII|LH00409:259:22JJCFLT4:8:1149:38410:17375" - ], - } + assert barcode_dcretc == {} def test_barcode_collision_no_tcr_check( self,