Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 26 additions & 9 deletions src/decombinator/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,7 @@ def read_in_data(
t0 = time.time()
barcode_dcretc = coll.defaultdict(list)
barcode_lookup = coll.defaultdict(list)
barcode_multi_tcr = set()

input_dcr_counts = coll.Counter()
ratio = 1
Expand Down Expand Up @@ -590,6 +591,11 @@ def read_in_data(
# where index counts upwards from zero to help distinguish identical barcodes in different groups,
# protoseq is the most common sequence present in the group, and dcretc are the input reads

if barcode in barcode_multi_tcr:
# if barcode already marked as multi-tcr barcode, drop read
counts["multi_tcr_barcode_reads"] += 1
continue

if barcode in barcode_lookup:

for barcode_index, barcode_protoseq in barcode_lookup[barcode]:
Expand Down Expand Up @@ -631,14 +637,14 @@ def read_in_data(
+ "|"
+ str(barcode_index)
+ "|"
+ new_protoseq
+ barcode_protoseq
]
del barcode_dcretc[
barcode
+ "|"
+ str(barcode_index)
+ "|"
+ new_protoseq
+ barcode_protoseq
]

barcode_lookup[barcode][barcode_index] = [
Expand All @@ -651,13 +657,19 @@ def read_in_data(
break

if not group_assigned:
# if no appropriate group found, create new group with correctly incremented index
new_index = len(barcode_lookup[barcode])
barcode_lookup[barcode].append([new_index, seq])
barcode_dcretc[
"|".join([barcode, str(new_index), seq])
].append(dcretc)
group_assigned = True
# If the barcode is already assigned to a non-equivalent TCR, register it as a
# multi-TCR UMI and remove it from the analysis
counts["multi_tcr_barcode_reads"] += 1
if barcode in barcode_multi_tcr:
raise ValueError(
"Barcode cannot be added twice to multi_tcr list"
)
barcode_multi_tcr.add(barcode)
del barcode_lookup[barcode]
del barcode_dcretc[
barcode + "|" + str(barcode_index) + "|" + barcode_protoseq
]
continue

else:
# if no identical barcode found, create new barcode group with index zero
Expand All @@ -671,6 +683,7 @@ def read_in_data(
counts["readdata_barcode_dcretc_keys"] = len(barcode_dcretc.keys())
counts["number_input_unique_dcrs"] = len(input_dcr_counts.keys())
counts["number_input_total_dcrs"] = sum(input_dcr_counts.values())
counts["multi_tcr_barcodes"] = len(barcode_multi_tcr)

t1 = time.time()
print(" Read in total of", lcount + 1, "lines")
Expand Down Expand Up @@ -1157,6 +1170,10 @@ def collapsinator(inputargs: dict, data: list = None) -> list:
+ str(counts["readdata_fail_no_bclocs"])
+ "\nBarcodeFail_LowQuality,"
+ str(counts["readdata_fail_low_barcode_quality"])
+ "\nMultiTCRBarcodes,"
+ str(counts["multi_tcr_barcodes"])
+ "\nMultiTCRBarcodeReads,"
+ str(counts["multi_tcr_barcode_reads"])
)

print(summstr, file=summaryfile)
Expand Down
4 changes: 2 additions & 2 deletions tests/resources/dcr_TINY_1_beta.freq
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
15, 10, 4, 1, CTACCCCCGCGGGGAC, 2, 2
15, 10, 4, 1, CTACCCCCGCGGAGAC, 1, 2
15, 10, 4, 1, CTACCCCCGCAAAGAC, 1, 1
15, 10, 3, 1, CGGGGACCTACCCCC, 1, 1
15, 10, 4, 1, CTACCCCCGCGGGGAC, 1, 1
43, 0, 5, 6, GGAGGGACAG, 1, 2
15, 6, 3, 9, CCTAGCGGAATACTCCTACAC, 1, 1
9, 6, 4, 0, CTCACGGGGGGTT, 1, 1
Expand Down
4 changes: 2 additions & 2 deletions tests/resources/dcr_TINY_1_beta.tsv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
sequence_id v_call d_call j_call junction_aa duplicate_count sequence junction decombinator_id rev_comp productive sequence_aa cdr1_aa cdr2_aa vj_in_frame stop_codon conserved_c conserved_f sequence_alignment germline_alignment v_cigar d_cigar j_cigar av_UMI_cluster_size
1 TRBV20-1 TRBJ2-5 CSATTPAGTQETQYF 2 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCGGGGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAGTQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 2
1 TRBV20-1 TRBJ2-5 CSATTPAETQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCGGAGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCGGAGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCGGAGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAETQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 2
2 TRBV20-1 TRBJ2-5 CSATTPAKTQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCAAAGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCAAAGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCAAAGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAKTQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 1
3 TRBV20-1 TRBJ2-5 CSASGDLPPQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGCGGGGACCTACCCCCCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTAGCGGGGACCTACCCCCCCAAGAGACCCAGTACTTC 15, 10, 3, 1, CGGGGACCTACCCCC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASGDLPPQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 1
3 TRBV20-1 TRBJ2-5 CSATTPAGTQETQYF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGCTCG TGCAGTGCTACTACCCCCGCGGGGACCCAAGAGACCCAGTACTTC 15, 10, 4, 1, CTACCCCCGCGGGGAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATTPAGTQETQYFGPGTRLLVL DFQATT SNEGSKA T F T T 1
4 TRBV7-9 TRBJ1-1 CASSGGTAEAFF 1 GATACTGGAGTCTCCCAGAACCCCAGACACAAGATCACAAAGAGGGGACAGAATGTAACTTTCAGGTGTGATCCAATTTCTGAACACAACCGCCTTTATTGGTACCGACAGACCCTGGGGCAGGGCCCAGAGTTTCTGACTTACTTCCAGAATGAAGCTCAACTAGAAAAATCAAGGCTGCTCAGTGATCGGTTCTCTGCAGAGAGGCCTAAGGGATCTTTCTCCACCTTGGAGATCCAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCGGAGGGACAGCTGAAGCTTTCTTTGGACAAGGCACCAGACTCACAGTTGTAG TGTGCCAGCAGCGGAGGGACAGCTGAAGCTTTCTTT 43, 0, 5, 6, GGAGGGACAG F T DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSGGTAEAFFGQGTRLTVV SEHNR FQNEAQ T F T T 2
5 TRBV20-1 TRBJ2-1 CSASLAEYSYTEQFF 1 GGTGCTGTCGTCTCTCAACATCCGAGCTGGGTTATCTGTAAGAGTGGAACCTCTGTGAAGATCGAGTGCCGTTCCCTGGACTTTCAGGCCACAACTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGATGGCAACTTCCAATGAGGGCTCCAAGGCCACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATGCAAGCCTGACCTTGTCCACTCTGACAGTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGCCTAGCGGAATACTCCTACACTGAGCAGTTCTTCGGGCCAGGGACACGGCTCACCGTGCTAG TGCAGTGCTAGCCTAGCGGAATACTCCTACACTGAGCAGTTCTTC 15, 6, 3, 9, CCTAGCGGAATACTCCTACAC F T GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASLAEYSYTEQFFGPGTRLTVL DFQATT SNEGSKA T F T T 1
6 TRBV14 TRBJ2-1 CASSPHGGFSYNEQFF 1 GAAGCTGGAGTTACTCAGTTCCCCAGCCACAGCGTAATAGAGAAGGGCCAGACTGTGACTCTGAGATGTGACCCAATTTCTGGACATGATAATCTTTATTGGTATCGACGTGTTATGGGAAAAGAAATAAAATTTCTGTTACATTTTGTGAAAGAGTCTAAACAGGATGAGTCCGGTATGCCCAACAATCGATTCTTAGCTGAAAGGACTGGAGGGACGTATTCTACTCTGAAGGTGCAGCCTGCAGAACTGGAGGATTCTGGAGTTTATTTCTGTGCCAGCAGCCCTCACGGGGGGTTCTCCTACAATGAGCAGTTCTTCGGGCCAGGGACACGGCTCACCGTGCTAG TGTGCCAGCAGCCCTCACGGGGGGTTCTCCTACAATGAGCAGTTCTTC 9, 6, 4, 0, CTCACGGGGGGTT F T EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSPHGGFSYNEQFFGPGTRLTVL SGHDN FVKESK T F T T 1
Expand Down
10 changes: 1 addition & 9 deletions tests/test_collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,15 +289,7 @@ def test_barcode_collision(
dont_count=False,
opener=open,
)
assert barcode_dcretc == {
"CACCCGCTGACT|0|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGCCCCCAGGGGGCTCCAGCCCCAGCATTTTGGTGATGGGACTCGACTC": [
"['15', '4', '1', '7', 'CCCCCAGGGGGCTC']|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGCCCCCAGGGGGCTCCAGCCCCAGCATTTTGGTGATGGGACTCGACTC|IIIIIIIIIIIIIII-II-IIIIIIIIIIIIIIIIIIIIIIIIIIIII-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII|LH00409:259:22JJCFLT4:8:1149:38410:17375",
"['15', '4', '1', '7', 'CCCCCAGGGGGCTG']|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGCCCCCAGGGGGCTGCAGCCCCAGCATTTTGGTGATGGGACTCGACTC|IIIIIIIIIIIIIII-II-IIIIIIIIIIIIIIIIIIIIIIIIIIIII-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII|LH00409:259:22JJCFLT4:8:1149:38410:17375",
],
"CACCCGCTGACT|1|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGAAAAAAAAAAAAAACAGCCCCAGCATTTTGGTGATGGGACTCGACTC": [
"['15', '4', '1', '7', 'AAAAAAAAAAAAAA']|ATCCTGAAGACAGCAGCTTCTACATCTGCAGTGCTAGAGAAAAAAAAAAAAAACAGCCCCAGCATTTTGGTGATGGGACTCGACTC|IIIIIIIIIIIIIII-II-IIIIIIIIIIIIIIIIIIIIIIIIIIIII-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII|LH00409:259:22JJCFLT4:8:1149:38410:17375"
],
}
assert barcode_dcretc == {}

def test_barcode_collision_no_tcr_check(
self,
Expand Down
Loading