Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
Binary file added data/.DS_Store
Binary file not shown.
171 changes: 171 additions & 0 deletions data/msi_sites/msi_sites_170.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
chromosome location repeat_unit_length repeat_unit_binary repeat_times left_flank_binary right_flank_binary repeat_unit_bases left_flank_bases right_flank_bases
1 8080134 1 0 16 927 512 A TGCTT GAAAA
1 11293367 1 0 11 319 812 A CATTT TAGTA
1 46597496 1 3 11 573 469 T GATTC CTCCC
1 59245375 1 3 13 289 17 T CAGAC AACAC
1 78432506 1 0 14 671 626 A GGCTT GCTAG
1 78432647 1 0 11 754 1023 A GTTAG TTTTT
1 148888276 1 0 13 713 640 A GTAGC GGAAA
1 161332091 1 3 14 245 639 T ATTCC GCTTT
1 162831227 1 3 12 468 477 T CTCCA CTCTC
1 176050430 1 0 16 206 528 A ATATG GACAA
1 176105710 1 0 12 33 604 A AAGAC GCCTA
1 204494756 1 3 14 1018 63 T TTTGG AATTT
1 228607444 1 3 10 954 259 T TGTGG CAAAT
1 243736210 1 3 15 241 95 T ATTAC ACCTT
2 47635523 1 3 13 945 40 T TGTAC AAGGA
2 47641559 1 0 27 299 687 A CAGGT GGGTT
2 48032740 1 3 13 952 43 T TGTGA AAGGT
2 48033890 1 3 18 1 63 T AAAAC AATTT
2 61145498 1 3 13 852 703 T TCCCA GGTTT
2 61726050 1 0 11 231 273 A ATGCT CACAC
2 91887905 1 3 13 980 625 T TTCCA GCTAC
2 113990907 1 0 15 650 346 A GGAGG CCCGG
2 158655883 1 0 11 37 836 A AAGCC TCACA
2 198257652 1 0 12 831 837 A TATTT TCACC
2 198267243 1 0 13 770 528 A TAAAG GACAA
2 202139739 1 3 14 969 633 T TTAGC GCTGC
2 212578379 1 0 14 911 688 A TGATT GGTAA
2 215657182 1 0 16 783 577 A TAATT GCAAC
3 30691871 1 0 10 522 606 A GAAGG GCCTG
3 41268676 2 3 14 66 509 AT ACAAG CTTTC
3 47059029 1 0 11 66 799 A ACAAG TACTT
3 47147393 1 0 22 885 832 A TCTCC TCAAA
3 47147395 1 0 20 848 832 A TCCAA TCAAA
3 49723726 1 1 12 598 93 C GCCCG ACCTC
3 52696310 1 0 11 34 287 A AAGAG CACTT
3 71008341 1 3 13 110 383 T ACGTG CCTTT
3 72495593 1 0 17 287 727 A CACTT GTCCT
3 138400782 1 0 13 542 540 A GACTG GACTA
3 142231062 1 0 18 807 513 A TAGCT GAAAC
3 178937630 1 0 11 927 975 A TGCTT TTATT
3 185155430 1 3 11 41 32 T AAGGC AAGAA
4 1806012 1 2 11 347 330 G CCCGT CCAGG
4 54280705 1 0 13 834 288 A TCAAG CAGAA
4 55598211 1 3 25 1016 544 T TTTGA GAGAA
4 55976947 1 0 13 3 692 A AAAAT GGTCA
4 153268227 1 0 14 113 546 A ACTAC GAGAG
4 187560860 1 0 10 35 497 A AAGAT CTTAC
4 187627659 1 0 16 943 928 A TGGTT TGGAA
5 38978758 1 0 11 831 642 A TATTT GGAAG
5 86629061 1 3 11 13 235 T AAATC ATGGT
5 86669949 2 3 16 684 1023 AT GGGTA TTTTT
5 86669965 1 3 12 819 173 T TATAT AGGTC
5 86679495 1 0 18 1015 1013 A TTTCT TTTCC
5 170837513 1 3 13 253 330 T ATTTC CCAGG
6 407629 1 3 19 1013 561 T TTTCC GATAC
6 6269228 1 3 12 62 44 T AATTG AAGTA
6 7518105 1 3 13 208 482 T ATCAA CTGAG
6 32166160 1 3 13 161 506 T AGGAC CTTGG
6 37140670 1 3 14 168 2 T AGGGA AAAAG
6 117642992 1 3 20 656 584 T GGCAA GCAGA
6 117681436 2 1 13 234 605 AC ATGGG GCCTC
6 117725383 1 0 15 733 1002 A GTCTC TTGGG
6 137524870 1 0 14 222 837 A ATCTG TCACC
6 157495951 1 3 14 948 245 T TGTCA ATTCC
7 6037057 1 0 17 30 977 A AACTG TTCAC
7 6038620 1 0 16 477 835 A CTCTC TCAAT
7 52528952 1 3 11 1016 488 T TTTGA CTGGA
7 116381121 1 3 16 942 703 T TGGTG GGTTT
7 116409675 1 3 15 261 383 T CAACC CCTTT
7 140482263 1 0 16 1013 833 A TTTCC TCAAC
7 140496148 1 0 16 223 671 A ATCTT GGCTT
7 140498359 1 3 21 421 9 T CGGCC AAAGC
7 151921037 1 3 11 254 625 T ATTTG GCTAC
8 55374916 1 3 13 462 690 T CTATG GGTAG
8 59337246 1 2 11 41 478 G AAGGC CTCTG
8 117864952 1 0 14 463 512 A CTATT GAAAA
8 145738581 1 2 12 187 916 G AGTGT TGCCA
9 8341280 1 0 12 202 640 A ATAGG GGAAA
9 80343587 1 0 14 890 840 A TCTGG TCAGA
9 93650758 1 0 12 383 636 A CCTTT GCTTA
9 135773000 1 0 18 862 542 A TCCTG GACTG
10 8115668 1 0 18 11 995 A AAAGT TTGAT
10 43595836 1 3 14 56 727 T AATGA GTCCT
10 63760087 1 3 13 557 369 T GAGTC CCTAC
10 89725293 1 3 11 645 58 T GGACC AATGG
10 114910667 1 3 11 926 511 T TGCTG CTTTT
11 18586080 1 0 12 895 592 A TCTTT GCCAA
11 94212930 1 0 11 95 753 A ACCTT GTTAC
11 102056722 1 3 15 312 492 T CATGA CTGTA
11 102080325 1 0 15 122 563 A ACTGG GATAT
11 102199611 1 3 10 204 484 T ATATA CTGCA
11 108114661 1 3 15 48 32 T AATAA AAGAA
11 108121410 1 3 15 821 167 T TATCC AGGCT
11 108141955 1 3 15 897 81 T TGAAC ACCAC
11 118353037 1 3 16 50 451 T AATAG CTAAT
11 118369265 1 0 15 639 543 A GCTTT GACTT
11 119145463 1 3 12 958 467 T TGTTG CTCAT
12 416046 1 3 13 201 4 T ATAGC AAACA
12 498267 1 2 15 421 853 G CGGCC TCCCC
12 3001248 1 0 10 95 911 A ACCTT TGATT
12 12013498 1 0 11 733 256 A GTCTC CAAAA
12 12017687 1 0 12 127 639 A ACTTT GCTTT
12 12022175 1 0 14 18 825 A AACAG TATGC
12 12024131 1 3 18 725 324 T GTCCC CCACA
12 12030236 1 3 16 801 63 T TAGAC AATTT
12 12032966 1 0 19 959 640 A TGTTT GGAAA
12 97487819 1 3 11 110 767 T ACGTG GTTTT
12 133237753 1 0 14 94 656 A ACCTG GGCAA
13 32907535 1 3 11 493 704 T CTGTC GTAAA
13 48954106 1 0 13 0 976 A AAAAA TTCAA
13 48954159 1 3 13 125 15 T ACTTC AAATT
13 48954282 1 3 11 781 383 T TAATC CCTTT
13 88543607 1 3 14 893 510 T TCTTC CTTTG
14 23652346 1 0 21 999 660 A TTGCT GGCCA
14 65472874 1 3 17 749 593 T GTGTC GCCAC
14 65568304 1 0 11 546 800 A GAGAG TAGAA
14 68944321 1 3 11 16 257 T AACAA CAAAC
14 68944343 1 3 14 989 383 T TTCTC CCTTT
14 95571635 1 0 11 2 532 A AAAAG GACCA
14 95574594 1 3 13 114 383 T ACTAG CCTTT
15 21131646 1 0 13 969 548 A TTAGC GAGCA
15 22146586 1 0 11 478 968 A CTCTG TTAGA
15 41988194 1 3 13 77 479 T ACATC CTCTT
15 41991036 1 3 15 796 571 T TACTA GATGT
15 91303325 1 3 12 33 349 T AAGAC CCCTC
16 3808052 1 0 13 863 941 A TCCTT TGGTC
16 9934670 1 0 20 41 546 A AAGGC GAGAG
16 23615042 1 0 14 271 692 A CAATT GGTCA
16 72832618 1 0 16 495 832 A CTGTT TCAAA
16 81954996 1 3 15 48 544 T AATAA GAGAA
17 15973417 1 3 17 10 14 T AAAGG AAATG
17 16041578 1 0 13 263 506 A CAACT CTTGG
17 29086494 1 3 12 1013 159 T TTTCC AGCTT
17 29508819 1 3 16 248 735 T ATTGA GTCTT
17 30293149 1 3 14 1013 159 T TTTCC AGCTT
17 37855713 1 0 14 447 727 A CGTTT GTCCT
17 40359505 2 4 19 718 64 CA GTATG ACAAA
17 40368183 1 3 12 477 499 T CTCTC CTTAT
17 40369097 1 0 10 739 831 A GTGAT TATTT
17 40491273 1 0 17 733 963 A GTCTC TTAAT
17 41242945 2 1 6 273 639 AC CACAC GCTTT
17 59857599 1 0 11 501 260 A CTTCC CAACA
18 48584855 1 3 16 668 690 T GGCTA GGTAG
18 56363569 1 3 26 1008 160 T TTTAA AGGAA
18 57571785 1 3 15 693 482 T GGTCC CTGAG
19 10273296 1 0 20 938 514 A TGGGG GAAAG
19 14628247 1 0 11 130 256 A AGAAG CAAAA
19 15366000 1 0 10 250 682 A ATTGG GGGGG
19 30311542 1 3 12 66 343 T ACAAG CCCCT
19 41759631 1 1 12 168 30 C AGGGA AACTG
19 50911947 1 3 12 56 10 T AATGA AAAGG
20 46266543 1 3 14 458 633 T CTAGG GCTGC
20 46270911 1 0 11 498 581 A CTTAG GCACC
21 42838093 1 0 10 202 750 A ATAGG GTGTG
21 42866994 1 0 16 733 707 A GTCTC GTAAT
21 42867553 1 0 16 733 327 A GTCTC CCACT
21 42867842 1 0 14 15 745 A AAATT GTGGC
22 30051705 1 3 16 937 607 T TGGGC GCCTT
22 41545024 1 3 14 1006 293 T TTGTG CAGCC
22 41550984 1 3 11 913 47 T TGCAC AAGTT
22 41551231 1 3 12 184 341 T AGTGA CCCCC
X 39930433 1 0 13 570 260 A GATGG CAACA
X 44918221 1 3 14 504 351 T CTTGA CCCTT
X 44935877 1 3 13 573 495 T GATTC CTGTT
X 44935924 1 3 13 213 330 T ATCCC CCAGG
X 44949951 1 3 11 30 370 T AACTG CCTAG
X 48892468 1 3 13 370 375 T CCTAG CCTCT
X 48895224 1 0 18 886 572 A TCTCG GATTA
X 123199993 1 3 27 496 75 T CTTAA ACAGT
X 123204978 1 3 14 861 12 T TCCTC AAATA
10 changes: 7 additions & 3 deletions scripts/stride_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ def main():

p.add_argument("--min-coverage", type=int, default=20)
p.add_argument("--max-repeat-bins", type=int, default=100)
p.add_argument("--keep-features", action="store_true", help="Keep generated feature TSVs (default: removed).")
p.add_argument(
"--delete-features",
action="store_true",
help="Delete generated feature TSVs after prediction (default: keep)."
)
args = p.parse_args()

if args.samples_list:
Expand All @@ -30,7 +34,7 @@ def main():
out_dir=args.out_dir,
min_coverage=args.min_coverage,
max_repeat_bins=args.max_repeat_bins,
keep_features=args.keep_features
keep_features=not args.delete_features
)
print(f"Completed batch for {len(results)} samples.")
for r in results:
Expand All @@ -50,7 +54,7 @@ def main():
sample_id=args.sample_id,
min_coverage=args.min_coverage,
max_repeat_bins=args.max_repeat_bins,
keep_features=args.keep_features
keep_features=not args.delete_features
)
print(f"Completed sample {res['sample_id']}")
print(res["prediction_txt"])
Expand Down
Binary file modified src/.DS_Store
Binary file not shown.
26 changes: 15 additions & 11 deletions src/stride/feature_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ def count_alleles_above_thresh(freqs, threshold=5):

l1_dist = np.sum(np.abs(norm_tumor - norm_normal))
l2_dist = np.sqrt(np.sum((norm_tumor - norm_normal) ** 2))
wass_dist = wasserstein_distance(norm_tumor, norm_normal)
bins = np.arange(self.max_repeat_bins)
wass_dist = wasserstein_distance(bins, bins, u_weights=norm_tumor, v_weights=norm_normal)

allele_thresholds = [1, 5, 10, 15, 20, 25, 30]
allele_diffs = {
Expand All @@ -96,6 +97,9 @@ def count_alleles_above_thresh(freqs, threshold=5):
for thresh in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
}

def safe_mean(values):
return float(np.mean(values)) if values else np.nan

return {
"chrom": chrom,
"start": start,
Expand All @@ -110,16 +114,16 @@ def count_alleles_above_thresh(freqs, threshold=5):
"normal_norm_freqs": norm_normal.tolist(),
"tumor_total_coverage": tumor_total,
"normal_total_coverage": normal_total,
"tumor_mapq_mean": np.mean(tumor_mapq) if tumor_mapq else 0,
"normal_mapq_mean": np.mean(normal_mapq) if normal_mapq else 0,
"tumor_bq_mean": np.mean(tumor_bq) if tumor_bq else 0,
"normal_bq_mean": np.mean(normal_bq) if normal_bq else 0,
"tumor_insert_mean_all": np.mean(tumor_insert_all) if tumor_insert_all else 0,
"normal_insert_mean_all": np.mean(normal_insert_all) if normal_insert_all else 0,
"tumor_insert_mean_ref": np.mean(tumor_insert_ref) if tumor_insert_ref else 0,
"normal_insert_mean_ref": np.mean(normal_insert_ref) if normal_insert_ref else 0,
"tumor_insert_mean_alt": np.mean(tumor_insert_alt) if tumor_insert_alt else 0,
"normal_insert_mean_alt": np.mean(normal_insert_alt) if normal_insert_alt else 0,
"tumor_mapq_mean": safe_mean(tumor_mapq),
"normal_mapq_mean": safe_mean(normal_mapq),
"tumor_bq_mean": safe_mean(tumor_bq),
"normal_bq_mean": safe_mean(normal_bq),
"tumor_insert_mean_all": safe_mean(tumor_insert_all),
"normal_insert_mean_all": safe_mean(normal_insert_all),
"tumor_insert_mean_ref": safe_mean(tumor_insert_ref),
"normal_insert_mean_ref": safe_mean(normal_insert_ref),
"tumor_insert_mean_alt": safe_mean(tumor_insert_alt),
"normal_insert_mean_alt": safe_mean(normal_insert_alt),
"tumor_entropy": tumor_entropy_val,
"normal_entropy": normal_entropy_val,
"entropy_diff": tumor_entropy_val - normal_entropy_val,
Expand Down