diff --git a/.gitignore b/.gitignore index f7df387..eaaa4db 100644 --- a/.gitignore +++ b/.gitignore @@ -36,4 +36,10 @@ target *.out *.log *.tab -*.sam \ No newline at end of file +*.sam + +# Linux build dir used by the solo Docker benchmark/diff (CARGO_TARGET_DIR) +/target-linux/ + +# amd64 Linux build dir for the benchmark container +/target-amd64/ diff --git a/ROADMAP.md b/ROADMAP.md index ea27b10..d004df5 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -27,7 +27,7 @@ Phase 1 (CLI) ✅ └→ Phase 17.B (per-mate seeding) [planned] └→ Phase 17.1 (Log.final.out) ✅ └→ Phase 17.2+ (features + polish) - └→ Phase 14 (STARsolo) [DEFERRED] + └→ Phase 14 (STARsolo) 🚧 14.1 done ``` **Phase ordering rationale**: Threading (Phase 9) done first to establish parallel architecture. @@ -55,7 +55,7 @@ Paired-end (Phase 8) builds on threaded infrastructure. GTF/junctions (Phase 7) | [15](docs-old/phase15_sam_tags.md) | SAM Tags + PE Fix | ✅ | 235 | NH/HI/AS/NM/nM/XS/jM/jI/MD, PE fix | | [16](docs-old/phase16_algorithm.md) | Algorithm Parity | ✅* | 268 | SE: **8613/8926 (0 STAR-only, 99.815% tie-adj)**, 2.2% splice; PE: **8390/8390 exact**, **99.883% tie-adj PE faithfulness**, 0 MAPQ inflate/deflate, 0 NH diffs (Phase G2) | | [17](docs-old/phase17_features.md) | Features + Polish | ✅* | 396 | Log.final.out, GeneCounts, TranscriptomeSAM, SJDB insertion, --outSAMattrRGline, --runRNGseed, combined-read PE seeding (Phase E2), scoreSeedBest (17.A), sorted BAM (17.2), outReadsUnmapped (17.4), outStd (17.6), PE chimeric (17.3), WithinBAM (17.11), GTF tag params (17.7), outBAMcompression+limitBAMsortRAM (17.9), chimeric Tier 1b soft-clip re-seed (12.2), chimeric Tier 3 residual re-seed (17.10) | -| 14 | STARsolo | DEFERRED | — | Waiting for accuracy parity | +| [14](docs-old/phase14_starsolo.md) | STARsolo (single-cell) | 🚧 In progress | 475 | **MVP done (14.1–14.4)**: 10x Gene count matrix end-to-end (barcode plumbing, CB correction, gene assignment, UMI dedup, raw matrix.mtx) | *Partially 
complete — see linked docs for sub-phase status. @@ -308,6 +308,37 @@ See [docs-old/phase17_features.md](docs-old/phase17_features.md) for sub-phase t --- -## Phase 14: STARsolo (Single-Cell) — DEFERRED +## Phase 14: STARsolo (Single-Cell) — IN PROGRESS -Waiting for accuracy parity (position agreement >99%). +**Prerequisite met**: position agreement >99% (SE 99.815% tie-adj, PE 99.883%). Phase unblocked 2026-06-10. + +Single-cell quantification layered around the existing aligner: the cDNA read aligns through the normal SE path; a paired **barcode read** (R1 = cell barcode + UMI) is parsed, corrected against a whitelist, assigned to a gene, UMI-deduplicated, and emitted as a sparse per-cell count matrix. Target: faithful port of STARsolo (all features). See [docs-old/phase14_starsolo.md](docs-old/phase14_starsolo.md) for the full design and sub-phase tracking. + +| Sub-phase | Description | Status | +|-----------|-------------|--------| +| 14.1 | `--solo*` params + barcode-read input plumbing (`src/solo/`, CB/UMI extraction, SE dispatch) | ✅ Complete | +| 14.2 | Whitelist load + CB correction (`--soloCBmatchWLtype`) + UMI checks | ✅ Complete | +| 14.3 | Per-read gene assignment + CB/UMI threaded into the alignment loop | ✅ Complete | +| 14.4 | UMI dedup + raw `matrix.mtx` (**MVP complete**) | ✅ Complete | +| 14.CR | CellRanger 4/5-matching flags (`1MM_CR`, `MultiGeneUMI_CR`, `1MM_multi_Nbase_pseudocounts`, `CellRanger4` clip) | ✅ Complete | +| 14.5 | `Summary.csv` / `Barcodes.stats` / `Features.stats` | ⬜ Planned | +| 14.6 | Cell filtering (`--soloCellFilter`: CellRanger2.2, EmptyDrops_CR) | ⬜ Planned | +| 14.7 | `CB`/`UB`/`GX`/`GN` SAM tags + `CB_samTagOut` | ⬜ Planned | +| 14.8 | More features: GeneFull, SJ, Velocyto | ⬜ Planned | +| 14.9 | Multi-gene resolution (`--soloMultiMappers`) | ⬜ Planned | +| 14.10 | Other chemistries: CB_UMI_Complex, SmartSeq | ⬜ Planned | +| 14.11 | Differential test harness vs STARsolo + synthetic integration tests | ⬜ Planned | + 
+**Phase 14.1** (2026-06-10): `SoloType` enum + 12 `--solo*` params in `src/params/mod.rs`; new `src/solo/mod.rs` (`SoloBarcodeLayout` geometry, `CellBarcode` CB/UMI extraction, `SoloReadReader` lockstep cDNA+barcode FASTQ reader); solo validation (2 read files, GTF for Gene/GeneFull, CB/UMI length); `run_single_pass` + `run_pass1` dispatch routes solo runs to the SE cDNA path (file 0). 447 lib tests (+6 solo), 0 clippy warnings. + +**Phase 14.2** (2026-06-11): new `src/solo/whitelist.rs` — faithful port of STAR's `SoloReadBarcode_getCBandUMI.cpp` read stage. 2-bit barcode packing (`seq[0]` high bits, N-detection: 0/1/>1), sorted-array whitelist load (plain/gz), `match_cb` (exact → single-N → 1MM enumeration) honoring `--soloCBmatchWLtype` (Exact/1MM/1MM_multi/…); multi-match reads record all candidate WL indices + mismatch quality (`CbMatch::Multi`) for the Phase 14.4 posterior; exact-match count table accumulated as the posterior prior; UMI checks (N → reject, homopolymer → reject); `CbMatchStats` with STAR's cbMatch categories. Params: `--soloCBmatchWLtype` validation, `solo_cb_match_type()` / `solo_cb_whitelist_path()` helpers, None-whitelist-requires-Exact rule, CBlen≤32 guard. 460 lib tests (+13 solo), 0 clippy warnings. + +**Phase 14.3** (2026-06-11): per-read gene assignment + barcode threading into the alignment loop. New `src/solo/gene.rs` — `SoloStrand` (`--soloStrand`), `assign_gene_se` (union of strand-filtered `overlapping_genes` across all loci → `Gene`/`NoFeature`/`Ambiguous`/`Unmapped`; multi-locus-same-gene stays unique). `src/solo/mod.rs` gains `SoloContext` (whitelist + gene model + stats + recorder, `build()` from params), `SoloRecorder` (thread-safe `SoloCountRecord` / deferred `SoloMultiRecord`), and `process_read` (CB match → UMI check → gene assign → record). 
New `align_reads_solo` loop in `lib.rs` reads cDNA + barcode in lockstep (`SoloReadReader`), aligns the cDNA, writes SAM/BAM, and collects per-cell records; `run_single_pass`/`run_two_pass` thread `solo_ctx`. 467 lib + 10 integration tests, 0 clippy warnings. + +**Phase 14.CR — CellRanger 4.x/5.x matching** (2026-06-12): implemented the STARsolo.md CellRanger-matching flag set faithfully from STAR source. `--soloUMIdedup 1MM_CR` (`umiArrayCorrect_CR`: each UMI corrected to its highest-count 1MM neighbor, non-transitive, count = distinct corrected). `--soloUMIfiltering MultiGeneUMI_CR` (keep the top-read-count gene of a multi-gene UMI) + `MultiGeneUMI`; `build_matrix` restructured to per-cell `umi → gene → readcount`. `--soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts` adds a +1 pseudocount to the CB posterior prior. `--clipAdapterType CellRanger4` (TSO 5' clip + polyA 3' trim, conservative no-op on adapter-free reads). All validated in params. Differential harness `test/solo_cellranger_diff.py` runs the full CellRanger flag set on both rustar-aligner and real STAR and compares decoded `{(barcode, gene_id): count}` matrices; committed cargo test `test_starsolo_cellranger_style_matrix` asserts the matrix (incl. 1MM_CR collapse) always. + +**Three-way benchmark** (see [docs-old/phase14_benchmark.md](docs-old/phase14_benchmark.md)): CellRanger 10.0.0 vs STARsolo 2.7.10b vs rustar-aligner on 10M reads of a real 5′ mouse 10x dataset (GRCm39-2024-A), all x86_64 in Docker. rustar produces a correct matrix (4.22M UMIs, exonic Gene, ~4% above STARsolo's 4.07M; CellRanger's 4.84M includes introns). After a buffered-I/O fix (raw-matrix write 1306s → 3s; barcodes.tsv was unbuffered), rustar's count is 670s vs STARsolo 152s / CellRanger 356s; index build 2801s (faster than STAR's 3626s under emulation). Peak RSS 37GB (index-dominated). `build_matrix` Step 1 (per-cell processing) bounds matrix-build memory. 
+ +**Live verification — PASS:** rustar-aligner's `Gene/raw` matrix is **byte-identical to real STARsolo's** for the CellRanger-style run, confirmed deterministically (3/3 runs). The reference STAR (2.7.10b) and a Linux build of rustar-aligner run in a consistent Linux container (`test/Dockerfile.solodiff` + `test/solo_diff_docker.sh`, via colima — no Docker Desktop). This was necessary because STAR 2.7.11b reads 0 input reads on Apple-Silicon macOS (a known STAR/macOS bug, `nextChar=-1`). 479 lib + 11 integration tests, 0 clippy warnings. + +**Phase 14.4 — MVP COMPLETE** (2026-06-11): UMI deduplication + raw count-matrix output. New `src/solo/count.rs`: `UmiDedup` (`--soloUMIdedup`: Exact / NoDedup / 1MM_All [default, connected-components within Hamming-1] / 1MM_Directional / 1MM_Directional_UMItools, `dirCountAdd` 0/−1); deferred 1MM_multi CB resolution via STAR's count+quality posterior (weight = `exactCount·10^(−q/10)`, prior from `whitelist.exact_count_snapshot()`); `build_matrix` groups reads by (cell,gene), collapses UMIs, and `write_gene_matrix` writes `Solo.out/Gene/raw/{matrix.mtx, barcodes.tsv, features.tsv}` (MatrixMarket `nFeatures nBarcodes nEntries`, entries `gene+1 cell+1 count`, 1-based; CellRanger-v3 3-column features.tsv; whitelist-sorted barcodes.tsv). Wired into `align_reads` post-alignment. `--soloUMIdedup` validation in params. End-to-end test (`test_starsolo_gene_matrix`): 8 reads, one cell, two Hamming-distant UMI clouds → 2 deduped molecules → matrix `1 1 2`. **A working 10x Chromium Gene count matrix.** 475 lib + 10 integration tests, 0 clippy warnings. 
diff --git a/docs-old/phase14_benchmark.md b/docs-old/phase14_benchmark.md new file mode 100644 index 0000000..cdaf3ee --- /dev/null +++ b/docs-old/phase14_benchmark.md @@ -0,0 +1,88 @@ +[← Back to ROADMAP](../ROADMAP.md) · [Phase 14](phase14_starsolo.md) + +# Phase 14 Benchmark: CellRanger vs STARsolo vs rustar-aligner + +Runtime + output-stats comparison of the three single-cell quantifiers on a real +10x mouse dataset, run in one consistent Linux/x86_64 environment. + +## Setup + +- **Reference**: CellRanger mouse `refdata-gex-GRCm39-2024-A` (genome 2.79 Gb, 61 + contigs, 33,696 genes). STAR + rustar build their indexes from the refdata + `fasta/genome.fa` + `genes/genes.gtf` (`--sjdbOverhang 89`); CellRanger uses + the refdata directly. +- **Data**: 5k Mouse PBMCs, **5′ GEM-X** (SC5P-R2-v3); first **10,000,000 read + pairs** of the GEX library — identical reads for all three tools. +- **Solo params** (CellRanger-matching, 5′): `--soloType CB_UMI_Simple`, + CB 16 / UMI 12, `--soloStrand Reverse`, whitelist `3M-5pgex-jan-2023`, + `--soloFeatures Gene`, `--soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts`, + `--soloUMIfiltering MultiGeneUMI_CR`, `--soloUMIdedup 1MM_CR`. +- **Environment**: Docker (colima) on Apple-Silicon macOS, **everything x86_64 + via Rosetta** (CellRanger is x86_64-only), 14 cores / 40 GB. All absolute + times are inflated ~2–3× by emulation; the *relative* picture holds. +- **Tooling**: CellRanger 10.0.0, STAR 2.7.10b, rustar-aligner (this branch). + Driver: [`test/solo_bench.py`](../test/solo_bench.py) (each step under + `/usr/bin/time -v`), image [`test/Dockerfile.bench`](../test/Dockerfile.bench). 
+ +## Results + +| Tool | Index build | Count (align+quant) | Peak RSS | Raw barcodes | Genes | Total UMIs | +|------|------------:|--------------------:|---------:|-------------:|------:|-----------:| +| **CellRanger 10.0.0** | (prebuilt) | 356 s | 12.5 GB | 161,465 | 17,258 | 4,843,682 | +| **STARsolo 2.7.10b** | 3,626 s | 152 s | 30 GB | 143,490 | 15,675 | 4,067,946 | +| **rustar-aligner** | 2,801 s | **670 s** | 37 GB | 156,258 | 16,278 | 4,219,582 | + +CellRanger reported: 3,858 cells, 599 median genes/cell, 88.5 % valid barcodes, +58.5 % reads mapped to transcriptome. + +### Correctness + +On identical reads, rustar's raw matrix is in line with the references: +**4,219,582 UMIs** (exonic `Gene`), ~4 % above STARsolo's 4,067,946 (also exonic +`Gene`). CellRanger's 4,843,682 is higher because it counts **intronic** reads by +default (`include-introns`), whereas `--soloFeatures Gene` is exonic-only. +rustar's read-stage barcode match rate was **86 % exact** on this real data. + +### The buffered-I/O fix + +The first rustar count run took 1,774 s. A breakdown showed the raw-matrix write +dominated: + +``` + before after +matrix write: 1,306 s → 3 s (~435×; byte-identical output) +align (10M): 402 s → 627 s (unchanged logic; emulation variance) +count total: 1,774 s → 670 s +``` + +Cause: `write_barcodes` / `write_matrix_mtx` wrote to a raw `std::fs::File` +(unbuffered) — one `write(2)` syscall per line, so `barcodes.tsv` (the full +3,686,400-barcode whitelist) cost ~3.7M syscalls, amplified by Rosetta+virtiofs. +Fix: wrap the writers in `BufWriter` + a no-alloc barcode unpack +(`unpack_barcode_into`). The write dropped to ~3 s. + +## Notes & limitations + +- **Index build**: rustar (2,801 s) was *faster* than STARsolo (3,626 s) under + emulation; CellRanger ships a prebuilt index (its 356 s "count" includes the + internal STAR alignment + cell calling + full metrics). 
+- **Memory**: rustar's 37 GB peak is dominated by the **loaded index (~27 GB: + 5.4 B-entry SA for the 2.79 Gb genome)** plus the alignment working set — *not* + the matrix build (Step 1 per-cell `build_matrix` already bounds that). Reducing + the peak further is about the SA representation and alignment buffers, not the + matrix. +- **Read count**: 10M (of ~200M total) keeps the run tractable and memory under + the 40 GB cap. Stats scale with depth (CellRanger called 3,858 cells at this + subsample vs the dataset's ~4,725). + +## Reproduce + +```bash +brew install colima docker && colima start --cpu 14 --memory 40 --vm-type vz --vz-rosetta +# build the amd64 image (colima can't build amd64 directly; run+commit a base): +docker run --platform linux/amd64 --name b rust:1-bookworm \ + bash -c "apt-get update -qq && apt-get install -y -qq rna-star python3 procps time" +docker commit b rustar-bench-amd64 && docker rm -f b +# then run test/solo_bench.py inside it with the ref/whitelist/fastqs mounted +# (see test/solo_bench.py header for the full argument list). +``` diff --git a/docs-old/phase14_starsolo.md b/docs-old/phase14_starsolo.md new file mode 100644 index 0000000..230190b --- /dev/null +++ b/docs-old/phase14_starsolo.md @@ -0,0 +1,324 @@ +[← Back to ROADMAP](../ROADMAP.md) + +# Phase 14: STARsolo (Single-Cell) + +**Status**: In progress — **MVP complete (14.1–14.4)** + +**Goal**: A faithful port of STARsolo — turn the aligner into a single-cell RNA-seq +quantifier that matches STAR's `--soloType` output (count matrices, barcode/UMI +correction, cell calling, SAM tags) as closely as the bulk aligner already +matches STAR. + +**Prerequisite (met)**: position agreement >99% — SE 99.815% (tie-adjusted), +PE 99.883%. Phase unblocked 2026-06-10. + +--- + +## Architecture + +STARsolo is a **layer around** the existing aligner, not a change to it. 
The core +alignment is untouched: + +``` + readFilesIn[0] = cDNA read ──► existing SE alignment ──► Transcript(s) + readFilesIn[1] = barcode read (R1: CB+UMI) ──► parse ──► correct vs whitelist + │ + Transcript + corrected CB + UMI ──► gene assignment (overlapping_genes) + │ + collate per (CB, gene) ──► UMI dedup ──► count + │ + Solo.out/{feature}/raw/matrix.mtx +``` + +Key reuse points already in the codebase: +- `Transcript` (`src/align/transcript.rs`) carries `chr_idx`, `genome_start/end`, + `is_reverse`, `exons` — everything gene assignment needs. +- `GeneAnnotation::overlapping_genes()` (`src/quant/mod.rs`) maps an alignment to + gene indices and is directly reusable for per-cell counting. +- The SE parallel batch loop (`align_reads_single_end` in `src/lib.rs`) is where + per-read barcode info threads through to a per-cell accumulator. + +**Read-file convention** (matches STAR): `--readFilesIn cDNA_read barcode_read`. +The cDNA read is file 0, the barcode read is file 1. A solo run therefore supplies +two files but is a *single-end alignment* run. 
+--- + +## Sub-phase plan + +| Sub-phase | Description | Status | +|-----------|-------------|--------| +| 14.1 | `--solo*` params + barcode-read input plumbing | ✅ Complete | +| 14.2 | Whitelist load + CB correction (`--soloCBmatchWLtype`) + UMI checks | ✅ Complete | +| 14.3 | Per-read gene assignment + CB/UMI threaded into the alignment loop | ✅ Complete | +| 14.4 | UMI dedup + raw `matrix.mtx` (**MVP complete**) | ✅ Complete | +| 14.5 | `Summary.csv` / `Barcodes.stats` / `Features.stats` | ⬜ Planned | +| 14.6 | Cell filtering (`filtered/` matrix) | ⬜ Planned | +| 14.7 | `CB`/`UB`/`GX`/`GN` SAM tags + `CB_samTagOut` | ⬜ Planned | +| 14.8 | More features: GeneFull, SJ, Velocyto | ⬜ Planned | +| 14.9 | Multi-gene resolution (`--soloMultiMappers`) | ⬜ Planned | +| 14.10 | Other chemistries: CB_UMI_Complex, SmartSeq | ⬜ Planned | +| 14.11 | Differential test harness vs STARsolo + integration tests | ⬜ Planned | + +**MVP = 14.1–14.4**: a working 10x Chromium `Gene` count matrix. + +### Faithfulness risk notes +- **Read ordering**: cDNA read is FIRST in `--readFilesIn`, barcode read second. +- **CB correction** posterior math and the **`1MM_Directional`** UMI-graph collapse + are the two algorithms where byte-parity with STAR is fiddly — budget extra + differential-testing time there (14.2, 14.4). +- **Matrix conventions**: MatrixMarket coordinate format, features × barcodes, + 1-based indices — must match Cell Ranger / STARsolo layout exactly. + +--- + +## Phase 14.1: Params + barcode-read plumbing ✅ (2026-06-10) + +**Goal**: Accept `--soloType` and the barcode geometry on the CLI, read the barcode +read alongside the cDNA read, and extract CB+UMI — without yet counting. + +**Implementation**: + +1. **`src/params/mod.rs`** — `SoloType` enum (`None`, `CbUmiSimple` [alias + `Droplet`], `CbUmiComplex`, `CbSamTagOut`, `SmartSeq`) with `FromStr`/`Display`. 
+ 12 new parameters: + - `--soloType`, `--soloCBwhitelist`, `--soloCBstart` (1), `--soloCBlen` (16), + `--soloUMIstart` (17), `--soloUMIlen` (10), `--soloFeatures` (`Gene`), + `--soloUMIdedup` (`1MM_All`), `--soloCBmatchWLtype` (`1MM_multi`), + `--soloCellFilter`, `--soloOutFileNames`, `--soloStrand` (`Forward`). + - Helpers: `solo_enabled()`, `cdna_read_file()`, `barcode_read_file()`, + `solo_cb_whitelist_none()`. + - Validation: solo needs exactly 2 read files; `Gene`/`GeneFull` need a GTF; + CB/UMI length > 0 for `CB_UMI_Simple`. + +2. **`src/solo/mod.rs`** (new) — + - `SoloBarcodeLayout` — fixed-position geometry, 1-based starts converted to + 0-based; `from_params`, `min_read_len`, `extract`. + - `CellBarcode` — encoded CB/UMI seq + raw Phred qualities; `cb_has_n`, + `umi_has_n`, `cb_string`, `umi_string`. + - `SoloReadReader` / `SoloRead` — lockstep reader over the cDNA and barcode + FASTQ files; `read_batch`; errors on length mismatch. `open_reader(params)` + factory. + +3. **`src/lib.rs`** — `mod solo;`; `run_single_pass` + `run_pass1` compute + `n_align_files = if solo { 1 } else { read_files_in.len() }` so a 2-file solo + run routes to the SE cDNA path; `is_paired` excludes solo. + +**Boundary**: 14.1 makes a solo run *parse and validate* and aligns the cDNA read +(producing `Aligned.out.sam`). Barcodes are extracted by `SoloReadReader` but not +yet threaded into the parallel alignment loop or counted — whitelist matching +lands in 14.2, and the loop threading follows in 14.3 alongside gene assignment. + +**Tests**: 6 new in `src/solo/mod.rs` (layout conversion, v2 extraction, too-short +read, N-detection, reader pairing, length-mismatch error) + CLI validation smoke +tests. 447 lib tests, 0 clippy warnings. 
+ +**Files**: `src/params/mod.rs`, `src/solo/mod.rs` (new), `src/lib.rs` + +--- + +## Phase 14.2: Whitelist load + CB correction ✅ (2026-06-11) + +**Goal**: Load the cell-barcode whitelist and match each read's CB to it exactly +as STAR's read stage does, plus validate the UMI. + +**Reference**: STAR `source/SoloReadBarcode_getCBandUMI.cpp` (read stage). The +multi-match *posterior* resolution lives in the collation stage, not here — see +the boundary note below. + +**Implementation** (`src/solo/whitelist.rs`, new): + +- **Packing** — `pack_barcode` 2-bit packs an encoded barcode into a `u64` with + `seq[0]` in the high bits (matching `convertNuclStrToInt64`). N-handling: + `NoN(u64)` / `OneN{packed,pos}` / `ManyN`. `unpack_barcode` reverses it. +- **`CbMatchType`** — decodes `--soloCBmatchWLtype` into STAR's `mm1` / + `mm1_multi` / `mm1_multi_nbase` / `pseudocounts` flags (Exact, 1MM, 1MM_multi + [default], `_pseudocounts`, `_Nbase_pseudocounts`). +- **`CbWhitelist`** — `List` (sorted, unique `Vec` of packed `u64` barcodes + original-order + index for `barcodes.tsv` + per-index `exact_counts` atomics) or `NoWhitelist`. + `load()` reads plain or gzip, validates equal lengths, rejects N-containing + whitelist entries. +- **`match_cb`** follows STAR exactly: exact binary search (→ `Exact`, bumps the + exact-count prior); else single-N substitution (all 4 bases at the N position) + or 1MM enumeration (every position × 3 alternate bases). One candidate → + `Corrected`; >1 → `Multi(candidates)` when the multi flag is set (records WL + index + mismatch position + quality for later resolution) else + `MultMatchRejected`. Rejections map to STAR's cbMatch codes (`NoMatch` -1, + `NinCb` -2, `MultMatchRejected` -3). +- **`check_umi`** — any N → `NinUmi` (-23); exact homopolymer → `Homopolymer` + (-24); else `Ok(packed)`. +- **`CbMatchStats`** — atomic counters for STAR's cbMatch categories. 
+ +**Params** (`src/params/mod.rs`): `--soloCBmatchWLtype` validity check; +`solo_cb_match_type()` and `solo_cb_whitelist_path()` helpers; rules that +`--soloCBwhitelist None` requires `Exact`, and `--soloCBlen ≤ 32`. + +**Boundary**: the count + quality **posterior** that resolves `CbMatch::Multi` +into one corrected barcode needs the *global* `exact_counts` table, which is only +complete after all reads are processed — so it is a collation-stage operation +deferred to Phase 14.4. Phase 14.2 records the candidates (exactly as STAR's +`cbMatchString`) and accumulates the prior. The matcher is also not yet wired +into the alignment loop; that happens in 14.3 alongside gene assignment. + +**Tests**: 13 new in `src/solo/whitelist.rs` (pack roundtrip, N-detection, exact +match + count, 1MM correction, ambiguous multi vs reject, no-match, single-N +correction, many-N reject, Exact-only mode, UMI checks, length-mismatch error, +gzip load, match-type parsing) + CLI validation smoke tests. 460 lib tests, +0 clippy warnings. + +**Files**: `src/solo/whitelist.rs` (new), `src/solo/mod.rs`, `src/params/mod.rs` + +--- + +## Phase 14.3: Gene assignment + barcode threading ✅ (2026-06-11) + +**Goal**: Assign each cDNA alignment to a gene and wire CB/UMI through the +alignment loop so per-cell (CB, UMI, gene) records are collected. + +**Gene assignment** (`src/solo/gene.rs`, new): +- `SoloStrand` (`--soloStrand`: Forward [default] / Reverse / Unstranded). +- `assign_gene_se(transcripts, gene_ann, strand)` — the read's gene set is the + UNION of strand-filtered `GeneAnnotation::overlapping_genes` across ALL its + alignments. Exactly one gene → `Gene(idx)`; zero → `NoFeature`; >1 → + `Ambiguous`; no transcripts → `Unmapped`. A multi-locus read whose loci all + fall in one gene is therefore still gene-unique (matching STARsolo's default + `--soloMultiMappers Unique`, unlike `quantMode GeneCounts` which drops every + multimapper). 
+ +**Context + recorder** (`src/solo/mod.rs`): +- `SoloContext` — `build(params, genome)` loads the whitelist and builds the + gene model from `--sjdbGTFfile`; bundles layout + whitelist + match type + + strand + `CbMatchStats` + `SoloRecorder`, shared as an `Arc` across threads. +- `SoloRecorder` — thread-safe sink for `SoloCountRecord{cb, umi, gene}` plus + deferred `SoloMultiRecord` (unresolved 1MM_multi CBs, resolved in 14.4). +- `SoloContext::process_read` — CB match → UMI check → gene assign, recording + stats and producing a record only when all three succeed. + +**Loop** (`src/lib.rs`): new `align_reads_solo` reads cDNA (file 0) + barcode +(file 1) in lockstep via `SoloReadReader`, aligns the cDNA exactly like the SE +path (`align_read` → `build_alignment_records`), writes SAM/BAM, runs +`process_read` per read, and appends records to the recorder in the sequential +write phase. `run_single_pass` dispatches solo runs here; `run_single_pass` / +`run_two_pass` thread `solo_ctx`. A run-end summary logs the barcode-match stats +and record count. + +**Boundary / limitations**: the solo loop is single-pass and does not yet emit +BySJout / chimeric / transcriptome-SAM side outputs (not part of the MVP). The +count matrix (`raw/matrix.mtx` + `barcodes.tsv` + `features.tsv`) and 1MM_multi +posterior resolution are Phase 14.4. `--soloStrand` validated in params. + +**Tests**: 7 new gene-assignment unit tests + end-to-end +`test_starsolo_gene_assignment` (synthetic genome + GTF + whitelist: 16 cDNA +reads → 16 exact CB matches → 16 resolved (CB,UMI,gene) records). 467 lib + 10 +integration tests, 0 clippy warnings. + +**Files**: `src/solo/gene.rs` (new), `src/solo/mod.rs`, `src/params/mod.rs`, +`src/lib.rs`, `tests/alignment_features.rs` + +--- + +## Phase 14.4: UMI dedup + raw matrix — MVP COMPLETE ✅ (2026-06-11) + +**Goal**: Collapse UMIs and write the raw per-cell count matrix — the first +usable single-cell output. 
+ +**Reference**: STAR `SoloFeature_collapseUMIall.cpp` (dedup), +`SoloReadFeature_inputRecords.cpp` (CB multi-resolution), +`SoloFeature_outputResults.cpp` (matrix format). + +**Implementation** (`src/solo/count.rs`, new): + +- **`UmiDedup`** (`--soloUMIdedup`): `Exact` (distinct UMIs), `NoDedup` (reads), + `1MM_All` (default — connected components where any two UMIs within Hamming-1 + merge transitively, via union-find), `1MM_Directional` / `_UMItools` + (`count_hub ≥ 2·count_leaf + dirCountAdd`, `dirCountAdd` 0 / −1). +- **Deferred 1MM_multi CB resolution** — `resolve_multi_cb` picks the candidate + maximizing STAR's posterior weight `exactCount[cand] · 10^(−q/10)` (prior = + `whitelist.exact_count_snapshot()`, `q` = mismatch-position Phred); rejects + when no candidate has positive weight. +- **`build_matrix`** groups reads by `(cell, gene)` into UMI→multiplicity maps + (resolved multi-CB records folded in), then dedups each. +- **`write_gene_matrix`** writes `Solo.out/Gene/raw/`: + - `matrix.mtx` — `%%MatrixMarket matrix coordinate integer general`; dims + `nFeatures nBarcodes nEntries`; entries `gene+1 cell+1 count` (1-based), + iterated in cell-column order. + - `features.tsv` — `gene_id gene_id Gene Expression` (CellRanger + v3; no gene names available so id is repeated). + - `barcodes.tsv` — full whitelist in sorted order (matrix column order). + +Wired into `align_reads` after alignment. `--soloUMIdedup` validated in params. + +**Known approximations to revisit** (differential testing, 14.11): the +`1MM_Directional` absorption is a greedy hub model (faithful default path is +`1MM_All`, which is exact); the CB-posterior acceptance uses no `cbMinP` +threshold (always takes the argmax); `barcodes.tsv` uses sorted (not 10x-file) +order; `--soloCBwhitelist None` matrix output is not yet supported. + +**Tests**: 8 new unit tests in `count.rs` (each dedup method incl. 
transitive +chains and the directional thresholds; multi-CB posterior) + end-to-end +`test_starsolo_gene_matrix` (8 reads, one cell, two Hamming-distant UMI clouds → +2 deduped molecules → matrix `1 1 2`, validated `features.tsv` / `barcodes.tsv`). +475 lib + 10 integration tests, 0 clippy warnings. + +**Files**: `src/solo/count.rs` (new), `src/solo/mod.rs`, `src/params/mod.rs`, +`src/lib.rs`, `tests/alignment_features.rs` + +--- + +## Phase 14.CR: CellRanger 4.x/5.x matching — VERIFIED vs real STARsolo ✅ (2026-06-12) + +**Goal**: Support the [STARsolo CellRanger-matching flag set](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#matching-cellranger-4xx-and-5xx-results) +and prove the output matches real STARsolo. + +**Flags** (`--clipAdapterType CellRanger4 --outFilterScoreMin 30 +--soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR +--soloUMIdedup 1MM_CR`), implemented from STAR source: + +- **`1MM_CR`** (`src/solo/count.rs::cellranger_1mm`) — port of STAR + `umiArrayCorrect_CR`: UMIs sorted ascending by `(count, umi)`, each corrected + to its highest-count 1MM neighbor, **non-transitive** (points to the neighbor's + raw UMI), count = distinct corrected UMIs. +- **`MultiGeneUMI_CR`** (`filter_multi_gene_umi`) — keep the top-read-count gene + of a multi-gene UMI. `build_matrix` restructured to per-cell + `umi → gene → read_count` so filtering precedes dedup. +- **`1MM_multi_Nbase_pseudocounts`** — +1 pseudocount on the CB posterior prior + (`resolve_multi_cb`). +- **`CellRanger4` clip** (`src/solo/mod.rs::clip_adapter_cr4`) — TSO 5' clip + + polyA 3' trim, conservative (no-op on adapter-free reads), applied in + `align_reads_solo` before fixed Nbases clipping. + +All four validated in `params.rs`. 
+ +**Differential test** (`test/solo_cellranger_diff.py`): generates a synthetic 10x +dataset (two 2-exon genes, whitelist, cDNA + barcode reads with a planted 1MM +UMI pair), runs the full CellRanger flag set on BOTH rustar-aligner and real +STAR, and compares the decoded `{(barcode, gene_id): count}` matrices. + +**Result — byte-identical match, 3/3 deterministic:** +``` +(AAAACCCCGGGGTTTT, GENEA) = 2 # 1MM_CR collapsed M(x5)+M-1mm(x1) -> 1, +N(x3) -> 2 +(AAAACCCCGGGGTTTT, GENEB) = 1 +(ACACACACGTGTGTGT, GENEA) = 1 +``` + +**Why a container**: STAR 2.7.11b reads 0 input reads on Apple-Silicon macOS (a +known STAR/macOS bug — `nextChar=-1` immediate EOF — present in both the homebrew +bottle and a from-source build). The reference therefore runs in a Linux +container (`test/Dockerfile.solodiff` — Debian + `rna-star` 2.7.10b + Rust), +driven by `test/solo_diff_docker.sh` via colima (no Docker Desktop needed). On a +host with a working STAR, `python3 test/solo_cellranger_diff.py` runs it directly. + +A committed cargo test (`test_starsolo_cellranger_style_matrix`) asserts the same +CellRanger-style matrix (including the 1MM_CR collapse) without needing STAR, and +each CellRanger algorithm has unit tests in `src/solo/count.rs`. + +--- + +## MVP status + +Phases 14.1–14.4 deliver a working **10x Chromium `Gene`** quantifier: +`--soloType CB_UMI_Simple --soloCBwhitelist whitelist.txt --soloFeatures Gene +--sjdbGTFfile genes.gtf --readFilesIn cDNA.fq barcode.fq` aligns the cDNA reads and +writes `Solo.out/Gene/raw/{matrix.mtx, barcodes.tsv, features.tsv}`. Remaining +phases (14.5–14.11) add stats files, cell filtering, SAM tags, more features, +multi-gene resolution, other chemistries, and the differential-test harness. 
diff --git a/src/align/read_align.rs b/src/align/read_align.rs index 89ea66b..077d686 100644 --- a/src/align/read_align.rs +++ b/src/align/read_align.rs @@ -1505,7 +1505,7 @@ mod tests { } let genome = Genome { - sequence, + sequence: sequence.into(), n_genome, n_genome_real: n_genome, n_chr_real: 1, diff --git a/src/align/score.rs b/src/align/score.rs index 7790fa9..83e5842 100644 --- a/src/align/score.rs +++ b/src/align/score.rs @@ -634,7 +634,7 @@ mod tests { } Genome { - sequence, + sequence: sequence.into(), n_genome, n_genome_real: n_genome, n_chr_real: 1, diff --git a/src/align/seed.rs b/src/align/seed.rs index 7480756..31366ff 100644 --- a/src/align/seed.rs +++ b/src/align/seed.rs @@ -471,7 +471,7 @@ fn compare_seq_to_genome( return (match_len, true); } - let genome_base = index.genome.sequence[genome_idx]; + let genome_base = index.genome.sequence.base(genome_idx); if genome_base >= 5 { // Padding character — STAR returns comp_res > 0 (read > genome) diff --git a/src/align/stitch.rs b/src/align/stitch.rs index c8ef6cf..b5fe2eb 100644 --- a/src/align/stitch.rs +++ b/src/align/stitch.rs @@ -3019,7 +3019,7 @@ mod tests { } let genome = Genome { - sequence, + sequence: sequence.into(), n_genome, n_genome_real: n_genome, n_chr_real: 1, @@ -3141,7 +3141,7 @@ mod tests { } let genome = Genome { - sequence, + sequence: sequence.into(), n_genome, n_genome_real: n_genome, n_chr_real: 1, diff --git a/src/bin/emptydrops.rs b/src/bin/emptydrops.rs new file mode 100644 index 0000000..bbe1ef8 --- /dev/null +++ b/src/bin/emptydrops.rs @@ -0,0 +1,374 @@ +//! Standalone EmptyDrops_CR cell caller (Rust port of STAR +//! `SoloFeature_emptyDrops_CR.cpp` / CellRanger's EmptyDrops variant). +//! +//! Reads a raw count matrix (MatrixMarket `matrix.mtx` [.gz] genes×cells + +//! `barcodes.tsv`/`features.tsv`) and writes the called cells: +//! - guaranteed cells from the CellRanger-2.2 knee, plus +//! - extra cells whose expression profile is significantly different from the +//! 
ambient RNA profile (multinomial Monte-Carlo test, Benjamini-Hochberg). +//! +//! Output: `/barcodes.tsv` (called cells) + `/cells.txt` (one called +//! barcode per line) and a `/emptydrops.json` summary. +//! +//! Usage: +//! emptydrops --raw --out [--seed N] [--fdr 0.01] [--sim-n 10000] +//! +//! Defaults mirror STAR `--soloCellFilter EmptyDrops_CR 3000 0.99 10 45000 90000 500 0.01 20000`. + +use std::fs::File; +use std::io::{BufRead, BufReader, BufWriter, Read, Write}; +use std::path::{Path, PathBuf}; + +use flate2::read::GzDecoder; +use rand::SeedableRng; +use rand::distr::{Distribution, weighted::WeightedIndex}; +use rand::rngs::StdRng; + +struct Args { + raw: PathBuf, + out: PathBuf, + seed: u64, + fdr: f64, + sim_n: usize, + n_expected: usize, + max_percentile: f64, + max_min_ratio: f64, + ind_min: usize, + ind_max: usize, + umi_min: u64, + umi_min_frac_median: f64, + cand_max_n: usize, +} + +fn parse_args() -> Args { + let mut a = Args { + raw: PathBuf::new(), + out: PathBuf::new(), + seed: 19_760_110, + fdr: 0.01, + sim_n: 10_000, + n_expected: 3000, + max_percentile: 0.99, + max_min_ratio: 10.0, + ind_min: 45_000, + ind_max: 90_000, + umi_min: 500, + umi_min_frac_median: 0.01, + cand_max_n: 20_000, + }; + let mut it = std::env::args().skip(1); + while let Some(k) = it.next() { + let mut v = || it.next().expect("missing value"); + match k.as_str() { + "--raw" => a.raw = PathBuf::from(v()), + "--out" => a.out = PathBuf::from(v()), + "--seed" => a.seed = v().parse().unwrap(), + "--fdr" => a.fdr = v().parse().unwrap(), + "--sim-n" => a.sim_n = v().parse().unwrap(), + "--n-expected" => a.n_expected = v().parse().unwrap(), + "--cand-max-n" => a.cand_max_n = v().parse().unwrap(), + "--ind-min" => a.ind_min = v().parse().unwrap(), + "--ind-max" => a.ind_max = v().parse().unwrap(), + "--umi-min" => a.umi_min = v().parse().unwrap(), + other => panic!("unknown arg {other}"), + } + } + assert!(!a.raw.as_os_str().is_empty(), "--raw required"); + 
assert!(!a.out.as_os_str().is_empty(), "--out required"); + a +} + +fn find(d: &Path, base: &str) -> PathBuf { + for c in [base.to_string(), format!("{base}.gz")] { + let p = d.join(&c); + if p.exists() { + return p; + } + } + panic!("{base}[.gz] not found in {}", d.display()); +} + +fn reader(p: &Path) -> Box { + let f = File::open(p).unwrap(); + if p.extension().is_some_and(|e| e == "gz") { + Box::new(BufReader::new(GzDecoder::new(f))) + } else { + Box::new(BufReader::new(f)) + } +} + +fn read_lines_first_col(p: &Path) -> Vec { + reader(p) + .lines() + .map(|l| l.unwrap().split('\t').next().unwrap().trim().to_string()) + .collect() +} + +/// Per-cell sparse profile: (gene_idx, count). Plus per-cell total. +struct Matrix { + n_genes: usize, + barcodes: Vec, + cell_profiles: Vec>, + totals: Vec, +} + +fn load_matrix(raw: &Path) -> Matrix { + let barcodes = read_lines_first_col(&find(raw, "barcodes.tsv")); + let genes = read_lines_first_col(&find(raw, "features.tsv")); + let n_genes = genes.len(); + let n_cells = barcodes.len(); + + // MatrixMarket: skip % header, then "nGenes nCells nnz", then "gene cell count". 
+ let mut rd = reader(&find(raw, "matrix.mtx")); + let mut buf = String::new(); + // header + loop { + buf.clear(); + rd.read_line(&mut buf).unwrap(); + if !buf.starts_with('%') { + break; + } + } + let mut cell_profiles: Vec> = vec![Vec::new(); n_cells]; + let mut totals = vec![0u64; n_cells]; + let mut line = String::new(); + let mut content = String::new(); + rd.read_to_string(&mut content).unwrap(); + for l in content.lines() { + line.clear(); + let mut p = l.split_whitespace(); + let g: usize = match p.next() { + Some(x) => x.parse().unwrap(), + None => continue, + }; + let c: usize = p.next().unwrap().parse().unwrap(); + let v: u64 = p.next().unwrap().parse::().unwrap() as u64; + if v == 0 { + continue; + } + let gi = g - 1; + let ci = c - 1; + cell_profiles[ci].push((gi as u32, v as u32)); + totals[ci] += v; + } + Matrix { + n_genes, + barcodes, + cell_profiles, + totals, + } +} + +/// CellRanger-2.2 knee: number of guaranteed cells (top barcodes by total). +fn knee_n_cells(sorted_desc: &[u64], n_expected: usize, max_pct: f64, max_min_ratio: f64) -> usize { + if sorted_desc.is_empty() { + return 0; + } + let idx = ((n_expected as f64 * (1.0 - max_pct)).round() as usize).min(sorted_desc.len() - 1); + let robust_max = sorted_desc[idx] as f64; + let thr = robust_max / max_min_ratio; + sorted_desc.iter().take_while(|&&c| c as f64 >= thr).count() +} + +fn main() { + let a = parse_args(); + eprintln!("emptydrops: loading {}", a.raw.display()); + let m = load_matrix(&a.raw); + let n_cells = m.totals.len(); + + // Rank barcodes by total UMI, descending (stable by index for ties). + let mut order: Vec = (0..n_cells).filter(|&i| m.totals[i] > 0).collect(); + order.sort_by(|&i, &j| m.totals[j].cmp(&m.totals[i]).then(i.cmp(&j))); + let sorted_desc: Vec = order.iter().map(|&i| m.totals[i]).collect(); + + // (1) Guaranteed cells from the CR2.2 knee. 
+ let n_simple = knee_n_cells( + &sorted_desc, + a.n_expected, + a.max_percentile, + a.max_min_ratio, + ); + eprintln!("emptydrops: {n_simple} guaranteed cells from CR2.2 knee"); + + // (2) Ambient profile from rank [ind_min, ind_max). + let mut amb = vec![0f64; m.n_genes]; + let mut amb_total = 0f64; + for &cell in order + .iter() + .skip(a.ind_min) + .take(a.ind_max.saturating_sub(a.ind_min)) + { + for &(g, c) in &m.cell_profiles[cell] { + amb[g as usize] += c as f64; + amb_total += c as f64; + } + } + if amb_total == 0.0 { + eprintln!("emptydrops: empty ambient range — falling back to knee-only"); + write_out(&a, &m, &order[..n_simple], n_simple, 0); + return; + } + // Good-Turing P0 (unseen mass) distributed over zero-count genes; seen genes + // get proportional mass scaled by (1 - P0). Approximates STAR's SGT. + let n1 = amb.iter().filter(|&&x| (x - 1.0).abs() < 0.5).count() as f64; + let p0 = (n1 / amb_total).clamp(1e-12, 0.5); + let n_zero = amb.iter().filter(|&&x| x == 0.0).count().max(1) as f64; + let amb_prob: Vec = amb + .iter() + .map(|&x| { + if x > 0.0 { + (1.0 - p0) * x / amb_total + } else { + p0 / n_zero + } + }) + .collect(); + let amb_logp: Vec = amb_prob.iter().map(|&p| p.max(1e-300).ln()).collect(); + + // (3) Candidate barcodes: rank >= n_simple, total >= minUMI, up to cand_max_n. + let median_top = if n_simple >= 2 { + sorted_desc[n_simple / 2] + } else if !sorted_desc.is_empty() { + sorted_desc[0] + } else { + 0 + }; + let min_umi = a + .umi_min + .max((a.umi_min_frac_median * median_top as f64) as u64); + let mut cands: Vec = Vec::new(); + for &cell in order.iter().skip(n_simple).take(a.cand_max_n) { + if m.totals[cell] < min_umi { + break; + } + cands.push(cell); + } + eprintln!( + "emptydrops: {} candidates (minUMI={min_umi}); running {} Monte-Carlo sims", + cands.len(), + a.sim_n + ); + if cands.is_empty() { + write_out(&a, &m, &order[..n_simple], n_simple, 0); + return; + } + + // logFactorial up to the largest candidate total. 
+ let max_count = cands.iter().map(|&c| m.totals[c]).max().unwrap() as usize; + let mut log_fac = vec![0f64; max_count + 1]; + for i in 2..=max_count { + log_fac[i] = log_fac[i - 1] + (i as f64).ln(); + } + + // Observed multinomial log-prob per candidate. + let obs_logp: Vec = cands + .iter() + .map(|&cell| { + let total = m.totals[cell] as usize; + let mut s = log_fac[total]; + for &(g, c) in &m.cell_profiles[cell] { + s -= log_fac[c as usize]; + s += c as f64 * amb_logp[g as usize]; + } + s + }) + .collect(); + + // (4/5) Monte Carlo: simulate sim_n barcodes from the ambient multinomial, + // recording the running log-prob at every count up to max_count. Each + // candidate of total t is compared against sim[*][t]. + let nonzero: Vec = (0..m.n_genes).filter(|&g| amb_prob[g] > 0.0).collect(); + let weights: Vec = nonzero.iter().map(|&g| amb_prob[g]).collect(); + let dist = WeightedIndex::new(&weights).unwrap(); + let mut rng = StdRng::seed_from_u64(a.seed); + + // For each count t, collect the sim log-probs (so we can compare per candidate). + // Memory: sim_n * (max_count+1) f64 — fine for ~10k * a few-thousand. + let mut sim_at: Vec> = vec![Vec::with_capacity(a.sim_n); max_count + 1]; + let mut curr = vec![0u32; m.n_genes]; + for _ in 0..a.sim_n { + for v in curr.iter_mut() { + *v = 0; + } + let mut lp = 0f64; + sim_at[0].push(0.0); + #[allow(clippy::needless_range_loop)] // ic is both index and multinomial term + for ic in 1..=max_count { + let gi = nonzero[dist.sample(&mut rng)]; + curr[gi] += 1; + lp += amb_logp[gi] + (ic as f64).ln() - (curr[gi] as f64).ln(); + sim_at[ic].push(lp); + } + } + + // p-value: fraction of sims with LOWER log-prob than observed (more extreme). 
+ let mut pvals: Vec<(usize, f64)> = cands + .iter() + .enumerate() + .map(|(i, &cell)| { + let t = m.totals[cell] as usize; + let obs = obs_logp[i]; + let n_lower = sim_at[t].iter().filter(|&&sp| sp < obs).count(); + let p = (1 + n_lower) as f64 / (1 + a.sim_n) as f64; + (i, p) + }) + .collect(); + + // (6) Benjamini-Hochberg. + pvals.sort_by(|x, y| x.1.partial_cmp(&y.1).unwrap()); + let n = pvals.len() as f64; + let mut padj = vec![0f64; pvals.len()]; + for (rank, &(_, p)) in pvals.iter().enumerate() { + padj[rank] = (p * n / (rank + 1) as f64).min(1.0); + } + for i in (0..padj.len() - 1).rev() { + padj[i] = padj[i].min(padj[i + 1]); + } + + // Called cells = guaranteed + candidates with padj <= FDR. + let mut called: Vec = order[..n_simple].to_vec(); + let mut extra = 0usize; + for (rank, &(ci, _)) in pvals.iter().enumerate() { + if padj[rank] <= a.fdr { + called.push(cands[ci]); + extra += 1; + } + } + eprintln!("emptydrops: {extra} extra cells (FDR<={})", a.fdr); + write_out(&a, &m, &called, n_simple, extra); +} + +fn write_out(a: &Args, m: &Matrix, called: &[usize], n_simple: usize, extra: usize) { + std::fs::create_dir_all(&a.out).unwrap(); + // Stable order: by descending total then barcode. 
+ let mut cells: Vec = called.to_vec(); + cells.sort_by(|&i, &j| { + m.totals[j] + .cmp(&m.totals[i]) + .then(m.barcodes[i].cmp(&m.barcodes[j])) + }); + cells.dedup(); + + let mut bc = BufWriter::new(File::create(a.out.join("barcodes.tsv")).unwrap()); + let mut cl = BufWriter::new(File::create(a.out.join("cells.txt")).unwrap()); + for &c in &cells { + writeln!(bc, "{}", m.barcodes[c]).unwrap(); + writeln!(cl, "{}", m.barcodes[c]).unwrap(); + } + let summary = format!( + "{{\"n_cells\": {}, \"n_guaranteed\": {}, \"n_emptydrops_extra\": {}, \"fdr\": {}, \"sim_n\": {}}}\n", + cells.len(), + n_simple, + extra, + a.fdr, + a.sim_n + ); + std::fs::write(a.out.join("emptydrops.json"), &summary).unwrap(); + println!( + "EmptyDrops_CR: {} cells ({} guaranteed + {} EmptyDrops) -> {}", + cells.len(), + n_simple, + extra, + a.out.display() + ); +} diff --git a/src/chimeric/detect.rs b/src/chimeric/detect.rs index 95ae396..de2d445 100644 --- a/src/chimeric/detect.rs +++ b/src/chimeric/detect.rs @@ -993,7 +993,7 @@ mod tests { let n_genome = chr_pad * 2; let sequence = vec![0u8; 2 * n_genome as usize]; Genome { - sequence, + sequence: sequence.into(), n_genome, n_genome_real: n_genome, n_chr_real: 2, diff --git a/src/chimeric/output.rs b/src/chimeric/output.rs index 9da093a..d198dc7 100644 --- a/src/chimeric/output.rs +++ b/src/chimeric/output.rs @@ -434,7 +434,7 @@ mod tests { fn make_genome_2chr() -> crate::genome::Genome { use crate::genome::Genome; Genome { - sequence: vec![0u8; 2048], + sequence: vec![0u8; 2048].into(), n_genome: 1024, n_genome_real: 1024, n_chr_real: 2, diff --git a/src/chimeric/score.rs b/src/chimeric/score.rs index d703150..c84aa83 100644 --- a/src/chimeric/score.rs +++ b/src/chimeric/score.rs @@ -71,8 +71,8 @@ fn extract_motif( } let genome_idx = (chr_start + extract_pos) as usize; - let b1 = genome.sequence.get(genome_idx).copied().unwrap_or(4); - let b2 = genome.sequence.get(genome_idx + 1).copied().unwrap_or(4); + let b1 = 
genome.sequence.get(genome_idx).unwrap_or(4); + let b2 = genome.sequence.get(genome_idx + 1).unwrap_or(4); // Convert to bases let mut motif = vec![base_to_char(b1), base_to_char(b2)]; @@ -127,8 +127,8 @@ pub fn calculate_repeat_length( break; } - let d_base = genome.sequence.get(d_pos as usize).copied().unwrap_or(4); - let a_base = genome.sequence.get(a_pos as usize).copied().unwrap_or(4); + let d_base = genome.sequence.get(d_pos as usize).unwrap_or(4); + let a_base = genome.sequence.get(a_pos as usize).unwrap_or(4); if d_base == a_base && d_base < 4 { // Only count ACGT, not N @@ -171,7 +171,7 @@ mod tests { fn mock_genome_with_sequence(seq: Vec) -> Genome { Genome { - sequence: seq, + sequence: seq.into(), n_genome: 100, n_genome_real: 100, n_chr_real: 1, diff --git a/src/genome/mod.rs b/src/genome/mod.rs index ca08848..8cb6938 100644 --- a/src/genome/mod.rs +++ b/src/genome/mod.rs @@ -10,6 +10,102 @@ use fasta::parse_fasta_files; /// STAR's genome spacing character (used for inter-chromosome padding). const GENOME_SPACING_CHAR: u8 = 5; +/// Backing storage for a genome's `[forward | reverse-complement]` sequence. +/// +/// `Owned` is the full `2*n_genome` byte buffer built at genomeGenerate time +/// (it is the only variant that supports slicing/mutation). `Mapped` is a +/// read-only memory map of the on-disk `Genome` file, which holds **only the +/// forward strand** (`n_genome` bytes): the reverse-complement half is computed +/// on access in [`GenomeSeq::base`], so loading never materializes the ~`n`-byte +/// RC buffer and the forward bytes are reclaimable file-backed pages rather than +/// an anonymous `Vec`. `Arc` keeps `Genome::clone` (two-pass) cheap. +#[derive(Clone)] +pub enum GenomeSeq { + Owned(Vec), + Mapped { + fwd: std::sync::Arc, + n_genome: usize, + }, +} + +impl GenomeSeq { + /// Base at absolute position `i` — forward `[0, n_genome)` or + /// reverse-complement `[n_genome, 2*n_genome)`. 
For the `Mapped` RC half, + /// `base(i) = complement(forward[2*n_genome - 1 - i])`, exactly the bytes + /// the owned builder writes into the second half. + #[inline] + pub fn base(&self, i: usize) -> u8 { + match self { + GenomeSeq::Owned(v) => v[i], + GenomeSeq::Mapped { fwd, n_genome } => { + let n = *n_genome; + if i < n { + fwd[i] + } else { + let f = fwd[2 * n - 1 - i]; + if f < 4 { 3 - f } else { f } + } + } + } + } + + /// Total sequence length (`2*n_genome` — forward + reverse complement). + #[inline] + pub fn len(&self) -> usize { + match self { + GenomeSeq::Owned(v) => v.len(), + GenomeSeq::Mapped { n_genome, .. } => 2 * n_genome, + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Bounds-checked [`base`](Self::base): the base at `i`, or `None` if out of + /// range. + #[inline] + pub fn get(&self, i: usize) -> Option { + if i < self.len() { + Some(self.base(i)) + } else { + None + } + } + + /// The contiguous byte buffer. For `Owned` this is the full + /// `[forward | RC]`; for `Mapped` it is the forward strand only — callers + /// that may touch the RC half must use [`base`](Self::base). Used at build + /// time (always `Owned`) for SA construction and the on-disk write. + pub fn as_slice(&self) -> &[u8] { + match self { + GenomeSeq::Owned(v) => v, + GenomeSeq::Mapped { fwd, .. } => fwd, + } + } +} + +impl From> for GenomeSeq { + fn from(v: Vec) -> Self { + GenomeSeq::Owned(v) + } +} + +// `memmap2::Mmap` is neither `Debug` nor `PartialEq`, so derive them by hand via +// the byte view. `as_slice()` is the full buffer for `Owned` (the only variant +// tests construct), so equality/printing behave like the old `Vec` field. 
+impl std::fmt::Debug for GenomeSeq { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "GenomeSeq({} bytes)", self.len()) + } +} + +impl PartialEq for GenomeSeq { + fn eq(&self, other: &Self) -> bool { + self.as_slice() == other.as_slice() + } +} + /// Packed genome with chromosome metadata. /// /// The genome sequence is stored as one byte per base: @@ -19,8 +115,9 @@ const GENOME_SPACING_CHAR: u8 = 5; #[derive(Clone)] pub struct Genome { /// Forward genome (0..n_genome) + reverse complement (n_genome..2*n_genome). - /// Initialized to GENOME_SPACING_CHAR (5), then overwritten with actual bases. - pub sequence: Vec, + /// Owned at build time; a memory map of the forward strand (RC computed on + /// access) when loaded from disk. Access bases via [`GenomeSeq::base`]. + pub sequence: GenomeSeq, /// Total length of the forward (padded) genome. pub n_genome: u64, @@ -115,7 +212,7 @@ impl Genome { } Ok(Genome { - sequence, + sequence: sequence.into(), n_genome, n_genome_real: n_genome, n_chr_real, @@ -141,7 +238,7 @@ impl Genome { let new_n = old_n + gsj.len() as u64; let mut new_seq = vec![GENOME_SPACING_CHAR; (new_n * 2) as usize]; - new_seq[..old_n as usize].copy_from_slice(&self.sequence[..old_n as usize]); + new_seq[..old_n as usize].copy_from_slice(&self.sequence.as_slice()[..old_n as usize]); new_seq[old_n as usize..new_n as usize].copy_from_slice(gsj); // Rebuild RC over the extended forward range (STAR stores Gsj_RC @@ -152,7 +249,7 @@ impl Genome { new_seq[2 * new_n as usize - 1 - i] = complement; } - self.sequence = new_seq; + self.sequence = new_seq.into(); self.n_genome = new_n; } @@ -165,7 +262,7 @@ impl Genome { /// The base value (0-3 for ACGT, 4 for N, 5 for padding), or None if out of bounds. 
pub fn get_base(&self, pos: u64) -> Option { if pos < self.sequence.len() as u64 { - Some(self.sequence[pos as usize]) + Some(self.sequence.base(pos as usize)) } else { None } @@ -210,8 +307,11 @@ impl Genome { // Write Genome file (forward strand only, n_genome bytes) let genome_path = dir.join("Genome"); - fs::write(&genome_path, &self.sequence[..self.n_genome as usize]) - .map_err(|e| Error::io(e, &genome_path))?; + fs::write( + &genome_path, + &self.sequence.as_slice()[..self.n_genome as usize], + ) + .map_err(|e| Error::io(e, &genome_path))?; // Write chrName.txt let chr_name_path = dir.join("chrName.txt"); @@ -443,19 +543,19 @@ mod tests { let n = genome.n_genome as usize; // Forward: A C G T N (then padding) - assert_eq!(genome.sequence[0], 0); // A - assert_eq!(genome.sequence[1], 1); // C - assert_eq!(genome.sequence[2], 2); // G - assert_eq!(genome.sequence[3], 3); // T - assert_eq!(genome.sequence[4], 4); // N + assert_eq!(genome.sequence.base(0), 0); // A + assert_eq!(genome.sequence.base(1), 1); // C + assert_eq!(genome.sequence.base(2), 2); // G + assert_eq!(genome.sequence.base(3), 3); // T + assert_eq!(genome.sequence.base(4), 4); // N // Reverse complement should be at positions [2n-1, 2n-2, 2n-3, 2n-4, 2n-5] // which maps to the reverse of [0,1,2,3,4] - assert_eq!(genome.sequence[2 * n - 1], 3); // T (complement of A at pos 0) - assert_eq!(genome.sequence[2 * n - 1 - 1], 2); // G (complement of C at pos 1) - assert_eq!(genome.sequence[2 * n - 1 - 2], 1); // C (complement of G at pos 2) - assert_eq!(genome.sequence[2 * n - 1 - 3], 0); // A (complement of T at pos 3) - assert_eq!(genome.sequence[2 * n - 1 - 4], 4); // N (complement of N at pos 4) + assert_eq!(genome.sequence.base(2 * n - 1), 3); // T (complement of A at pos 0) + assert_eq!(genome.sequence.base(2 * n - 1 - 1), 2); // G (complement of C at pos 1) + assert_eq!(genome.sequence.base(2 * n - 1 - 2), 1); // C (complement of G at pos 2) + assert_eq!(genome.sequence.base(2 * n - 1 - 3), 0); 
// A (complement of T at pos 3) + assert_eq!(genome.sequence.base(2 * n - 1 - 4), 4); // N (complement of N at pos 4) } #[test] @@ -509,13 +609,13 @@ mod tests { assert_eq!(genome.n_chr_real, 1); // Forward is [real 0..8 | gsj 8..13]. - assert_eq!(&genome.sequence[..4], &[0, 1, 2, 3]); - assert_eq!(&genome.sequence[8..13], gsj.as_slice()); + assert_eq!(&genome.sequence.as_slice()[..4], &[0, 1, 2, 3]); + assert_eq!(&genome.sequence.as_slice()[8..13], gsj.as_slice()); // RC over the extended forward range. sequence[2n-1-i] = complement(sequence[i]). let new_n = genome.n_genome as usize; - assert_eq!(genome.sequence[2 * new_n - 1 - 8], 3); // complement of A at fwd[8]=0 - assert_eq!(genome.sequence[2 * new_n - 1 - 12], 5); // spacer stays 5 + assert_eq!(genome.sequence.base(2 * new_n - 1 - 8), 3); // complement of A at fwd[8]=0 + assert_eq!(genome.sequence.base(2 * new_n - 1 - 12), 5); // spacer stays 5 assert_eq!(genome.sequence.len(), 2 * new_n); } diff --git a/src/index/io.rs b/src/index/io.rs index 18779f4..f9430da 100644 --- a/src/index/io.rs +++ b/src/index/io.rs @@ -1,5 +1,4 @@ use std::fs::File; -use std::io::Read; use std::path::Path; use byteorder::{LittleEndian, ReadBytesExt}; @@ -187,28 +186,29 @@ fn load_genome(genome_dir: &Path, _params: &Parameters) -> Result let n_genome_real = chr_start[n_chr_real]; let n_genome = read_genome_file_size(genome_dir)?.unwrap_or(n_genome_real); - // Load Genome sequence file + // Memory-map the Genome sequence file (forward strand only, `n_genome` + // bytes). The reverse-complement half is computed on access by + // `GenomeSeq::base`, so the ~`n_genome`-byte RC buffer is never + // materialized and the forward bytes are reclaimable file-backed pages + // rather than an anonymous `Vec`. The genome is accessed by single-byte + // lookups during alignment, which `base` serves from the map. 
let genome_path = genome_dir.join("Genome"); - let genome_data = std::fs::read(&genome_path).map_err(|e| Error::io(e, &genome_path))?; + let file = File::open(&genome_path).map_err(|e| Error::io(e, &genome_path))?; + // SAFETY: Genome is opened read-only and never mutated while loaded. + let mmap = unsafe { memmap2::Mmap::map(&file).map_err(|e| Error::io(e, &genome_path))? }; - if genome_data.len() != n_genome as usize { + if mmap.len() != n_genome as usize { return Err(Error::Index(format!( "Genome file size mismatch: expected {} bytes, got {}", n_genome, - genome_data.len() + mmap.len() ))); } - // Build full sequence buffer (forward + reverse complement) - let mut sequence = vec![5u8; (n_genome * 2) as usize]; - sequence[..n_genome as usize].copy_from_slice(&genome_data); - - // Build reverse complement - for i in 0..n_genome as usize { - let base = sequence[i]; - let complement = if base < 4 { 3 - base } else { base }; - sequence[2 * n_genome as usize - 1 - i] = complement; - } + let sequence = crate::genome::GenomeSeq::Mapped { + fwd: std::sync::Arc::new(mmap), + n_genome: n_genome as usize, + }; Ok(Genome { sequence, @@ -222,9 +222,29 @@ fn load_genome(genome_dir: &Path, _params: &Parameters) -> Result } /// Load suffix array from disk. +/// +/// The `SA` file is **memory-mapped** rather than read into a `Vec`: it is the +/// largest index component (≈21 GB for mouse) and is accessed by random binary +/// search during alignment. mmap keeps it as reclaimable file-backed memory +/// (demand-loaded, dropped — not swapped — under pressure) instead of an +/// un-reclaimable anonymous allocation. `MADV_RANDOM` disables readahead, which +/// would waste I/O on the random access pattern. +/// Best-effort `MADV_RANDOM` on a read-only mmap. `madvise` (and `memmap2::Advice`) +/// is Unix-only, so this is a no-op on platforms without it (e.g. Windows). 
+#[cfg(unix)] +fn advise_random(mmap: &memmap2::Mmap) { + let _ = mmap.advise(memmap2::Advice::Random); // best-effort; ignore if unsupported +} +#[cfg(not(unix))] +fn advise_random(_mmap: &memmap2::Mmap) {} + fn load_suffix_array(genome_dir: &Path, genome: &Genome) -> Result { let sa_path = genome_dir.join("SA"); - let sa_data = std::fs::read(&sa_path).map_err(|e| Error::io(e, &sa_path))?; + let file = File::open(&sa_path).map_err(|e| Error::io(e, &sa_path))?; + // SAFETY: the SA file is opened read-only and not mutated elsewhere while + // the index is loaded; the mapping is only ever read. + let mmap = unsafe { memmap2::Mmap::map(&file).map_err(|e| Error::io(e, &sa_path))? }; + advise_random(&mmap); let gstrand_bit = SuffixArray::calculate_gstrand_bit(genome.n_genome); let word_length = gstrand_bit + 1; @@ -236,7 +256,7 @@ fn load_suffix_array(genome_dir: &Path, genome: &Genome) -> Result Result Result Result { let sai_path = genome_dir.join("SAindex"); let mut file = File::open(&sai_path).map_err(|e| Error::io(e, &sai_path))?; @@ -273,15 +298,23 @@ fn load_sa_index(genome_dir: &Path, gstrand_bit: u32) -> Result genome_sa_index_start.push(val); } - // Read packed data - let mut packed_data = Vec::new(); - file.read_to_end(&mut packed_data) - .map_err(|e| Error::io(e, &sai_path))?; + // Map the packed-data region: header is `nbases` (8B) + (nbases+1)×8B. + let header_len = 8 + 8 * (u64::from(nbases) + 1); + // SAFETY: SAindex is opened read-only and never mutated while loaded. + // memmap2 handles non-page-aligned offsets internally; the map runs from + // `header_len` to EOF and is only ever read. + let mmap = unsafe { + memmap2::MmapOptions::new() + .offset(header_len) + .map(&file) + .map_err(|e| Error::io(e, &sai_path))? 
+ }; + advise_random(&mmap); let word_length = gstrand_bit + 3; let num_indices = SaIndex::calculate_num_indices(nbases); - let data = PackedArray::from_bytes(word_length, num_indices as usize, packed_data); + let data = PackedArray::from_mmap(word_length, num_indices as usize, mmap); Ok(SaIndex { nbases, diff --git a/src/index/packed_array.rs b/src/index/packed_array.rs index 02d334e..ac925b8 100644 --- a/src/index/packed_array.rs +++ b/src/index/packed_array.rs @@ -1,3 +1,39 @@ +/// Backing byte storage for a [`PackedArray`]. +/// +/// `Owned` is a heap `Vec` (used while *building* an index — it is the only +/// variant that supports [`PackedArray::write`]). `Mapped` is a read-only +/// memory map of an on-disk `SA` / `SAindex` file (used at *load* time): its +/// pages are file-backed, so they are demand-loaded and **reclaimable under +/// memory pressure** (dropped, never swapped) rather than the un-reclaimable +/// anonymous memory a `Vec` would occupy. `Arc` keeps `Clone` cheap +/// (two-pass mode clones the whole `GenomeIndex`). +#[derive(Clone)] +enum PackedBytes { + Owned(Vec), + Mapped(std::sync::Arc), +} + +impl PackedBytes { + #[inline] + fn as_slice(&self) -> &[u8] { + match self { + PackedBytes::Owned(v) => v, + PackedBytes::Mapped(m) => m, + } + } + + fn as_mut_slice(&mut self) -> &mut [u8] { + match self { + PackedBytes::Owned(v) => v, + PackedBytes::Mapped(_) => { + panic!( + "PackedArray: cannot mutate a memory-mapped array (build into an Owned array)" + ) + } + } + } +} + /// Variable-width bit-packed array matching STAR's PackedArray format. /// /// Stores integers with a specified bit width, packing them at bit-level @@ -17,8 +53,8 @@ pub struct PackedArray { /// Number of elements length: usize, - /// Raw byte storage - data: Vec, + /// Raw byte storage (owned heap buffer or a read-only memory map). 
+ data: PackedBytes, } impl PackedArray { @@ -44,7 +80,7 @@ impl PackedArray { ((length - 1) as u64 * word_length as u64) / 8 + 8 }; - let data = vec![0u8; length_byte as usize]; + let data = PackedBytes::Owned(vec![0u8; length_byte as usize]); Self { word_length, @@ -70,24 +106,26 @@ impl PackedArray { let masked_value = (value & self.bit_rec_mask) << bit_shift; let mask = self.bit_rec_mask << bit_shift; + let data = self.data.as_mut_slice(); + // Read current 8-byte word, update bits, write back let mut word = u64::from_le_bytes([ - self.data.get(byte_offset).copied().unwrap_or(0), - self.data.get(byte_offset + 1).copied().unwrap_or(0), - self.data.get(byte_offset + 2).copied().unwrap_or(0), - self.data.get(byte_offset + 3).copied().unwrap_or(0), - self.data.get(byte_offset + 4).copied().unwrap_or(0), - self.data.get(byte_offset + 5).copied().unwrap_or(0), - self.data.get(byte_offset + 6).copied().unwrap_or(0), - self.data.get(byte_offset + 7).copied().unwrap_or(0), + data.get(byte_offset).copied().unwrap_or(0), + data.get(byte_offset + 1).copied().unwrap_or(0), + data.get(byte_offset + 2).copied().unwrap_or(0), + data.get(byte_offset + 3).copied().unwrap_or(0), + data.get(byte_offset + 4).copied().unwrap_or(0), + data.get(byte_offset + 5).copied().unwrap_or(0), + data.get(byte_offset + 6).copied().unwrap_or(0), + data.get(byte_offset + 7).copied().unwrap_or(0), ]); word = (word & !mask) | masked_value; let bytes = word.to_le_bytes(); for (i, &byte) in bytes.iter().enumerate() { - if byte_offset + i < self.data.len() { - self.data[byte_offset + i] = byte; + if byte_offset + i < data.len() { + data[byte_offset + i] = byte; } } } @@ -106,22 +144,22 @@ impl PackedArray { let byte_offset = b / 8; let bit_shift = (b % 8) as u32; - let word = if byte_offset + 8 <= self.data.len() { + let data = self.data.as_slice(); + let word = if byte_offset + 8 <= data.len() { // Fast path: read 8 bytes directly (no per-byte bounds checks) - // SAFETY: We just verified byte_offset + 
8 <= data.len() - let bytes = &self.data[byte_offset..byte_offset + 8]; + let bytes = &data[byte_offset..byte_offset + 8]; u64::from_le_bytes(bytes.try_into().unwrap()) } else { // Slow path: near end of array, read byte-by-byte with bounds checks u64::from_le_bytes([ - self.data.get(byte_offset).copied().unwrap_or(0), - self.data.get(byte_offset + 1).copied().unwrap_or(0), - self.data.get(byte_offset + 2).copied().unwrap_or(0), - self.data.get(byte_offset + 3).copied().unwrap_or(0), - self.data.get(byte_offset + 4).copied().unwrap_or(0), - self.data.get(byte_offset + 5).copied().unwrap_or(0), - self.data.get(byte_offset + 6).copied().unwrap_or(0), - self.data.get(byte_offset + 7).copied().unwrap_or(0), + data.get(byte_offset).copied().unwrap_or(0), + data.get(byte_offset + 1).copied().unwrap_or(0), + data.get(byte_offset + 2).copied().unwrap_or(0), + data.get(byte_offset + 3).copied().unwrap_or(0), + data.get(byte_offset + 4).copied().unwrap_or(0), + data.get(byte_offset + 5).copied().unwrap_or(0), + data.get(byte_offset + 6).copied().unwrap_or(0), + data.get(byte_offset + 7).copied().unwrap_or(0), ]) }; @@ -162,7 +200,7 @@ impl PackedArray { /// Get a reference to the raw byte data. pub fn data(&self) -> &[u8] { - &self.data + self.data.as_slice() } /// Create a PackedArray from raw byte data. @@ -172,6 +210,23 @@ impl PackedArray { /// * `length` - Number of elements /// * `data` - Raw byte data pub fn from_bytes(word_length: u32, length: usize, data: Vec) -> Self { + Self::from_store(word_length, length, PackedBytes::Owned(data)) + } + + /// Create a read-only PackedArray backed by a memory map of an on-disk + /// `SA` / `SAindex` file. The mapped pages are demand-loaded and + /// reclaimable under memory pressure (unlike an owned `Vec`), so loading a + /// multi-GB suffix array does not pin that much anonymous RAM. `write` will + /// panic on the result — memory-mapped arrays are read-only. 
+ pub fn from_mmap(word_length: u32, length: usize, mmap: memmap2::Mmap) -> Self { + Self::from_store( + word_length, + length, + PackedBytes::Mapped(std::sync::Arc::new(mmap)), + ) + } + + fn from_store(word_length: u32, length: usize, data: PackedBytes) -> Self { assert!(word_length > 0 && word_length <= 64); let word_comp_length = 64 - word_length; diff --git a/src/index/sa_build.rs b/src/index/sa_build.rs index 28555a8..09f0434 100644 --- a/src/index/sa_build.rs +++ b/src/index/sa_build.rs @@ -212,7 +212,7 @@ pub(crate) fn build_impl(genome: &Genome, force_sentinel: bool) -> Result` for the chosen S. - let n_seg = count_spacer_runs(&genome.sequence[..n2]); + let n_seg = count_spacer_runs(&genome.sequence.as_slice()[..n2]); let alphabet_max = SENTINEL_BASE as u32 + n_seg; log::info!("sa_build: counted {n_seg} per-segment sentinels (alphabet max = {alphabet_max})"); @@ -317,7 +317,10 @@ where sparse_d, 1, "non-default sparse_d isn't wired through this path" ); - let n_sa_kept: usize = genome.sequence[..n2].par_iter().filter(|&&b| b < 4).count(); + let n_sa_kept: usize = genome.sequence.as_slice()[..n2] + .par_iter() + .filter(|&&b| b < 4) + .count(); log::info!("sa_build: {n_sa_kept} entries after ACGT + sparse-d={sparse_d} filter"); let n_genome_u64 = n_genome as u64; @@ -364,15 +367,27 @@ where "sa_build: RUSTAR_USE_SENTINEL_TRANSFORM=1, alphabet fits u8 — \ using sentinel-transform arm" ); - let t_prime: Vec = build_sentinel_transformed_text(&genome.sequence[..n2], n_seg); - dispatch_caps_sa(t_prime, &genome.sequence[..n2], temp_dir, &mut pack_one)?; + let t_prime: Vec = + build_sentinel_transformed_text(&genome.sequence.as_slice()[..n2], n_seg); + dispatch_caps_sa( + t_prime, + &genome.sequence.as_slice()[..n2], + temp_dir, + &mut pack_one, + )?; } else if force_sentinel && alphabet_max <= ::MAX_REPRESENTABLE { log::info!( "sa_build: RUSTAR_USE_SENTINEL_TRANSFORM=1, alphabet fits u16 — \ using sentinel-transform arm" ); - let t_prime: Vec = 
build_sentinel_transformed_text(&genome.sequence[..n2], n_seg); - dispatch_caps_sa(t_prime, &genome.sequence[..n2], temp_dir, &mut pack_one)?; + let t_prime: Vec = + build_sentinel_transformed_text(&genome.sequence.as_slice()[..n2], n_seg); + dispatch_caps_sa( + t_prime, + &genome.sequence.as_slice()[..n2], + temp_dir, + &mut pack_one, + )?; } else { if force_sentinel { log::warn!( @@ -387,7 +402,7 @@ where alphabet_max={alphabet_max}, {n_seg} segments)" ); } - dispatch_caps_sa_segmented(&genome.sequence[..n2], temp_dir, &mut pack_one)?; + dispatch_caps_sa_segmented(&genome.sequence.as_slice()[..n2], temp_dir, &mut pack_one)?; } debug_assert_eq!( diff --git a/src/index/sa_index.rs b/src/index/sa_index.rs index e4df398..9761b10 100644 --- a/src/index/sa_index.rs +++ b/src/index/sa_index.rs @@ -167,7 +167,7 @@ impl SaIndex { (1u64 << sa_word_length) - 1 }; let n_genome = genome.n_genome as usize; - let genome_seq: &[u8] = &genome.sequence; + let genome_seq: &[u8] = genome.sequence.as_slice(); // Chunk size: 1 M entries per worker. 
STAR's algorithm // visits at most ~chunk_size / isa_step boundaries per chunk @@ -495,7 +495,7 @@ impl SaIndex { if genome_pos + (k as usize) > genome.sequence.len() { break; } - let next_base = genome.sequence[genome_pos + (k - 1) as usize]; + let next_base = genome.sequence.base(genome_pos + (k - 1) as usize); if next_base >= 4 { break; } @@ -565,7 +565,7 @@ impl SaIndexBuilder<'_> { if genome_pos + (k as usize) > self.genome.sequence.len() { break; } - let next_base = self.genome.sequence[genome_pos + (k - 1) as usize]; + let next_base = self.genome.sequence.base(genome_pos + (k - 1) as usize); if next_base >= 4 { break; } diff --git a/src/index/suffix_array.rs b/src/index/suffix_array.rs index 570a7dd..197844d 100644 --- a/src/index/suffix_array.rs +++ b/src/index/suffix_array.rs @@ -80,7 +80,7 @@ fn compare_suffixes( use std::cmp::Ordering; let n_genome = genome.n_genome as usize; - let sequence = &genome.sequence; + let sequence = genome.sequence.as_slice(); // Adjust positions for reverse complement let start_a = if reverse_a { pos_a + n_genome } else { pos_a }; @@ -184,12 +184,12 @@ mod tests { let mut suffixes: Vec<(u64, bool)> = Vec::new(); for i in 0..n_genome { - if genome.sequence[i] < 4 { + if genome.sequence.base(i) < 4 { suffixes.push((i as u64, false)); } } for i in n_genome..(2 * n_genome) { - if genome.sequence[i] < 4 { + if genome.sequence.base(i) < 4 { suffixes.push(((i - n_genome) as u64, true)); } } @@ -272,7 +272,7 @@ mod tests { // The lexicographically first suffix should start with the smallest base let first_entry = sa.get(0); let (first_pos, _) = sa.decode(first_entry); - let first_base = genome.sequence[first_pos as usize]; + let first_base = genome.sequence.base(first_pos as usize); // In "AAB", the first suffix lexicographically is "A" (from pos 0 or 1) assert!(first_base == 0); // A diff --git a/src/io/bam.rs b/src/io/bam.rs index e9c9183..4daba2d 100644 --- a/src/io/bam.rs +++ b/src/io/bam.rs @@ -465,7 +465,7 @@ mod tests { fn 
create_test_genome() -> Genome { Genome { - sequence: vec![0, 1, 2, 3, 0, 1, 2, 3], // ACGTACGT + sequence: vec![0, 1, 2, 3, 0, 1, 2, 3].into(), // ACGTACGT n_genome: 8, n_genome_real: 8, n_chr_real: 1, diff --git a/src/io/sam.rs b/src/io/sam.rs index 29716ea..30276be 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -1404,7 +1404,7 @@ mod tests { fn make_test_genome() -> Genome { Genome { - sequence: vec![0, 1, 2, 3, 0, 1, 2, 3], // ACGTACGT + sequence: vec![0, 1, 2, 3, 0, 1, 2, 3].into(), // ACGTACGT n_genome: 8, n_genome_real: 8, n_chr_real: 1, diff --git a/src/junction/gtf.rs b/src/junction/gtf.rs index 3e3d3be..5a07ded 100644 --- a/src/junction/gtf.rs +++ b/src/junction/gtf.rs @@ -310,7 +310,7 @@ mod tests { fn test_extract_junctions_single_transcript() { // Create a simple genome let genome = Genome { - sequence: vec![0; 1000], + sequence: vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, @@ -362,7 +362,7 @@ mod tests { #[test] fn test_extract_junctions_multiple_transcripts() { let genome = Genome { - sequence: vec![0; 1000], + sequence: vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, @@ -438,7 +438,7 @@ mod tests { #[test] fn test_extract_junctions_single_exon_transcript() { let genome = Genome { - sequence: vec![0; 1000], + sequence: vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, @@ -470,7 +470,7 @@ mod tests { #[test] fn test_extract_junctions_unknown_chromosome() { let genome = Genome { - sequence: vec![0; 1000], + sequence: vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, @@ -517,7 +517,7 @@ mod tests { #[test] fn test_junction_coordinate_calculation() { let genome = Genome { - sequence: vec![0; 1000], + sequence: vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, @@ -601,7 +601,7 @@ mod tests { #[test] fn test_extract_junctions_configured_custom_transcript_tag() { let genome = Genome { - sequence: vec![0; 1000], + sequence: 
vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, diff --git a/src/junction/mod.rs b/src/junction/mod.rs index a260a2d..a3715f2 100644 --- a/src/junction/mod.rs +++ b/src/junction/mod.rs @@ -407,7 +407,7 @@ mod tests { // Two-chromosome toy genome so chr_start[1] != 0. let genome = Genome { - sequence: vec![0; 4000], + sequence: vec![0; 4000].into(), n_genome: 2000, n_genome_real: 2000, n_chr_real: 2, diff --git a/src/junction/sj_output.rs b/src/junction/sj_output.rs index 2b66b3a..1e433ec 100644 --- a/src/junction/sj_output.rs +++ b/src/junction/sj_output.rs @@ -228,10 +228,47 @@ impl SpliceJunctionStats { ) -> Result<(), Error> { let file = File::create(output_path).map_err(|e| Error::io(e, output_path))?; let mut writer = BufWriter::new(file); + let written = self.write_sj_lines(&mut writer, genome, params)?; + writer.flush().map_err(|e| Error::io(e, output_path))?; + let filtered = self.junctions.len() as u32 - written; + log::info!( + "Wrote {} junctions to {} ({} filtered by outSJfilter*)", + written, + output_path.display(), + filtered, + ); + Ok(()) + } + /// Surviving junctions sorted by (chr, intron_start, intron_end) — the + /// canonical `SJ.out.tab` order, which is also the row order of the `SJ` + /// solo-feature matrix. Returns the (intron_start, intron_end) absolute-coord + /// keys so the SJ recorder can be mapped to matrix rows. + pub(crate) fn sj_feature_order(&self, params: &Parameters) -> Vec<(u64, u64)> { let surviving = self.compute_surviving_junctions(params); + let mut keys: Vec<(usize, u64, u64)> = self + .junctions + .iter() + .filter(|e| surviving.contains(e.key())) + .map(|e| { + let k = e.key(); + (k.chr_idx, k.intron_start, k.intron_end) + }) + .collect(); + keys.sort_unstable(); + keys.into_iter().map(|(_, s, e)| (s, e)).collect() + } - // Collect and sort surviving junctions for deterministic output + /// Write the 9-column `SJ.out.tab` lines (sorted) to `writer`; returns the + /// number written. 
Shared by `write_output` and the SJ feature's + /// `features.tsv`, so both stay in the same order as the SJ matrix rows. + pub(crate) fn write_sj_lines( + &self, + writer: &mut dyn std::io::Write, + genome: &Genome, + params: &Parameters, + ) -> Result { + let surviving = self.compute_surviving_junctions(params); let mut output_junctions: Vec<_> = self .junctions .iter() @@ -262,11 +299,9 @@ impl SpliceJunctionStats { .chr_name .get(key.chr_idx) .ok_or_else(|| Error::Index("Invalid chromosome index in junction".to_string()))?; - let chr_start_pos = genome.chr_start[key.chr_idx]; let chr_pos_start = key.intron_start - chr_start_pos + 1; let chr_pos_end = key.intron_end - chr_start_pos + 1; - writeln!( writer, "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", @@ -280,21 +315,10 @@ impl SpliceJunctionStats { multi, max_overhang ) - .map_err(|e| Error::io(e, output_path))?; + .map_err(|e| Error::Index(format!("SJ write: {e}")))?; written += 1; } - - writer.flush().map_err(|e| Error::io(e, output_path))?; - - let filtered = self.junctions.len() as u32 - written; - log::info!( - "Wrote {} junctions to {} ({} filtered by outSJfilter*)", - written, - output_path.display(), - filtered, - ); - - Ok(()) + Ok(written) } /// Get the number of unique junctions tracked @@ -523,7 +547,7 @@ mod tests { stats.record_junction(0, 300, 400, 2, SpliceMotif::GcAg, false, 15, true); let genome = Genome { - sequence: vec![0; 1000], + sequence: vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, @@ -584,7 +608,7 @@ mod tests { stats.record_junction(0, 300, 400, 1, SpliceMotif::GtAg, true, 20, false); let genome = Genome { - sequence: vec![0; 1000], + sequence: vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, @@ -619,7 +643,7 @@ mod tests { stats.record_junction(0, 100, 200, 1, SpliceMotif::NonCanonical, true, 2, true); let genome = Genome { - sequence: vec![0; 1000], + sequence: vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, @@ 
-697,7 +721,7 @@ mod tests { } let genome = Genome { - sequence: vec![0; 1000], + sequence: vec![0; 1000].into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, diff --git a/src/junction/sjdb_insert.rs b/src/junction/sjdb_insert.rs index 85fa625..7014418 100644 --- a/src/junction/sjdb_insert.rs +++ b/src/junction/sjdb_insert.rs @@ -222,7 +222,7 @@ const GSJ_SPACING: u8 = 5; /// /// Stops at genome bounds, on any N-base (code ≥ 4), or at the 255 cap. pub fn compute_shifts(genome: &Genome, s: u64, e: u64, n_genome_real: u64) -> (u8, u8) { - let forward = &genome.sequence[..n_genome_real as usize]; + let forward = &genome.sequence.as_slice()[..n_genome_real as usize]; let si = s as usize; let ei = e as usize; @@ -448,7 +448,7 @@ pub fn build_gsj( ) -> Result, Error> { let overhang = sjdb_overhang as usize; let sjdb_length = 2 * overhang + 1; - let forward = &genome.sequence[..n_genome_real as usize]; + let forward = &genome.sequence.as_slice()[..n_genome_real as usize]; let mut gsj = vec![GSJ_SPACING; junctions.len() * sjdb_length]; for (i, pj) in junctions.iter().enumerate() { @@ -569,7 +569,7 @@ mod tests { let mut seq = forward; seq.extend(std::iter::repeat_n(5u8, n)); Genome { - sequence: seq, + sequence: seq.into(), n_genome: n as u64, n_genome_real: n as u64, n_chr_real: 1, @@ -974,7 +974,7 @@ mod tests { let mut seq = vec![5u8; 4000]; seq[..2000].copy_from_slice(&vec![0u8; 2000]); let genome = Genome { - sequence: seq, + sequence: seq.into(), n_genome: 2000, n_genome_real: 2000, n_chr_real: 2, @@ -1113,7 +1113,7 @@ mod tests { let mut seq = vec![5u8; 4000]; seq[..2000].copy_from_slice(&vec![0u8; 2000]); let genome = Genome { - sequence: seq, + sequence: seq.into(), n_genome: 2000, n_genome_real: 2000, n_chr_real: 1, diff --git a/src/lib.rs b/src/lib.rs index c9a3aa9..a916f72 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,6 +33,7 @@ pub mod io; pub mod junction; pub mod mapq; pub mod quant; +pub mod solo; pub mod stats; use log::info; @@ -217,8 +218,8 @@ 
fn align_reads(params: &Parameters) -> anyhow::Result<()> { info!("Using single-threaded mode"); } - // Validate read files - if params.read_files_in.is_empty() { + // Validate read files (SmartSeq supplies reads via --readFilesManifest). + if params.read_files_in.is_empty() && params.solo_type != params::SoloType::SmartSeq { anyhow::bail!("No read files specified (--readFilesIn)"); } @@ -278,17 +279,60 @@ fn align_reads(params: &Parameters) -> anyhow::Result<()> { None }; + // SmartSeq has no barcodes/UMIs — a dedicated manifest-driven path. + if params.solo_type == params::SoloType::SmartSeq { + let stats = run_smartseq(&index, ¶ms)?; + let log_path = params.output_path("Log.final.out"); + if let Some(parent) = log_path.parent() { + std::fs::create_dir_all(parent)?; + } + stats.write_log_final( + &log_path, + time_start, + chrono::Local::now(), + chrono::Local::now(), + )?; + info!("Alignment complete!"); + return Ok(()); + } + + // Build the STARsolo context (whitelist + gene model) if a droplet solo run. + let solo_ctx: Option> = if params.solo_enabled() { + info!( + "STARsolo: soloType={} — building barcode + gene context", + params.solo_type + ); + Some(std::sync::Arc::new(crate::solo::SoloContext::build( + ¶ms, + &index.genome, + )?)) + } else { + None + }; + let time_map_start = chrono::Local::now(); // 2. Dispatch based on two-pass mode let stats = match params.twopass_mode { TwopassMode::None => { info!("Running single-pass alignment"); - run_single_pass(&index, ¶ms, quant_ctx.as_ref(), tr_idx.as_ref())? + run_single_pass( + &index, + ¶ms, + quant_ctx.as_ref(), + tr_idx.as_ref(), + solo_ctx.as_ref(), + )? } TwopassMode::Basic => { info!("Running two-pass alignment mode"); - run_two_pass(&index, ¶ms, quant_ctx.as_ref(), tr_idx.as_ref())? + run_two_pass( + &index, + ¶ms, + quant_ctx.as_ref(), + tr_idx.as_ref(), + solo_ctx.as_ref(), + )? 
} }; @@ -329,12 +373,217 @@ fn align_reads(params: &Parameters) -> anyhow::Result<()> { Ok(()) } +/// Log STARsolo barcode/record stats and write the per-cell matrices (raw + +/// filtered), `Summary.csv`, and the SJ feature matrix. Called from the solo +/// branch of `run_single_pass`, where `sj_stats` is live. +fn write_solo_output( + sctx: &std::sync::Arc, + params: &Parameters, + stats: &std::sync::Arc, + sj_stats: &std::sync::Arc, + index: &std::sync::Arc, +) -> anyhow::Result<()> { + use std::sync::atomic::Ordering; + let s = &sctx.stats; + info!( + "STARsolo barcode stats: exact={} 1MM={} multiMM={} noMatch={} N-in-CB={} multReject={} N-in-UMI={} UMIhomopolymer={}", + s.yes_exact.load(Ordering::Relaxed), + s.yes_one_mm.load(Ordering::Relaxed), + s.yes_mult_mm.load(Ordering::Relaxed), + s.no_match.load(Ordering::Relaxed), + s.n_in_cb.load(Ordering::Relaxed), + s.mult_rejected.load(Ordering::Relaxed), + s.n_in_umi.load(Ordering::Relaxed), + s.umi_homopolymer.load(Ordering::Relaxed), + ); + for (feature, recorder) in sctx.features.iter().zip(&sctx.recorders) { + info!( + "STARsolo {}: collected {} resolved (CB,UMI,gene) records ({} deferred 1MM_multi)", + feature.dir_name(), + recorder.n_records(), + recorder.n_multi_records(), + ); + } + crate::solo::write_gene_matrix(sctx, params, stats, Some(&**sj_stats), &index.genome)?; + Ok(()) +} + +/// `--soloType SmartSeq`: align each manifest cell's reads and count reads per +/// gene (no barcodes, no UMIs). Writes `Solo.out/Gene/raw/` (genes × cells) and +/// returns the alignment stats. 
+fn run_smartseq( + index: &std::sync::Arc, + params: &Parameters, +) -> anyhow::Result> { + use crate::align::read_align::{PairedAlignmentResult, align_paired_read, align_read}; + use crate::solo::{GeneAssignment, SoloStrand, classify_read}; + use rayon::prelude::*; + use std::sync::Arc; + + let manifest = params + .read_files_manifest + .as_ref() + .ok_or_else(|| anyhow::anyhow!("--soloType SmartSeq requires --readFilesManifest"))?; + let cells = crate::solo::smartseq::parse_manifest(manifest)?; + info!( + "STARsolo SmartSeq: {} cells from {}", + cells.len(), + manifest.display() + ); + + let gtf = params.sjdb_gtf_file.as_ref().ok_or_else(|| { + anyhow::anyhow!("--soloType SmartSeq Gene counting requires --sjdbGTFfile") + })?; + let exons = crate::junction::gtf::parse_gtf_configured( + gtf, + ¶ms.sjdb_gtf_feature_exon, + ¶ms.sjdb_gtf_chr_prefix, + )?; + let gene_ann = crate::quant::GeneAnnotation::from_gtf_exons_configured( + &exons, + &index.genome, + ¶ms.sjdb_gtf_tag_exon_parent_gene, + ); + info!( + "STARsolo SmartSeq: {} genes from {}", + gene_ann.n_genes(), + gtf.display() + ); + let strand: SoloStrand = params.solo_strand.parse().unwrap_or_default(); + let max_multimaps = params.out_filter_multimap_nmax as usize; + + let stats = Arc::new(crate::stats::AlignmentStats::new()); + let cell_ids: Vec = cells.iter().map(|c| c.cell_id.clone()).collect(); + let counts = crate::solo::smartseq::SmartSeqCounts::new(cell_ids, gene_ann.gene_ids.len()); + + // Assign a (possibly multi-locus) read/fragment to a gene and count it. + let assign_count = |ci: usize, transcripts: &[crate::align::transcript::Transcript]| { + if let GeneAssignment::Gene(g) = + classify_read(transcripts, &gene_ann, strand, true, false, false).gene + { + counts.add(ci, g); + } + }; + let cmd = params.read_files_command.as_deref(); + + for (ci, cell) in cells.iter().enumerate() { + match &cell.read2 { + // Single-end: count reads. 
+ None => { + let mut reader = crate::io::fastq::FastqReader::open(&cell.read1, cmd)?; + loop { + let batch = reader.read_batch(10_000)?; + if batch.is_empty() { + break; + } + batch.par_iter().for_each(|read| { + stats.record_read_bases(read.sequence.len() as u64); + let Ok((transcripts, _chim, n_for_mapq, reason)) = + align_read(&read.sequence, &read.name, index, params) + else { + return; + }; + let n = if transcripts.is_empty() && n_for_mapq > 0 { + n_for_mapq + } else { + transcripts.len() + }; + stats.record_alignment(n, max_multimaps); + if transcripts.is_empty() { + stats.record_unmapped_reason( + reason.unwrap_or(crate::stats::UnmappedReason::Other), + ); + } else if transcripts.len() == 1 { + stats.record_transcript_stats(&transcripts[0]); + } + assign_count(ci, &transcripts); + }); + } + } + // Paired-end: align both mates as a fragment, count the fragment once + // (gene from the union of both mates' overlaps). + Some(r2) => { + let mut reader = crate::io::fastq::PairedFastqReader::open(&cell.read1, r2, cmd)?; + loop { + let mut batch = Vec::with_capacity(10_000); + while batch.len() < 10_000 { + match reader.next_paired()? { + Some(p) => batch.push(p), + None => break, + } + } + if batch.is_empty() { + break; + } + batch.par_iter().for_each(|pr| { + stats.record_read_bases( + (pr.mate1.sequence.len() + pr.mate2.sequence.len()) as u64, + ); + let Ok((results, _chim, n_for_mapq, reason)) = align_paired_read( + &pr.mate1.sequence, + &pr.mate2.sequence, + &pr.name, + index, + params, + ) else { + return; + }; + let n_pairs = results.len(); + let mut trs = Vec::with_capacity(n_pairs * 2); + for r in results { + match r { + PairedAlignmentResult::BothMapped(pa) => { + trs.push(pa.mate1_transcript); + trs.push(pa.mate2_transcript); + } + PairedAlignmentResult::HalfMapped { + mapped_transcript, .. 
+ } => trs.push(mapped_transcript), + } + } + let n = if trs.is_empty() && n_for_mapq > 0 { + n_for_mapq + } else { + n_pairs + }; + stats.record_alignment(n, max_multimaps); + if trs.is_empty() { + stats.record_unmapped_reason( + reason.unwrap_or(crate::stats::UnmappedReason::Other), + ); + } + assign_count(ci, &trs); + }); + } + } + } + } + + let solo_dir = params + .solo_out_file_names + .first() + .cloned() + .unwrap_or_else(|| "Solo.out/".to_string()); + let raw_dir = params.output_path(&format!("{solo_dir}Gene/raw/")); + let gzip = matches!(params.solo_out_gzip.as_str(), "yes" | "Yes" | "true"); + let nnz = counts.write_matrix(&raw_dir, &gene_ann.gene_ids, gzip)?; + info!( + "STARsolo SmartSeq: wrote Gene/raw matrix ({} genes × {} cells, {} entries)", + gene_ann.n_genes(), + cells.len(), + nnz, + ); + stats.print_summary(); + Ok(stats) +} + /// Run single-pass alignment (original logic) fn run_single_pass( index: &std::sync::Arc, params: &Parameters, quant_ctx: Option<&std::sync::Arc>, tr_idx: Option<&std::sync::Arc>, + solo_ctx: Option<&std::sync::Arc>, ) -> anyhow::Result> { use crate::io::bam::{BamWriter, SortedBamWriter}; use crate::io::sam::SamWriter; @@ -365,7 +614,7 @@ fn run_single_pass( use crate::io::fastq::UnmappedFastqWriter; use crate::params::OutReadsUnmapped; - let is_paired = params.read_files_in.len() == 2; + let is_paired = params.read_files_in.len() == 2 && !params.solo_enabled(); let mut unmapped_w1: Option = if params.out_reads_unmapped == OutReadsUnmapped::Fastx { let path = params.output_path("Unmapped.out.mate1"); @@ -442,13 +691,37 @@ fn run_single_pass( } } OutSamFormat::None => { - anyhow::bail!("Output format 'None' not yet implemented"); + info!("--outSAMtype None: skipping alignment output (count/quant only)"); + Box::new(NullWriter) } }, }; // Align reads through the boxed writer. 
- match params.read_files_in.len() { + // + // Solo runs supply two `--readFilesIn` files (cDNA read + barcode read) but + // are single-end *alignment* runs: only the cDNA read (file 0) is aligned. + // The dedicated solo loop reads the barcode read in lockstep, quantifies + // per cell, and otherwise emits the cDNA alignments like the SE path. + if let Some(sctx) = solo_ctx { + align_reads_solo(params, index, writer.as_mut(), &stats, &sj_stats, sctx)?; + writer.finish()?; + if let Some(ref mut w) = tr_writer { + w.finish()?; + } + let sj_output_path = params.output_path("SJ.out.tab"); + if !sj_stats.is_empty() { + sj_stats.write_output(&sj_output_path, &index.genome, params)?; + } + // Per-cell count matrices (raw + filtered), Summary.csv, and the SJ + // feature matrix — written here where sj_stats is available. + write_solo_output(sctx, params, &stats, &sj_stats, index)?; + stats.print_summary(); + return Ok(stats); + } + + let n_align_files = params.read_files_in.len(); + match n_align_files { 1 => align_reads_single_end( params, index, @@ -504,6 +777,7 @@ fn run_two_pass( params: &Parameters, quant_ctx: Option<&std::sync::Arc>, tr_idx: Option<&std::sync::Arc>, + solo_ctx: Option<&std::sync::Arc>, ) -> anyhow::Result> { use std::sync::Arc; @@ -534,7 +808,7 @@ fn run_two_pass( // PASS 2: Re-alignment with merged DB (quant counts happen here) info!("Two-pass mode: Pass 2 - Re-alignment"); - let stats = run_single_pass(&Arc::new(merged_index), params, quant_ctx, tr_idx)?; + let stats = run_single_pass(&Arc::new(merged_index), params, quant_ctx, tr_idx, solo_ctx)?; Ok(stats) } @@ -567,8 +841,14 @@ fn run_pass1( // Create NullWriter (discard SAM/BAM output in pass 1) let mut null_writer = NullWriter; - // Align reads (single-end or paired-end); no quant counting in pass 1 - match params.read_files_in.len() { + // Align reads (single-end or paired-end); no quant counting in pass 1. + // Solo runs align only the cDNA read (file 0) — route to the SE path. 
+ let n_align_files = if params.solo_enabled() { + 1 + } else { + params.read_files_in.len() + }; + match n_align_files { 1 => align_reads_single_end( ¶ms_pass1, index, @@ -937,7 +1217,9 @@ fn align_reads_single_end( let clip5p = params.clip5p_nbases as usize; let clip3p = params.clip3p_nbases as usize; let max_multimaps = params.out_filter_multimap_nmax as usize; - let output_unmapped = params.out_sam_unmapped != params::OutSamUnmapped::None; + // `--outSAMtype None` (e.g. quant-only) skips building SAM records. + let emit_sam = params.emits_alignments(); + let output_unmapped = emit_sam && params.out_sam_unmapped != params::OutSamUnmapped::None; let write_unmapped_fastq = params.out_reads_unmapped == params::OutReadsUnmapped::Fastx; let by_sjout = params.out_filter_type == OutFilterType::BySJout; @@ -1083,36 +1365,39 @@ fn align_reads_single_end( Vec::new() }; - // Build SAM records (no I/O, just construction) + // Build SAM records (no I/O, just construction). + // Skipped entirely under `--outSAMtype None`. 
let is_unmapped_se = transcripts.is_empty(); - if is_unmapped_se { - // Unmapped - if output_unmapped { - let record = SamWriter::build_unmapped_record( + if emit_sam { + if is_unmapped_se { + // Unmapped + if output_unmapped { + let record = SamWriter::build_unmapped_record( + &read.name, + &clipped_seq, + &clipped_qual, + params, + unmapped_reason.unwrap_or(crate::stats::UnmappedReason::Other), + )?; + buffer.push(record); + } + } else if transcripts.len() <= max_multimaps { + // Mapped (within multimap limit) + let records = SamWriter::build_alignment_records( &read.name, &clipped_seq, &clipped_qual, + &transcripts, + &index.genome, params, - unmapped_reason.unwrap_or(crate::stats::UnmappedReason::Other), + n_for_mapq, )?; - buffer.push(record); - } - } else if transcripts.len() <= max_multimaps { - // Mapped (within multimap limit) - let records = SamWriter::build_alignment_records( - &read.name, - &clipped_seq, - &clipped_qual, - &transcripts, - &index.genome, - params, - n_for_mapq, - )?; - for record in records { - buffer.push(record); + for record in records { + buffer.push(record); + } } + // else: too many loci, skip output } - // else: too many loci, skip output // Transcriptome SAM projection for --quantMode TranscriptomeSAM. let transcriptome_records: Vec = @@ -1307,6 +1592,242 @@ fn align_reads_single_end( Ok(()) } +/// Align a STARsolo single-cell run: the cDNA read (file 0) is aligned exactly +/// like the SE path, while the barcode read (file 1) is read in lockstep and +/// quantified per cell. Mapped cDNA alignments are written to the SAM/BAM output +/// just like a normal SE run; the per-cell (CB, UMI, gene) records are collected +/// into `solo_ctx.recorder` for the matrix output that follows in Phase 14.4. +/// +/// Solo runs are single-pass and (for now) do not support BySJout / chimeric / +/// transcriptome-SAM side outputs — those are not part of the STARsolo MVP. 
+fn align_reads_solo( + params: &Parameters, + index: &std::sync::Arc, + writer: &mut W, + stats: &std::sync::Arc, + sj_stats: &std::sync::Arc, + solo_ctx: &std::sync::Arc, +) -> anyhow::Result<()> { + use crate::align::read_align::align_read; + use crate::io::fastq::clip_read; + use crate::io::sam::{BufferedSamRecords, SamWriter}; + use crate::solo::{SoloCountRecord, SoloMultiRecord}; + use rayon::prelude::*; + use std::sync::Arc; + + let cdna_file = ¶ms.read_files_in[0]; + let barcode_file = ¶ms.read_files_in[1]; + info!( + "STARsolo: cDNA reads from {}, barcode reads from {}", + cdna_file.display(), + barcode_file.display() + ); + let mut reader = crate::solo::open_reader(params)?; + + let stats = Arc::clone(stats); + let sj_stats = Arc::clone(sj_stats); + let solo = Arc::clone(solo_ctx); + + let mut read_count = 0u64; + let max_reads = if params.read_map_number < 0 { + u64::MAX + } else { + params.read_map_number as u64 + }; + let batch_size = 10000; + let clip5p = params.clip5p_nbases as usize; + let clip3p = params.clip3p_nbases as usize; + let cr4_clip = params.clip_adapter_type == "CellRanger4"; + let max_multimaps = params.out_filter_multimap_nmax as usize; + // With `--outSAMtype None` (count-only) we skip building SAM records entirely + // — a large saving for solo runs that only need the count matrix. + let emit_sam = params.emits_alignments(); + let output_unmapped = emit_sam && params.out_sam_unmapped != params::OutSamUnmapped::None; + + /// Per-read result for the solo loop (one outcome per quantified feature). 
+ struct SoloReadProduct { + sam_records: BufferedSamRecords, + per_feature: Vec, + sj: Vec, + velocyto: Option, + } + + info!("STARsolo: aligning cDNA reads and quantifying barcodes..."); + loop { + let batch = reader.read_batch(batch_size)?; + if batch.is_empty() { + break; + } + let reads_to_process = if read_count + batch.len() as u64 > max_reads { + (max_reads - read_count) as usize + } else { + batch.len() + }; + let batch_to_process = &batch[..reads_to_process]; + + let batch_results: Vec> = batch_to_process + .par_iter() + .map(|sread| { + let index = Arc::clone(index); + let stats = Arc::clone(&stats); + let sj_stats = Arc::clone(&sj_stats); + let solo = Arc::clone(&solo); + + let read = &sread.cdna; + // CellRanger4 adapter clipping (TSO 5' + polyA 3') runs before + // the fixed clip5p/clip3p Nbases trimming. + let (cr_seq, cr_qual) = if cr4_clip { + crate::solo::clip_adapter_cr4(&read.sequence, &read.quality) + } else { + (read.sequence.clone(), read.quality.clone()) + }; + let (clipped_seq, clipped_qual) = clip_read(&cr_seq, &cr_qual, clip5p, clip3p); + let mut buffer = BufferedSamRecords::new(); + stats.record_read_bases(clipped_seq.len() as u64); + + if clipped_seq.is_empty() { + stats.record_alignment(0, max_multimaps); + stats.record_unmapped_reason(crate::stats::UnmappedReason::Other); + // No alignment → barcode still counts toward stats (unmapped → no gene). 
+ let outcome = solo.process_read(&[], sread.barcode.as_ref(), &[]); + return Ok(SoloReadProduct { + sam_records: buffer, + per_feature: outcome.per_feature, + sj: outcome.sj, + velocyto: outcome.velocyto, + }); + } + + let (transcripts, _chimeric, n_for_mapq, unmapped_reason) = + align_read(&clipped_seq, &read.name, &index, params)?; + + let n_for_stats = if transcripts.is_empty() && n_for_mapq > 0 { + n_for_mapq + } else { + transcripts.len() + }; + stats.record_alignment(n_for_stats, max_multimaps); + if transcripts.is_empty() && unmapped_reason.is_some() { + stats.record_unmapped_reason( + unmapped_reason.unwrap_or(crate::stats::UnmappedReason::Other), + ); + } else if transcripts.len() == 1 { + stats.record_transcript_stats(&transcripts[0]); + } + + let is_unique = transcripts.len() == 1; + for transcript in &transcripts { + record_transcript_junctions(transcript, &index, &sj_stats, is_unique); + } + + // SJ feature: the junctions crossed by a uniquely-mapped read + // (absolute intron coords), mapped to SJ.out.tab rows at output. + let junctions: Vec<(u64, u64)> = + if solo.sj_enabled && is_unique && transcripts[0].n_junction > 0 { + extract_junction_keys(&transcripts[0], &index) + .into_iter() + .map(|k| (k.intron_start, k.intron_end)) + .collect() + } else { + Vec::new() + }; + + // Solo quantification (CB match + UMI check + gene assignment). + let outcome = solo.process_read(&transcripts, sread.barcode.as_ref(), &junctions); + + // Build SAM records for the cDNA alignment (same as SE path). + // Skipped entirely under `--outSAMtype None` (count-only). 
+ if emit_sam { + if transcripts.is_empty() { + if output_unmapped { + let record = SamWriter::build_unmapped_record( + &read.name, + &clipped_seq, + &clipped_qual, + params, + unmapped_reason.unwrap_or(crate::stats::UnmappedReason::Other), + )?; + buffer.push(record); + } + } else if transcripts.len() <= max_multimaps { + let records = SamWriter::build_alignment_records( + &read.name, + &clipped_seq, + &clipped_qual, + &transcripts, + &index.genome, + params, + n_for_mapq, + )?; + for record in records { + buffer.push(record); + } + } + } + + Ok(SoloReadProduct { + sam_records: buffer, + per_feature: outcome.per_feature, + sj: outcome.sj, + velocyto: outcome.velocyto, + }) + }) + .collect(); + + // Sequential write + per-feature record collection. + let n_feat = solo.features.len(); + let mut feat_records: Vec> = (0..n_feat).map(|_| Vec::new()).collect(); + let mut feat_multi: Vec> = (0..n_feat).map(|_| Vec::new()).collect(); + let mut feat_multi_gene: Vec> = + (0..n_feat).map(|_| Vec::new()).collect(); + let mut sj_batch: Vec = Vec::new(); + let mut velo_batch: Vec = Vec::new(); + for result in batch_results { + let product = result?; + writer.write_batch(&product.sam_records.records)?; + for (fi, fo) in product.per_feature.into_iter().enumerate() { + if let Some(r) = fo.record { + feat_records[fi].push(r); + } + if let Some(m) = fo.multi { + feat_multi[fi].push(m); + } + if let Some(mg) = fo.multi_gene { + feat_multi_gene[fi].push(mg); + } + } + sj_batch.extend(product.sj); + velo_batch.extend(product.velocyto); + } + for (fi, recorder) in solo.recorders.iter().enumerate() { + recorder.extend( + std::mem::take(&mut feat_records[fi]), + std::mem::take(&mut feat_multi[fi]), + ); + let mg = std::mem::take(&mut feat_multi_gene[fi]); + if !mg.is_empty() { + recorder.multi_gene.lock().unwrap().extend(mg); + } + } + if !sj_batch.is_empty() { + solo.sj_records.lock().unwrap().extend(sj_batch); + } + if !velo_batch.is_empty() { + 
solo.velocyto_records.lock().unwrap().extend(velo_batch); + } + + read_count += reads_to_process as u64; + if read_count % 100_000 < batch_size as u64 { + info!("STARsolo: processed {read_count} reads..."); + } + if read_count >= max_reads { + break; + } + } + + Ok(()) +} + /// Align paired-end reads #[allow(clippy::too_many_arguments)] fn align_reads_paired_end( @@ -1368,7 +1889,9 @@ fn align_reads_paired_end( let clip5p = params.clip5p_nbases as usize; let clip3p = params.clip3p_nbases as usize; let max_multimaps = params.out_filter_multimap_nmax as usize; - let output_unmapped = params.out_sam_unmapped != params::OutSamUnmapped::None; + // `--outSAMtype None` (e.g. quant-only) skips building SAM records. + let emit_sam = params.emits_alignments(); + let output_unmapped = emit_sam && params.out_sam_unmapped != params::OutSamUnmapped::None; let write_unmapped_fastq = params.out_reads_unmapped == params::OutReadsUnmapped::Fastx; let by_sjout = params.out_filter_type == OutFilterType::BySJout; @@ -1607,8 +2130,10 @@ fn align_reads_paired_end( Vec::new() }; - // Build SAM records - if results.is_empty() { + // Build SAM records (skipped entirely under `--outSAMtype None`). + if !emit_sam { + // count/quant-only: no SAM record construction + } else if results.is_empty() { // Unmapped pair if output_unmapped { let records = SamWriter::build_paired_unmapped_records( diff --git a/src/params/mod.rs b/src/params/mod.rs index a63b5e8..d248b28 100644 --- a/src/params/mod.rs +++ b/src/params/mod.rs @@ -221,6 +221,62 @@ impl std::str::FromStr for TwopassMode { } } +// --------------------------------------------------------------------------- +// STARsolo (single-cell) type +// --------------------------------------------------------------------------- + +/// STAR's `--soloType` — selects the single-cell barcode geometry. +/// +/// Mirrors STAR's `ParametersSolo::typeStr` values. 
/// STAR's `--soloType` — selects the single-cell barcode geometry.
///
/// Mirrors STAR's `ParametersSolo::typeStr` values. Only `None` and
/// `CB_UMI_Simple` (droplet 10x-style) are functional in Phase 14.1; the
/// remaining variants are parsed so the CLI accepts them and later sub-phases
/// can fill in behavior.
///
/// `FromStr` and `Display` use STAR's spelling of each value, so a parsed
/// value prints back as its canonical name (the `Droplet` alias prints as
/// `CB_UMI_Simple`).
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum SoloType {
    /// Not a single-cell run (default).
    #[default]
    None,
    /// One cell barcode + one UMI at fixed positions in the barcode read
    /// (10x Chromium, Drop-seq, inDrops-simple, etc.). STAR alias: `Droplet`.
    CbUmiSimple,
    /// Multi-segment cell barcode and/or UMI, optionally adapter-anchored.
    CbUmiComplex,
    /// Barcodes passed through as SAM tags only (no collapsing).
    CbSamTagOut,
    /// Plate-based Smart-seq: one cell per read-group, no UMI.
    SmartSeq,
}

impl std::str::FromStr for SoloType {
    type Err = String;

    /// Parses a `--soloType` value. Matching is case-sensitive; anything not
    /// listed yields an error message naming the accepted values.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "None" => Ok(Self::None),
            // STAR accepts both the descriptive name and the `Droplet` alias.
            "CB_UMI_Simple" | "Droplet" => Ok(Self::CbUmiSimple),
            "CB_UMI_Complex" => Ok(Self::CbUmiComplex),
            "CB_samTagOut" => Ok(Self::CbSamTagOut),
            "SmartSeq" => Ok(Self::SmartSeq),
            _ => Err(format!(
                "unknown soloType '{s}'; expected None, CB_UMI_Simple, CB_UMI_Complex, CB_samTagOut, or SmartSeq"
            )),
        }
    }
}

impl std::fmt::Display for SoloType {
    /// Writes the canonical STAR spelling of the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            Self::None => "None",
            Self::CbUmiSimple => "CB_UMI_Simple",
            Self::CbUmiComplex => "CB_UMI_Complex",
            Self::CbSamTagOut => "CB_samTagOut",
            Self::SmartSeq => "SmartSeq",
        })
    }
}
for single-end). Each line is one plate-well cell; + /// reads are counted per gene with no UMI. + #[arg(long = "readFilesManifest")] + pub read_files_manifest: Option, + /// Number of reads to map; -1 = all #[arg(long = "readMapNumber", default_value_t = -1, allow_hyphen_values = true)] pub read_map_number: i64, @@ -296,6 +358,13 @@ pub struct Parameters { #[arg(long = "clip3pNbases", default_value_t = 0)] pub clip3p_nbases: u32, + /// Adapter clipping type applied to the cDNA read: `Hamming` (default, + /// adapter-sequence based, no-op when no adapter is configured) or + /// `CellRanger4` (clip the 10x TSO from the 5' end and trim the 3' polyA + /// tail, to match CellRanger ≥ 4.0). + #[arg(long = "clipAdapterType", default_value = "Hamming")] + pub clip_adapter_type: String, + // ── Output ────────────────────────────────────────────────────────── /// Output file name prefix (including path) #[arg(long = "outFileNamePrefix", default_value = "./")] @@ -659,6 +728,88 @@ pub struct Parameters { #[arg(long = "chimOutType", num_args = 1..=2, default_values_t = vec!["Junctions".to_string()])] pub chim_out_type: Vec, + // ── STARsolo (single-cell) ────────────────────────────────────────── + /// Single-cell barcode geometry; `None` disables solo processing. + #[arg(long = "soloType", default_value = "None")] + pub solo_type: SoloType, + + /// Cell-barcode whitelist file (one barcode per line, plain or gzipped). + /// The literal `None` means "no whitelist" (all observed barcodes kept). + /// Multiple files are allowed for `CB_UMI_Complex` (one per CB segment). + #[arg(long = "soloCBwhitelist", num_args = 1.., default_values_t = vec!["None".to_string()])] + pub solo_cb_whitelist: Vec, + + /// 1-based start position of the cell barcode in the barcode read. + #[arg(long = "soloCBstart", default_value_t = 1)] + pub solo_cb_start: u32, + + /// Length of the cell barcode in bases. 
+ #[arg(long = "soloCBlen", default_value_t = 16)] + pub solo_cb_len: u32, + + /// 1-based start position of the UMI in the barcode read. + #[arg(long = "soloUMIstart", default_value_t = 17)] + pub solo_umi_start: u32, + + /// Length of the UMI in bases (10x v2 = 10, v3 = 12). + #[arg(long = "soloUMIlen", default_value_t = 10)] + pub solo_umi_len: u32, + + /// `CB_UMI_Complex` cell-barcode segment positions, one per segment, as + /// `startAnchor_startDist_endAnchor_endDist`. Only read-start anchoring + /// (`anchor = 0`, fixed positions) is supported, e.g. `0_0_0_7 0_8_0_15`. + #[arg(long = "soloCBposition", num_args = 0..)] + pub solo_cb_position: Vec, + + /// `CB_UMI_Complex` UMI position as `startAnchor_startDist_endAnchor_endDist` + /// (read-start anchoring only), e.g. `0_16_0_25`. + #[arg(long = "soloUMIposition", default_value = "")] + pub solo_umi_position: String, + + /// Genomic features to quantify per cell: Gene, GeneFull, SJ, Velocyto, … + #[arg(long = "soloFeatures", num_args = 1.., default_values_t = vec!["Gene".to_string()])] + pub solo_features: Vec, + + /// UMI collapsing strategy: 1MM_All, 1MM_Directional, 1MM_Directional_UMItools, + /// Exact, or NoDedup. + #[arg(long = "soloUMIdedup", num_args = 1.., default_values_t = vec!["1MM_All".to_string()])] + pub solo_umi_dedup: Vec, + + /// Cell-barcode-to-whitelist matching: Exact, 1MM, 1MM_multi, + /// 1MM_multi_pseudocounts, 1MM_multi_Nbase_pseudocounts. + #[arg(long = "soloCBmatchWLtype", default_value = "1MM_multi")] + pub solo_cb_match_wl_type: String, + + /// Cell-calling / matrix filtering: None, CellRanger2.2, EmptyDrops_CR, TopCells. + #[arg(long = "soloCellFilter", num_args = 1.., default_values_t = vec!["CellRanger2.2".to_string(), "3000".to_string(), "0.99".to_string(), "10".to_string()])] + pub solo_cell_filter: Vec, + + /// Counting method for reads mapping to multiple genes: Unique (default, + /// drop), Uniform, Rescue, PropUnique, EM. 
Non-Unique methods additionally + /// write `UniqueAndMult-.mtx` (real-valued) per Gene/GeneFull feature. + #[arg(long = "soloMultiMappers", num_args = 1.., default_values_t = vec!["Unique".to_string()])] + pub solo_multi_mappers: Vec, + + /// Output directory name for solo matrices (relative to `--outFileNamePrefix`). + #[arg(long = "soloOutFileNames", num_args = 1.., default_values_t = vec!["Solo.out/".to_string(), "features.tsv".to_string(), "barcodes.tsv".to_string(), "matrix.mtx".to_string()])] + pub solo_out_file_names: Vec, + + /// Gzip the solo `matrix.mtx` / `barcodes.tsv` / `features.tsv` and append a + /// `.gz` suffix (CellRanger-style output). Default `no` keeps the plain files + /// that STARsolo writes (so the byte-for-byte STARsolo comparison still holds). + #[arg(long = "soloOutGzip", default_value = "no")] + pub solo_out_gzip: String, + + /// Strand of the read relative to the gene for counting: Forward, Reverse, Unstranded. + #[arg(long = "soloStrand", default_value = "Forward")] + pub solo_strand: String, + + /// UMI filtering of multi-gene UMIs: `-`/`None` (default, no filtering), + /// `MultiGeneUMI`, `MultiGeneUMI_CR`, or `MultiGeneUMI_All`. The `_CR` + /// variant matches CellRanger > 3.0. + #[arg(long = "soloUMIfiltering", num_args = 1.., default_values_t = vec!["-".to_string()])] + pub solo_umi_filtering: Vec, + /// Full command line as invoked, embedded in the BAM `@PG` `CL:` field. #[arg(skip)] pub command_line: Option, @@ -670,6 +821,14 @@ impl Parameters { PathBuf::from(format!("{}{suffix}", self.out_file_name_prefix)) } + /// Whether the run produces per-read alignment records (SAM/BAM). False only + /// for `--outSAMtype None` written to a file (no `--outStd`): the alignment + /// loops then skip building SAM records entirely, which is a large saving for + /// solo / quant-only runs that only need the count matrix. 
+ pub fn emits_alignments(&self) -> bool { + !matches!(self.out_std, OutStd::None) || self.out_sam_type.format != OutSamFormat::None + } + /// Whether `--chimOutType` includes `Junctions` (write Chimeric.out.junction). pub fn chim_out_junctions(&self) -> bool { self.chim_out_type.iter().any(|s| s == "Junctions") @@ -856,8 +1015,12 @@ impl Parameters { )); } - // alignReads requires read files - if params.run_mode == RunMode::AlignReads && params.read_files_in.is_empty() { + // alignReads requires read files — except SmartSeq, which gets its reads + // from --readFilesManifest instead. + if params.run_mode == RunMode::AlignReads + && params.read_files_in.is_empty() + && params.solo_type != SoloType::SmartSeq + { return Err(command.error( ErrorKind::MissingRequiredArgument, "--readFilesIn is required when --runMode alignReads", @@ -917,6 +1080,184 @@ impl Parameters { )); } + // ── STARsolo validation ───────────────────────────────────────── + if params.run_mode == RunMode::AlignReads && params.solo_enabled() { + // CB_UMI_Complex needs one CB position + whitelist per segment. + if params.solo_type == SoloType::CbUmiComplex { + if params.solo_cb_position.is_empty() { + return Err(command.error( + ErrorKind::MissingRequiredArgument, + "--soloType CB_UMI_Complex requires --soloCBposition (one per CB segment)", + )); + } + if params.solo_cb_whitelist.len() != params.solo_cb_position.len() { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "--soloType CB_UMI_Complex: {} --soloCBposition segments but {} --soloCBwhitelist files (must match)", + params.solo_cb_position.len(), + params.solo_cb_whitelist.len() + ), + )); + } + } + // SmartSeq is plate-based (one library per manifest cell, no barcodes). 
+ if params.solo_type == SoloType::SmartSeq && params.read_files_manifest.is_none() { + return Err(command.error( + ErrorKind::MissingRequiredArgument, + "--soloType SmartSeq requires --readFilesManifest (a TSV of read1read2cellID per cell)", + )); + } + // CB_UMI_Simple needs exactly two read files: cDNA + barcode read. + if matches!( + params.solo_type, + SoloType::CbUmiSimple | SoloType::CbUmiComplex | SoloType::CbSamTagOut + ) && params.read_files_in.len() != 2 + { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "--soloType {} requires exactly two --readFilesIn files (cDNA read then barcode read); got {}", + params.solo_type, + params.read_files_in.len() + ), + )); + } + // Gene / GeneFull / SJ / Velocyto are implemented. + for f in ¶ms.solo_features { + if !matches!(f.as_str(), "SJ" | "Velocyto") + && f.parse::().is_err() + { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "unsupported --soloFeatures '{f}'; supported: Gene, GeneFull, SJ, Velocyto" + ), + )); + } + } + // soloMultiMappers values. + for m in ¶ms.solo_multi_mappers { + if !matches!( + m.as_str(), + "Unique" | "Uniform" | "Rescue" | "PropUnique" | "EM" + ) { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "unsupported --soloMultiMappers '{m}'; expected Unique, Uniform, Rescue, PropUnique, or EM" + ), + )); + } + } + // Gene-level features need a gene model (SJ does not — junctions come + // from the alignments). + let needs_gtf = params + .solo_features + .iter() + .any(|f| f == "Gene" || f == "GeneFull" || f == "Velocyto"); + if needs_gtf && params.sjdb_gtf_file.is_none() { + return Err(command.error( + ErrorKind::MissingRequiredArgument, + "--soloFeatures Gene/GeneFull requires --sjdbGTFfile (a gene model)", + )); + } + // CB length / UMI length sanity. 
+ if params.solo_type == SoloType::CbUmiSimple + && (params.solo_cb_len == 0 || params.solo_umi_len == 0) + { + return Err(command.error( + ErrorKind::InvalidValue, + "--soloCBlen and --soloUMIlen must be > 0 for soloType CB_UMI_Simple", + )); + } + // Cell barcode cannot exceed a u64 packing (32 bases). + if params.solo_cb_len as usize > crate::solo::whitelist::CB_LEN_MAX { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "--soloCBlen {} exceeds the maximum of {}", + params.solo_cb_len, + crate::solo::whitelist::CB_LEN_MAX + ), + )); + } + // Validate --soloCBmatchWLtype. + if params + .solo_cb_match_wl_type + .parse::() + .is_err() + { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "unknown --soloCBmatchWLtype '{}'; expected Exact, 1MM, 1MM_multi, 1MM_multi_pseudocounts, or 1MM_multi_Nbase_pseudocounts", + params.solo_cb_match_wl_type + ), + )); + } + // Validate --soloUMIdedup (each method string). + for m in ¶ms.solo_umi_dedup { + if m.parse::().is_err() { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "unknown --soloUMIdedup '{m}'; expected Exact, NoDedup, 1MM_All, 1MM_Directional, or 1MM_Directional_UMItools" + ), + )); + } + } + // Validate --soloUMIfiltering (each method string). + for f in ¶ms.solo_umi_filtering { + if f.parse::().is_err() { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "unknown --soloUMIfiltering '{f}'; expected -, None, MultiGeneUMI, MultiGeneUMI_CR, or MultiGeneUMI_All" + ), + )); + } + } + // Validate --clipAdapterType. + if !matches!( + params.clip_adapter_type.as_str(), + "Hamming" | "CellRanger4" | "None" + ) { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "unknown --clipAdapterType '{}'; expected Hamming, CellRanger4, or None", + params.clip_adapter_type + ), + )); + } + // Validate --soloStrand. 
+ if params + .solo_strand + .parse::() + .is_err() + { + return Err(command.error( + ErrorKind::InvalidValue, + format!( + "unknown --soloStrand '{}'; expected Forward, Reverse, or Unstranded", + params.solo_strand + ), + )); + } + // A whitelist is required for any correction beyond None (SmartSeq + // has no cell barcodes at all, so the rule does not apply). + if params.solo_type != SoloType::SmartSeq + && params.solo_cb_whitelist_none() + && params.solo_cb_match_wl_type != "Exact" + { + return Err(command.error( + ErrorKind::InvalidValue, + "--soloCBwhitelist None requires --soloCBmatchWLtype Exact (no correction possible without a whitelist)", + )); + } + } + Ok(params) } @@ -929,6 +1270,56 @@ impl Parameters { pub fn quant_transcriptome_sam(&self) -> bool { self.quant_mode.iter().any(|m| m == "TranscriptomeSAM") } + + /// True when a single-cell run is requested (`--soloType` != None). + pub fn solo_enabled(&self) -> bool { + self.solo_type != SoloType::None + } + + /// Path to the cDNA (transcript) read file. For solo runs this is the + /// FIRST `--readFilesIn` file (STAR convention: `cDNA_read barcode_read`). + /// Returns `None` if no read files are configured. + pub fn cdna_read_file(&self) -> Option<&PathBuf> { + self.read_files_in.first() + } + + /// Path to the barcode (CB+UMI) read file — the SECOND `--readFilesIn` + /// file when solo is enabled. `None` if absent. + pub fn barcode_read_file(&self) -> Option<&PathBuf> { + if self.solo_enabled() { + self.read_files_in.get(1) + } else { + None + } + } + + /// True when the literal `None` whitelist was given (keep all barcodes). + pub fn solo_cb_whitelist_none(&self) -> bool { + self.solo_cb_whitelist.len() == 1 && self.solo_cb_whitelist[0] == "None" + } + + /// Path to the (first) cell-barcode whitelist file, or `None` for the + /// literal `None` whitelist. 
+ pub fn solo_cb_whitelist_path(&self) -> Option { + if self.solo_cb_whitelist_none() { + None + } else { + self.solo_cb_whitelist.first().map(PathBuf::from) + } + } + + /// Parsed `--soloCBmatchWLtype` flags. Falls back to the `1MM_multi` + /// default if somehow unset (validation rejects invalid strings). + pub fn solo_cb_match_type(&self) -> crate::solo::whitelist::CbMatchType { + self.solo_cb_match_wl_type + .parse() + .unwrap_or(crate::solo::whitelist::CbMatchType { + mm1: true, + mm1_multi: true, + mm1_multi_nbase: false, + pseudocounts: false, + }) + } } // --------------------------------------------------------------------------- diff --git a/src/quant/mod.rs b/src/quant/mod.rs index 218f0fa..30b4094 100644 --- a/src/quant/mod.rs +++ b/src/quant/mod.rs @@ -33,6 +33,16 @@ pub struct GeneAnnotation { /// Per-chromosome exon interval list, sorted by (start, end). /// Each entry: (start_0based_incl, end_0based_excl, gene_idx). pub chr_exons: Vec>, + /// Per-chromosome **gene-body** interval list (one entry per gene: its full + /// `[min exon start, max exon end)` span, covering introns), sorted by + /// (start, end). Used by the STARsolo `GeneFull` feature, which counts a + /// read overlapping the gene locus including purely intronic reads. + pub chr_gene_body: Vec>, + /// Per-gene merged, sorted exon intervals `[start, end)` (absolute coords), + /// indexed by `gene_idx`. Used by the `Velocyto` feature to tell whether an + /// aligned block lies wholly within an exon (mature/ambiguous) or extends + /// into an intron (nascent/unspliced). + pub gene_exons: Vec>, } impl GeneAnnotation { @@ -46,6 +56,9 @@ impl GeneAnnotation { let mut gene_id_to_idx: std::collections::HashMap = std::collections::HashMap::new(); let mut chr_exons: Vec> = vec![Vec::new(); n_chrs]; + // Per-gene full span: (chr_idx, min_start, max_end). Accumulated over all + // of a gene's exons to build the GeneFull gene-body intervals. 
+ let mut gene_span: Vec> = Vec::new(); for exon in exons { let gene_id = match exon.attributes.get(gene_tag) { @@ -61,6 +74,7 @@ impl GeneAnnotation { let is_rev = exon.strand == '-'; gene_is_reverse.push(is_rev); gene_ids.push(gene_id); + gene_span.push(None); idx }; @@ -78,6 +92,15 @@ impl GeneAnnotation { let end = chr_offset + exon.end; chr_exons[chr_idx].push((start, end, gene_idx)); + + // Extend this gene's full span. (A gene's exons share one chr.) + match &mut gene_span[gene_idx] { + Some((_, s, e)) => { + *s = (*s).min(start); + *e = (*e).max(end); + } + slot @ None => *slot = Some((chr_idx, start, end)), + } } for exons in &mut chr_exons { @@ -85,13 +108,60 @@ impl GeneAnnotation { exons.dedup(); } + // Build the per-chromosome gene-body interval list. + let mut chr_gene_body: Vec> = vec![Vec::new(); n_chrs]; + for (gene_idx, span) in gene_span.iter().enumerate() { + if let Some((chr_idx, s, e)) = *span { + chr_gene_body[chr_idx].push((s, e, gene_idx)); + } + } + for bodies in &mut chr_gene_body { + bodies.sort_unstable_by_key(|&(s, e, _)| (s, e)); + } + + // Per-gene merged exon intervals (for the Velocyto exonic/intronic test). + let mut gene_exons: Vec> = vec![Vec::new(); gene_ids.len()]; + for chr in &chr_exons { + for &(s, e, g) in chr { + gene_exons[g].push((s, e)); + } + } + for ex in &mut gene_exons { + ex.sort_unstable(); + // Merge overlapping/adjacent exons so a block test is unambiguous. + let mut merged: Vec<(u64, u64)> = Vec::with_capacity(ex.len()); + for &(s, e) in ex.iter() { + if let Some(last) = merged.last_mut() + && s <= last.1 + { + last.1 = last.1.max(e); + } else { + merged.push((s, e)); + } + } + *ex = merged; + } + GeneAnnotation { gene_ids, gene_is_reverse, chr_exons, + chr_gene_body, + gene_exons, } } + /// Whether the aligned block `[start, end)` lies wholly within a single + /// (merged) exon of gene `g` — i.e. it is exonic, not intron-spanning. 
+ pub fn block_is_exonic(&self, g: usize, start: u64, end: u64) -> bool { + let Some(exons) = self.gene_exons.get(g) else { + return false; + }; + // First exon with exon_start > start is at `i`; the candidate is `i-1`. + let i = exons.partition_point(|&(s, _)| s <= start); + i > 0 && exons[i - 1].0 <= start && end <= exons[i - 1].1 + } + /// Build from GTF exon records using default `"gene_id"` attribute (backward-compatible). pub fn from_gtf_exons(exons: &[GtfRecord], genome: &Genome) -> Self { Self::from_gtf_exons_configured(exons, genome, "gene_id") @@ -101,38 +171,68 @@ impl GeneAnnotation { self.gene_ids.len() } - /// Return indices of all genes whose exons overlap any exon of `transcript`. - /// Result is sorted and deduplicated. + /// Return indices of all genes whose exons overlap any exon of `transcript` + /// (the `Gene` feature). Result is sorted and deduplicated. pub fn overlapping_genes(&self, transcript: &Transcript) -> Vec { - if transcript.chr_idx >= self.chr_exons.len() { - return Vec::new(); + let mut out = Vec::new(); + self.overlapping_genes_into(transcript, &mut out); + out + } + + /// Return indices of all genes whose **full body** (exons + introns) + /// overlaps any aligned block of `transcript` (the `GeneFull` feature). A + /// purely intronic read therefore counts here but not in `overlapping_genes`. + pub fn overlapping_genes_full(&self, transcript: &Transcript) -> Vec { + let mut out = Vec::new(); + self.overlapping_genes_full_into(transcript, &mut out); + out + } + + /// `overlapping_genes` into a caller-provided buffer (cleared + sorted/deduped + /// here). Lets the per-read hot path reuse one scratch `Vec` across reads. + pub fn overlapping_genes_into(&self, transcript: &Transcript, out: &mut Vec) { + Self::overlapping_in_into(&self.chr_exons, transcript, out); + } + + /// `overlapping_genes_full` into a caller-provided buffer. 
+ pub fn overlapping_genes_full_into(&self, transcript: &Transcript, out: &mut Vec) { + Self::overlapping_in_into(&self.chr_gene_body, transcript, out); + } + + /// Shared overlap query over a sorted-by-start per-chromosome interval list, + /// writing sorted/deduped gene indices into `out` (which is cleared first). + fn overlapping_in_into( + chr_intervals: &[Vec<(u64, u64, usize)>], + transcript: &Transcript, + out: &mut Vec, + ) { + out.clear(); + if transcript.chr_idx >= chr_intervals.len() { + return; } - let chr = &self.chr_exons[transcript.chr_idx]; + let chr = &chr_intervals[transcript.chr_idx]; if chr.is_empty() { - return Vec::new(); + return; } - let mut genes: Vec = Vec::new(); - for exon in &transcript.exons { let rs = exon.genome_start; let re = exon.genome_end; if re <= rs { continue; } - // All gene exons with start < re are candidates. + // All intervals with start < re are candidates. let upper = chr.partition_point(|&(gs, _, _)| gs < re); for &(_, ge, gene_idx) in &chr[..upper] { // Overlap condition: ge > rs (start already guaranteed < re by upper bound). 
if ge > rs { - genes.push(gene_idx); + out.push(gene_idx); } } } - genes.sort_unstable(); - genes.dedup(); - genes + out.sort_unstable(); + out.dedup(); } } @@ -381,7 +481,7 @@ mod tests { fn make_genome() -> Genome { Genome { - sequence: vec![0u8; 2000], + sequence: vec![0u8; 2000].into(), n_genome: 2000, n_genome_real: 2000, n_chr_real: 2, diff --git a/src/quant/transcriptome.rs b/src/quant/transcriptome.rs index c4c86ae..26df2e0 100644 --- a/src/quant/transcriptome.rs +++ b/src/quant/transcriptome.rs @@ -1203,7 +1203,7 @@ fn extend_softclips( break; } let r1 = read_bases_align_orientation[r_idx]; - let g1 = genome.sequence[g_idx]; + let g1 = genome.sequence.base(g_idx); if r1 != g1 && r1 < 4 && g1 < 4 { n_mm_extra += 1; } @@ -1223,7 +1223,7 @@ fn extend_softclips( break; } let r1 = read_bases_align_orientation[r_idx]; - let g1 = genome.sequence[g_idx]; + let g1 = genome.sequence.base(g_idx); if r1 != g1 && r1 < 4 && g1 < 4 { n_mm_extra += 1; } @@ -1381,7 +1381,7 @@ mod tests { fn make_genome() -> Genome { Genome { - sequence: vec![0u8; 3000], + sequence: vec![0u8; 3000].into(), n_genome: 3000, n_genome_real: 3000, n_chr_real: 2, @@ -2296,7 +2296,7 @@ mod tests { // Aligned region [104, 144) — fill with zeros (A) so read bases match seq[104..144].fill(0); let genome = Genome { - sequence: seq, + sequence: seq.into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, @@ -2349,7 +2349,7 @@ mod tests { // Aligned region [104, 144): all zeros seq[104..144].fill(0); let genome = Genome { - sequence: seq, + sequence: seq.into(), n_genome: 1000, n_genome_real: 1000, n_chr_real: 1, diff --git a/src/solo/count.rs b/src/solo/count.rs new file mode 100644 index 0000000..7ea431e --- /dev/null +++ b/src/solo/count.rs @@ -0,0 +1,1905 @@ +//! UMI deduplication and raw count-matrix output (Phase 14.4). +//! +//! Collates the per-read `(cell, UMI, gene)` records produced during alignment +//! into a sparse per-cell, per-gene count matrix: +//! 1. 
resolve deferred 1MM_multi cell barcodes via the count+quality posterior +//! (STAR `SoloReadFeature_inputRecords.cpp`: weight = exactCount·10^(−q/10)); +//! 2. group reads by `(cell, gene)` and collapse UMIs per `--soloUMIdedup` +//! (STAR `SoloFeature_collapseUMIall.cpp`); +//! 3. write `Solo.out/Gene/raw/{matrix.mtx, barcodes.tsv, features.tsv}` in +//! CellRanger-compatible MatrixMarket layout (features × barcodes, 1-based). + +use crate::error::Error; +use crate::solo::whitelist::CbWhitelist; +use crate::solo::{SoloContext, SoloCountRecord}; +use flate2::Compression; +use flate2::write::GzEncoder; +use std::collections::HashMap; +use std::io::{BufRead, BufReader, Write as _}; +use std::path::{Path, PathBuf}; +use std::str::FromStr; + +/// Open a solo output file, gzipping it (and appending `.gz` to the name) when +/// `gzip` is set. The body is written by the closure; the gzip stream is +/// finished explicitly so the trailer is always flushed. Returns the path written. +pub(crate) fn write_file(path: &Path, gzip: bool, body: F) -> Result +where + F: FnOnce(&mut dyn std::io::Write) -> Result<(), Error>, +{ + let final_path = if gzip { + let mut s = path.as_os_str().to_owned(); + s.push(".gz"); + PathBuf::from(s) + } else { + path.to_path_buf() + }; + let file = std::fs::File::create(&final_path).map_err(|e| Error::io(e, &final_path))?; + if gzip { + let mut enc = GzEncoder::new(file, Compression::default()); + body(&mut enc)?; + enc.finish().map_err(|e| Error::io(e, &final_path))?; + } else { + let mut w = std::io::BufWriter::new(file); + body(&mut w)?; + w.flush().map_err(|e| Error::io(e, &final_path))?; + } + Ok(final_path) +} + +// --------------------------------------------------------------------------- +// UMI deduplication +// --------------------------------------------------------------------------- + +/// `--soloUMIdedup` method. 
/// `--soloUMIdedup` method.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UmiDedup {
    /// Count distinct UMI sequences (no error correction).
    Exact,
    /// No collapsing — count every read.
    NoDedup,
    /// Collapse all UMIs within Hamming-1 transitively (connected components).
    OneMmAll,
    /// UMI-tools directional, `count_hub >= 2*count_leaf + 0`.
    OneMmDirectional,
    /// UMI-tools directional original, `count_hub >= 2*count_leaf - 1`.
    OneMmDirectionalUmiTools,
    /// CellRanger 2–4 1MM collapse: each UMI is corrected to a higher-count
    /// 1MM neighbor (non-transitive); count = distinct corrected UMIs.
    OneMmCr,
}

impl FromStr for UmiDedup {
    type Err = String;

    /// Parses a `--soloUMIdedup` value (case-sensitive STAR spelling).
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "Exact" => Ok(Self::Exact),
            "NoDedup" => Ok(Self::NoDedup),
            "1MM_All" => Ok(Self::OneMmAll),
            "1MM_Directional" => Ok(Self::OneMmDirectional),
            "1MM_Directional_UMItools" => Ok(Self::OneMmDirectionalUmiTools),
            "1MM_CR" => Ok(Self::OneMmCr),
            _ => Err(format!(
                "unknown soloUMIdedup '{s}'; expected Exact, NoDedup, 1MM_All, 1MM_Directional, 1MM_Directional_UMItools, or 1MM_CR"
            )),
        }
    }
}

/// `--soloUMIfiltering`: removal of UMIs that map to multiple genes within a cell.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UmiFiltering {
    /// No multi-gene UMI filtering.
    None,
    /// Remove lower-count gene assignments of a multi-gene UMI; if every gene
    /// has a single read, drop the UMI entirely (STAR `MultiGeneUMI`).
    MultiGeneUmi,
    /// CellRanger > 3.0 variant: keep only the highest-read-count gene for a
    /// multi-gene UMI (ties retained), without the all-singletons drop.
    MultiGeneUmiCr,
}

impl FromStr for UmiFiltering {
    type Err = String;

    /// Parses a `--soloUMIfiltering` value (case-sensitive STAR spelling).
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "-" | "None" => Ok(Self::None),
            // MultiGeneUMI_All behaves like MultiGeneUMI for the count matrix.
            "MultiGeneUMI" | "MultiGeneUMI_All" => Ok(Self::MultiGeneUmi),
            "MultiGeneUMI_CR" => Ok(Self::MultiGeneUmiCr),
            _ => Err(format!(
                "unknown soloUMIfiltering '{s}'; expected -, None, MultiGeneUMI, MultiGeneUMI_CR, or MultiGeneUMI_All"
            )),
        }
    }
}

/// True if packed UMIs `a` and `b` differ at exactly one of their first `len`
/// bases (2 bits per base, base `i` in bits `2i..2i+2`).
///
/// Differences above base `len` are ignored. A `u64` holds at most 32 bases,
/// so `len >= 32` compares the whole word instead of overflowing the shift.
fn hamming1(a: u64, b: u64, len: usize) -> bool {
    /// Low bit of every 2-bit base slot.
    const BASE_LOW_BITS: u64 = 0x5555_5555_5555_5555;
    let x = a ^ b;
    // Fold each base's two difference bits onto its low bit: one marker per
    // differing base.
    let differing_bases = (x | (x >> 1)) & BASE_LOW_BITS;
    let in_range = if len >= 32 {
        u64::MAX
    } else {
        (1u64 << (2 * len)) - 1
    };
    (differing_bases & in_range).count_ones() == 1
}

/// Deduplicate the UMIs observed for one `(cell, gene)` pair into a molecule
/// count. `umis` maps each packed UMI to its read multiplicity; `umi_len` is
/// the UMI length in bases (used by the 1-mismatch methods).
#[allow(clippy::implicit_hasher)] // always called with the default hasher
pub fn dedup_count(umis: &HashMap<u64, u32>, method: UmiDedup, umi_len: usize) -> u64 {
    match method {
        UmiDedup::Exact => umis.len() as u64,
        UmiDedup::NoDedup => umis.values().map(|&c| u64::from(c)).sum(),
        UmiDedup::OneMmAll => connected_components(umis, umi_len),
        UmiDedup::OneMmDirectional => directional(umis, umi_len, 0),
        UmiDedup::OneMmDirectionalUmiTools => directional(umis, umi_len, -1),
        UmiDedup::OneMmCr => cellranger_1mm(umis, umi_len),
    }
}

/// 1MM_CR: CellRanger's 1-mismatch UMI collapse (STAR `umiArrayCorrect_CR`).
/// UMIs are sorted ascending by `(count, umi)`; each UMI is corrected to the
/// LAST 1MM neighbor with a strictly later sort position — i.e. its
/// highest-count 1MM neighbor. Correction is non-transitive (it points to the
/// neighbor's raw UMI, not its corrected value); the molecule count is the
/// number of distinct corrected UMIs.
fn cellranger_1mm(umis: &HashMap<u64, u32>, umi_len: usize) -> u64 {
    let mut items: Vec<(u64, u32)> = umis.iter().map(|(&u, &c)| (u, c)).collect();
    // Ascending by count, then by UMI value (mirrors funCompareSolo1 ordering).
    // Map keys are unique, so the order is total and an unstable sort is exact.
    items.sort_unstable_by(|a, b| a.1.cmp(&b.1).then(a.0.cmp(&b.0)));

    let corrected: std::collections::HashSet<u64> = items
        .iter()
        .enumerate()
        .map(|(iu, &(umi, _))| {
            // Scan later entries from the end so the highest-ranked neighbor wins;
            // a UMI with no such neighbor keeps its own value.
            items[iu + 1..]
                .iter()
                .rev()
                .find(|&&(other, _)| hamming1(umi, other, umi_len))
                .map_or(umi, |&(other, _)| other)
        })
        .collect();
    corrected.len() as u64
}

/// 1MM_All: number of connected components when UMIs within Hamming-1 are
/// merged transitively (union-find).
fn connected_components(umis: &HashMap<u64, u32>, umi_len: usize) -> u64 {
    /// Root of `x`, halving the path on the way up.
    fn find(parent: &mut [usize], mut x: usize) -> usize {
        while parent[x] != x {
            parent[x] = parent[parent[x]];
            x = parent[x];
        }
        x
    }

    let keys: Vec<u64> = umis.keys().copied().collect();
    let n = keys.len();
    if n <= 1 {
        return n as u64;
    }
    let mut parent: Vec<usize> = (0..n).collect();
    for i in 0..n {
        for j in (i + 1)..n {
            if hamming1(keys[i], keys[j], umi_len) {
                let ri = find(&mut parent, i);
                let rj = find(&mut parent, j);
                if ri != rj {
                    parent[ri] = rj;
                }
            }
        }
    }
    // Each component has exactly one self-parented node (its root).
    parent.iter().enumerate().filter(|&(i, &p)| i == p).count() as u64
}

/// 1MM_Directional: a lower-or-equal-count UMI within Hamming-1 of a hub whose
/// count satisfies `count_hub >= 2*count_leaf + dir_count_add` is absorbed; the
/// molecule count is the number of surviving (non-absorbed) UMIs. Absorbed UMIs
/// never act as hubs themselves.
fn directional(umis: &HashMap<u64, u32>, umi_len: usize, dir_count_add: i64) -> u64 {
    // Sort by count desc, then by UMI value for determinism.
    let mut items: Vec<(u64, u32)> = umis.iter().map(|(&u, &c)| (u, c)).collect();
    items.sort_unstable_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
    let n = items.len();
    let mut absorbed = vec![false; n];
    for i in 0..n {
        if absorbed[i] {
            continue;
        }
        let (hub, hub_count) = (items[i].0, i64::from(items[i].1));
        // Only later entries can be leaves: they have count <= hub_count by the
        // sort. An earlier surviving entry with an equal count would already
        // have absorbed this hub (the test is symmetric for equal counts), and
        // one with a larger count is never a leaf.
        for j in (i + 1)..n {
            if absorbed[j] {
                continue;
            }
            let leaf_count = i64::from(items[j].1);
            if hub_count >= 2 * leaf_count + dir_count_add
                && hamming1(hub, items[j].0, umi_len)
            {
                absorbed[j] = true;
            }
        }
    }
    absorbed.iter().filter(|&&gone| !gone).count() as u64
}
+fn resolve_multi_cb( + candidates: &[crate::solo::whitelist::CbCandidate], + exact_counts: &[u64], + pseudocount: f64, +) -> Option { + let mut best: Option<(u32, f64)> = None; + let mut total = 0.0f64; + for c in candidates { + let prior = *exact_counts.get(c.wl_index as usize).unwrap_or(&0) as f64 + pseudocount; + let q = f64::from(c.mismatch_qual.saturating_sub(33)); // Phred+33 → Phred + let weight = prior * 10f64.powf(-q / 10.0); + total += weight; + match best { + Some((_, w)) if w >= weight => {} + _ => best = Some((c.wl_index, weight)), + } + } + match best { + Some((idx, w)) if total > 0.0 && w > 0.0 => Some(idx), + _ => None, + } +} + +// --------------------------------------------------------------------------- +// Matrix assembly + output +// --------------------------------------------------------------------------- + +/// Build and stream the raw count matrix to `matrix_path` in one per-cell pass, +/// returning the number of non-zero entries written. +/// +/// Mirrors STAR's `SoloFeature_collapseUMIall.cpp`: the flat record list is +/// sorted by cell barcode so each cell's reads are contiguous, then **one cell +/// is processed at a time** (Step 1 — peak build memory is a single cell's +/// `umi → gene` maps, not a global `cell → umi → gene` nest over all records). +/// +/// Step 2 (streaming output): each cell's `gene → count` entries are written +/// straight to a temporary MatrixMarket body as they are produced — the global +/// `cell → (gene → count)` map is never materialized. `nnz` is counted on the +/// fly; the final `matrix.mtx` is the header (`rows cols nnz`) followed by the +/// temp body (the BySJout temp-file pattern). So matrix-output memory is bounded +/// by one cell regardless of how many cells the raw whitelist matrix spans. +/// +/// Records are sorted by cb (ascending column), and each cell's genes are +/// emitted ascending, so entries come out in the same order as before. 
+#[allow(clippy::too_many_arguments)] +/// Per-cell summary collected while streaming the matrix: the whitelist barcode +/// index, reads (records before UMI dedup), UMIs (deduped column sum), and genes +/// detected (nonzero entries). +#[derive(Clone, Copy)] +pub struct CellStat { + pub cb: u32, + pub n_reads: u64, + pub n_umis: u64, + pub n_genes: u32, +} + +/// What `build_matrix_body` returns alongside the temp matrix body. +pub struct MatrixStats { + pub nnz: usize, + /// One entry per barcode that received ≥1 UMI (the raw, unfiltered set). + pub cells: Vec, + /// Distinct genes with a nonzero count anywhere in the raw matrix. + pub genes_detected: u32, +} + +/// Stream the per-cell deduplicated counts into a plain temporary MatrixMarket +/// *body* (`gene+1 cb+1 count`, barcode-ascending) and collect per-cell stats. +/// The body is finalized into `raw/` (and optionally `filtered/`) by the caller, +/// which lets the raw + filtered matrices share one streaming pass. +#[allow(clippy::too_many_arguments)] +fn build_matrix_body( + ctx: &SoloContext, + recorder: &crate::solo::SoloRecorder, + method: UmiDedup, + filtering: UmiFiltering, + umi_len: usize, + pseudocount: f64, + dir: &Path, + n_features: usize, +) -> Result<(tempfile::NamedTempFile, MatrixStats), Error> { + let mut body_tmp = tempfile::Builder::new() + .prefix(".matrix_body") + .tempfile_in(dir) + .map_err(|e| Error::io(e, dir))?; + let mut nnz = 0usize; + let mut cell_stats: Vec = Vec::new(); + let mut gene_seen = vec![false; n_features]; + + { + let mut body = std::io::BufWriter::new(body_tmp.as_file_mut()); + + // Move records out of the recorder; fold in resolved 1MM_multi cells. 
+ let mut records = std::mem::take(&mut *recorder.records.lock().unwrap()); + let exact_counts = ctx.whitelist.exact_count_snapshot(); + let multi = std::mem::take(&mut *recorder.multi_records.lock().unwrap()); + for m in &multi { + if let Some(cb) = resolve_multi_cb(&m.candidates, &exact_counts, pseudocount) { + records.push(SoloCountRecord { + cb, + umi: m.umi, + gene: m.gene, + }); + } + } + drop(multi); + + // Group each cell's reads together so we can process + free one at a time. + records.sort_unstable_by_key(|r| r.cb); + + let mut i = 0; + while i < records.len() { + let cb = records[i].cb; + + // umi → gene → read multiplicity, for this cell only. + let mut umi_genes: HashMap> = HashMap::new(); + let mut j = i; + while j < records.len() && records[j].cb == cb { + let r = &records[j]; + *umi_genes + .entry(r.umi) + .or_default() + .entry(r.gene) + .or_insert(0) += 1; + j += 1; + } + + // (gene → (umi → read_count)) after multi-gene UMI filtering. + let mut gene_umis: HashMap> = HashMap::new(); + for (&umi, genes) in &umi_genes { + for (&gene, &rc) in filter_multi_gene_umi(genes, filtering) { + *gene_umis.entry(gene).or_default().entry(umi).or_insert(0) += rc; + } + } + + // Collapse UMIs per gene, then emit this cell's entries gene-ascending. + let mut cell_entries: Vec<(u32, u64)> = Vec::with_capacity(gene_umis.len()); + for (&gene, umis) in &gene_umis { + let count = dedup_count(umis, method, umi_len); + if count > 0 { + cell_entries.push((gene, count)); + } + } + cell_entries.sort_unstable_by_key(|&(g, _)| g); + // Per-cell summary: reads = records (j-i), genes = nonzero entries, + // UMIs = sum of deduped counts. 
+ let n_reads = (j - i) as u64; + let n_genes = cell_entries.len() as u32; + let mut n_umis = 0u64; + for (g, c) in cell_entries { + n_umis += c; + gene_seen[g as usize] = true; + writeln!(body, "{} {} {}", g + 1, cb + 1, c).map_err(|e| Error::io(e, dir))?; + nnz += 1; + } + if n_umis > 0 { + cell_stats.push(CellStat { + cb, + n_reads, + n_umis, + n_genes, + }); + } + + i = j; + } + body.flush().map_err(|e| Error::io(e, dir))?; + } + + let genes_detected = gene_seen.iter().filter(|&&s| s).count() as u32; + Ok(( + body_tmp, + MatrixStats { + nnz, + cells: cell_stats, + genes_detected, + }, + )) +} + +/// Write a final `matrix.mtx[.gz]` = MatrixMarket header + (optionally +/// cb-remapped/filtered) body. With `remap = None` the body is copied verbatim +/// (raw); with `Some(map)` only columns in the map survive, renumbered to the +/// `n_cols` called cells. Returns the entry count written. +fn finalize_matrix( + body: &tempfile::NamedTempFile, + out_path: &Path, + gzip: bool, + n_features: usize, + n_cols: usize, + raw_nnz: usize, + remap: Option<&HashMap>, +) -> Result { + // For the filtered matrix we must know nnz before the header, so first build + // the remapped body into a temp and count it; raw reuses the known nnz. 
+ let (src, nnz): (PathBuf, usize) = match remap { + None => (body.path().to_path_buf(), raw_nnz), + Some(map) => { + let dir = out_path.parent().unwrap_or_else(|| Path::new(".")); + let mut ftmp = tempfile::Builder::new() + .prefix(".matrix_filt") + .tempfile_in(dir) + .map_err(|e| Error::io(e, dir))?; + let mut kept = 0usize; + { + let mut w = std::io::BufWriter::new(ftmp.as_file_mut()); + let reader = BufReader::new( + std::fs::File::open(body.path()).map_err(|e| Error::io(e, body.path()))?, + ); + for line in reader.lines() { + let line = line.map_err(|e| Error::io(e, body.path()))?; + let mut it = line.split(' '); + let (Some(gene), Some(cb1), Some(cnt)) = (it.next(), it.next(), it.next()) + else { + continue; + }; + let cb0: u32 = cb1.parse::().unwrap_or(0).saturating_sub(1); + if let Some(&col) = map.get(&cb0) { + writeln!(w, "{gene} {col} {cnt}").map_err(|e| Error::io(e, out_path))?; + kept += 1; + } + } + w.flush().map_err(|e| Error::io(e, out_path))?; + } + ( + ftmp.into_temp_path() + .keep() + .map_err(|e| Error::io(e.error, out_path))?, + kept, + ) + } + }; + + write_file(out_path, gzip, |w| { + writeln!(w, "%%MatrixMarket matrix coordinate integer general") + .map_err(|e| Error::io(e, out_path))?; + writeln!(w, "%").map_err(|e| Error::io(e, out_path))?; + writeln!(w, "{n_features} {n_cols} {nnz}").map_err(|e| Error::io(e, out_path))?; + let mut r = std::fs::File::open(&src).map_err(|e| Error::io(e, &src))?; + std::io::copy(&mut r, w).map_err(|e| Error::io(e, out_path))?; + Ok(()) + })?; + if remap.is_some() { + let _ = std::fs::remove_file(&src); // best-effort cleanup of the filtered temp + } + Ok(nnz) +} + +/// `--soloMultiMappers` method (non-`Unique` ones produce a `UniqueAndMult-*.mtx`). 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MultiMethod { + Uniform, + Rescue, + PropUnique, + Em, +} + +impl MultiMethod { + fn name(self) -> &'static str { + match self { + MultiMethod::Uniform => "Uniform", + MultiMethod::Rescue => "Rescue", + MultiMethod::PropUnique => "PropUnique", + MultiMethod::Em => "EM", + } + } + + /// Parse `--soloMultiMappers` values, dropping `Unique` (no extra matrix). + pub fn parse_list(vals: &[String]) -> Vec { + vals.iter() + .filter_map(|v| match v.as_str() { + "Uniform" => Some(MultiMethod::Uniform), + "Rescue" => Some(MultiMethod::Rescue), + "PropUnique" => Some(MultiMethod::PropUnique), + "EM" => Some(MultiMethod::Em), + _ => None, + }) + .collect() + } +} + +/// Distribute one cell's gene-ambiguous molecules across their gene sets and add +/// to the unique counts `u`, returning the combined (unique + multi) per-gene +/// counts. `molecules` is one gene set per deduplicated multi-gene UMI. +fn distribute_multi( + method: MultiMethod, + u: &HashMap, + molecules: &[Vec], +) -> HashMap { + let mut out = u.clone(); + let unit = |s: &[u32]| 1.0 / s.len() as f64; + let get = |m: &HashMap, g: u32| m.get(&g).copied().unwrap_or(0.0); + match method { + MultiMethod::Uniform => { + for s in molecules { + let w = unit(s); + for &g in s { + *out.entry(g).or_insert(0.0) += w; + } + } + } + MultiMethod::PropUnique => { + for s in molecules { + let total: f64 = s.iter().map(|&g| get(u, g)).sum(); + for &g in s { + let w = if total > 0.0 { + get(u, g) / total + } else { + unit(s) + }; + *out.entry(g).or_insert(0.0) += w; + } + } + } + MultiMethod::Rescue => { + // Weights = unique counts + a uniform spread of the multi molecules. 
+ let mut unif: HashMap = HashMap::new(); + for s in molecules { + let w = unit(s); + for &g in s { + *unif.entry(g).or_insert(0.0) += w; + } + } + for s in molecules { + let total: f64 = s.iter().map(|&g| get(u, g) + get(&unif, g)).sum(); + for &g in s { + let w = if total > 0.0 { + (get(u, g) + get(&unif, g)) / total + } else { + unit(s) + }; + *out.entry(g).or_insert(0.0) += w; + } + } + } + MultiMethod::Em => { + // theta_g = u_g + (multi distributed proportional to theta), iterated. + let mut theta = u.clone(); + for s in molecules { + for &g in s { + theta.entry(g).or_insert(0.0); + } + } + for _ in 0..100 { + let mut next = u.clone(); + for s in molecules { + for &g in s { + next.entry(g).or_insert(0.0); + } + } + for s in molecules { + let total: f64 = s.iter().map(|&g| get(&theta, g)).sum(); + for &g in s { + let w = if total > 0.0 { + get(&theta, g) / total + } else { + unit(s) + }; + *next.get_mut(&g).unwrap() += w; + } + } + let delta: f64 = next.iter().map(|(g, v)| (v - get(&theta, *g)).abs()).sum(); + theta = next; + if delta < 1e-6 { + break; + } + } + out = theta; + } + } + out +} + +/// Format a real matrix value compactly (integers without a decimal point). +fn fmt_real(v: f64) -> String { + if v.fract().abs() < 1e-9 { + format!("{}", v.round() as i64) + } else { + format!("{v:.5}") + } +} + +/// Write the `UniqueAndMult-.mtx` matrices (real-valued) for the +/// `--soloMultiMappers` methods. Re-reads the raw matrix body (per-cell unique +/// counts, cb-ascending) and merges each cell with its gene-ambiguous molecules +/// (deduplicated by UMI, gene set = union). Cells present only in multi records +/// (no unique gene) are skipped. 
+#[allow(clippy::too_many_arguments)] +fn build_multi_matrices( + raw_body: &tempfile::NamedTempFile, + multi_records: &[crate::solo::MultiGeneRecord], + methods: &[MultiMethod], + dir: &Path, + matrix_name: &str, + n_features: usize, + n_barcodes: usize, + gzip: bool, +) -> Result<(), Error> { + if methods.is_empty() { + return Ok(()); + } + let mut multi: Vec<&crate::solo::MultiGeneRecord> = multi_records.iter().collect(); + multi.sort_unstable_by_key(|r| r.cb); + + // Per-method temp body + entry count. + let mut bodies: Vec = Vec::new(); + for _ in methods { + bodies.push( + tempfile::Builder::new() + .prefix(".um_body") + .tempfile_in(dir) + .map_err(|e| Error::io(e, dir))?, + ); + } + let mut nnz = vec![0usize; methods.len()]; + + // Gather one cell's multi molecules (gene sets, one per deduped UMI). + let cell_molecules = |cb: u32, mptr: &mut usize| -> Vec> { + while *mptr < multi.len() && multi[*mptr].cb < cb { + *mptr += 1; // skip multi-only cells (no unique gene) + } + let mut by_umi: HashMap> = HashMap::new(); + while *mptr < multi.len() && multi[*mptr].cb == cb { + let r = multi[*mptr]; + by_umi + .entry(r.umi) + .or_default() + .extend(r.genes.iter().copied()); + *mptr += 1; + } + by_umi + .into_values() + .map(|s| s.into_iter().collect()) + .collect() + }; + + { + let mut writers: Vec> = bodies + .iter_mut() + .map(|t| std::io::BufWriter::new(t.as_file_mut())) + .collect(); + let reader = BufReader::new( + std::fs::File::open(raw_body.path()).map_err(|e| Error::io(e, raw_body.path()))?, + ); + let mut mptr = 0usize; + let mut cur_cb: Option = None; + let mut u_map: HashMap = HashMap::new(); + + let mut flush = |cb: u32, + u: &HashMap, + mptr: &mut usize, + nnz: &mut [usize]| + -> Result<(), Error> { + let mols = cell_molecules(cb, mptr); + for (k, &m) in methods.iter().enumerate() { + let counts = distribute_multi(m, u, &mols); + let mut entries: Vec<(u32, f64)> = + counts.into_iter().filter(|&(_, v)| v > 1e-9).collect(); + 
entries.sort_unstable_by_key(|&(g, _)| g); + for (g, v) in entries { + writeln!(writers[k], "{} {} {}", g + 1, cb + 1, fmt_real(v)) + .map_err(|e| Error::io(e, dir))?; + nnz[k] += 1; + } + } + Ok(()) + }; + + for line in reader.lines() { + let line = line.map_err(|e| Error::io(e, raw_body.path()))?; + let mut it = line.split(' '); + let (Some(gt), Some(ct), Some(vt)) = (it.next(), it.next(), it.next()) else { + continue; + }; + let g: u32 = gt.parse::().unwrap_or(1) - 1; + let cb: u32 = ct.parse::().unwrap_or(1) - 1; + let v: f64 = vt.parse().unwrap_or(0.0); + if cur_cb != Some(cb) { + if let Some(prev) = cur_cb { + flush(prev, &u_map, &mut mptr, &mut nnz)?; + } + cur_cb = Some(cb); + u_map.clear(); + } + *u_map.entry(g).or_insert(0.0) += v; + } + if let Some(prev) = cur_cb { + flush(prev, &u_map, &mut mptr, &mut nnz)?; + } + for w in &mut writers { + w.flush().map_err(|e| Error::io(e, dir))?; + } + } + + // Finalize each UniqueAndMult-.mtx (real-valued MatrixMarket). + for ((m, body), &n) in methods.iter().zip(&bodies).zip(&nnz) { + let path = dir.join(format!("UniqueAndMult-{}.mtx", m.name())); + write_file(&path, gzip, |w| { + writeln!(w, "%%MatrixMarket matrix coordinate real general") + .map_err(|e| Error::io(e, &path))?; + writeln!(w, "%").map_err(|e| Error::io(e, &path))?; + writeln!(w, "{n_features} {n_barcodes} {n}").map_err(|e| Error::io(e, &path))?; + let mut r = std::fs::File::open(body.path()).map_err(|e| Error::io(e, body.path()))?; + std::io::copy(&mut r, w).map_err(|e| Error::io(e, &path))?; + Ok(()) + })?; + } + let _ = matrix_name; // UniqueAndMult uses a fixed name scheme + Ok(()) +} + +/// Apply `--soloUMIfiltering` to the gene→read_count map of a single UMI, +/// returning the surviving (gene, read_count) entries. 
+fn filter_multi_gene_umi(genes: &HashMap, filtering: UmiFiltering) -> Vec<(&u32, &u32)> { + if filtering == UmiFiltering::None || genes.len() <= 1 { + return genes.iter().collect(); + } + let max = genes.values().copied().max().unwrap_or(0); + match filtering { + // STAR MultiGeneUMI: threshold = max (or 2 if max==1, dropping all + // single-read multi-gene UMIs); keep genes with read_count >= threshold. + UmiFiltering::MultiGeneUmi => { + let thresh = if max == 1 { 2 } else { max }; + genes.iter().filter(|&(_, &rc)| rc >= thresh).collect() + } + // CellRanger > 3.0: keep the highest-read-count gene(s); no singleton drop. + UmiFiltering::MultiGeneUmiCr => genes.iter().filter(|&(_, &rc)| rc >= max).collect(), + UmiFiltering::None => unreachable!(), + } +} + +/// CellRanger-2.2 knee threshold on per-barcode UMI totals (STARsolo's default +/// `--soloCellFilter CellRanger2.2 3000 0.99 10`). Returns the minimum UMI count +/// for a barcode to be called a cell. +fn knee_cr22(umis_desc: &[u64], n_expected: usize, max_pct: f64, max_min_ratio: f64) -> u64 { + if umis_desc.is_empty() { + return 0; + } + let idx = ((n_expected as f64 * (1.0 - max_pct)).round() as usize).min(umis_desc.len() - 1); + let robust_max = umis_desc[idx] as f64; + (robust_max / max_min_ratio).ceil() as u64 +} + +/// Whitelist indices of called cells (sorted ascending) per `--soloCellFilter`. +/// `None` → no filtered/ output. `EmptyDrops_CR` writes only the knee-guaranteed +/// cells here (the Monte-Carlo rescue is the standalone `emptydrops` binary). 
+fn called_cells(cells: &[CellStat], filter: &[String]) -> Option> { + let method = filter.first().map_or("CellRanger2.2", String::as_str); + let arg = |i: usize, d: f64| filter.get(i).and_then(|s| s.parse().ok()).unwrap_or(d); + let mut cbs: Vec = match method { + "None" => return None, + "TopCells" => { + let n = arg(1, 0.0) as usize; + let mut idx: Vec<&CellStat> = cells.iter().collect(); + idx.sort_by(|a, b| b.n_umis.cmp(&a.n_umis).then(a.cb.cmp(&b.cb))); + idx.into_iter().take(n).map(|c| c.cb).collect() + } + // EmptyDrops_CR is handled by `emptydrops_called`; the knee here is the + // fallback / guaranteed-cell base. + "CellRanger2.2" | "EmptyDrops_CR" => { + let mut umis: Vec = cells.iter().map(|c| c.n_umis).collect(); + umis.sort_unstable_by(|a, b| b.cmp(a)); + let thr = knee_cr22(&umis, arg(1, 3000.0) as usize, arg(2, 0.99), arg(3, 10.0)); + cells + .iter() + .filter(|c| c.n_umis >= thr) + .map(|c| c.cb) + .collect() + } + other => { + log::warn!("--soloCellFilter '{other}' not supported; skipping filtered/ output"); + return None; + } + }; + cbs.sort_unstable(); + Some(cbs) +} + +/// `--soloCellFilter EmptyDrops_CR`: the CR2.2-knee guaranteed cells PLUS cells +/// rescued by the EmptyDrops multinomial Monte-Carlo test (STAR +/// `SoloFeature_emptyDrops_CR.cpp`). Per-cell gene profiles for the ambient + +/// candidate cells are read back from the raw matrix body. `filter` is the +/// `EmptyDrops_CR nExpected maxPct maxMinRatio indMin indMax umiMin +/// umiMinFracMedian candMaxN FDR [simN]` argument list. 
+fn emptydrops_called( + cells: &[CellStat], + body: &tempfile::NamedTempFile, + n_features: usize, + filter: &[String], +) -> Result, Error> { + use rand::SeedableRng; + use rand::distr::{Distribution, weighted::WeightedIndex}; + let arg = |i: usize, d: f64| { + filter + .get(i) + .and_then(|s| s.parse::().ok()) + .unwrap_or(d) + }; + let (n_expected, max_pct, ratio) = (arg(1, 3000.0) as usize, arg(2, 0.99), arg(3, 10.0)); + let (ind_min, ind_max) = (arg(4, 45000.0) as usize, arg(5, 90000.0) as usize); + let umi_min = arg(6, 500.0) as u64; + let umi_min_frac = arg(7, 0.01); + let cand_max = arg(8, 20000.0) as usize; + let fdr = arg(9, 0.01); + let sim_n = arg(10, 10000.0).max(1.0) as usize; + + // Rank by total UMI (descending, cb tie-break). + let mut order: Vec<&CellStat> = cells.iter().collect(); + order.sort_by(|a, b| b.n_umis.cmp(&a.n_umis).then(a.cb.cmp(&b.cb))); + let totals_desc: Vec = order.iter().map(|c| c.n_umis).collect(); + let thr = knee_cr22(&totals_desc, n_expected, max_pct, ratio); + let n_simple = totals_desc.iter().take_while(|&&u| u >= thr).count(); + let mut called: Vec = order.iter().take(n_simple).map(|c| c.cb).collect(); + + // Candidate cells: rank ≥ nSimple, total ≥ minUMI, up to candMaxN. + let median_top = totals_desc.get(n_simple / 2).copied().unwrap_or(0); + let min_umi = umi_min.max((umi_min_frac * median_top as f64) as u64); + let mut cand_cbs: Vec = Vec::new(); + for c in order.iter().skip(n_simple).take(cand_max) { + if c.n_umis < min_umi { + break; + } + cand_cbs.push(c.cb); + } + if cand_cbs.is_empty() { + called.sort_unstable(); + return Ok(called); + } + let cand_set: std::collections::HashSet = cand_cbs.iter().copied().collect(); + let ambient_set: std::collections::HashSet = order + .iter() + .skip(ind_min) + .take(ind_max.saturating_sub(ind_min)) + .map(|c| c.cb) + .collect(); + + // Re-read the raw body for ambient (summed) + per-candidate profiles. 
+ let mut ambient = vec![0f64; n_features]; + let mut amb_total = 0f64; + let mut cand_profiles: HashMap> = HashMap::new(); + let reader = + BufReader::new(std::fs::File::open(body.path()).map_err(|e| Error::io(e, body.path()))?); + for line in reader.lines() { + let line = line.map_err(|e| Error::io(e, body.path()))?; + let mut it = line.split(' '); + let (Some(gt), Some(ct), Some(vt)) = (it.next(), it.next(), it.next()) else { + continue; + }; + let g = gt.parse::().unwrap_or(1) - 1; + let cb = ct.parse::().unwrap_or(1) - 1; + let v = vt.parse::().unwrap_or(0); + if ambient_set.contains(&cb) { + ambient[g as usize] += v as f64; + amb_total += v as f64; + } + if cand_set.contains(&cb) { + cand_profiles.entry(cb).or_default().push((g, v)); + } + } + if amb_total == 0.0 { + called.sort_unstable(); + return Ok(called); + } + + // Ambient probabilities with a Good-Turing P0 unseen-mass correction. + let n1 = ambient.iter().filter(|&&x| (x - 1.0).abs() < 0.5).count() as f64; + let p0 = (n1 / amb_total).clamp(1e-12, 0.5); + let n_zero = ambient.iter().filter(|&&x| x == 0.0).count().max(1) as f64; + let amb_p: Vec = ambient + .iter() + .map(|&x| { + if x > 0.0 { + (1.0 - p0) * x / amb_total + } else { + p0 / n_zero + } + }) + .collect(); + let amb_logp: Vec = amb_p.iter().map(|&p| p.max(1e-300).ln()).collect(); + + // Observed multinomial log-prob per candidate. 
+ let max_count = cand_cbs + .iter() + .filter_map(|cb| cand_profiles.get(cb)) + .map(|p| p.iter().map(|&(_, c)| c as usize).sum::()) + .max() + .unwrap_or(0); + let mut log_fac = vec![0f64; max_count + 1]; + for i in 2..=max_count { + log_fac[i] = log_fac[i - 1] + (i as f64).ln(); + } + let obs: Vec<(u32, usize, f64)> = cand_cbs + .iter() + .filter_map(|&cb| { + let prof = cand_profiles.get(&cb)?; + let total: usize = prof.iter().map(|&(_, c)| c as usize).sum(); + let mut s = log_fac[total]; + for &(g, c) in prof { + s -= log_fac[c as usize]; + s += c as f64 * amb_logp[g as usize]; + } + Some((cb, total, s)) + }) + .collect(); + + // Monte-Carlo: simulate sim_n ambient barcodes, recording the running + // log-prob at each count; compare each candidate against sim[*][its total]. + let nonzero: Vec = (0..n_features).filter(|&g| amb_p[g] > 0.0).collect(); + let weights: Vec = nonzero.iter().map(|&g| amb_p[g]).collect(); + let dist = WeightedIndex::new(&weights).map_err(|e| { + Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + e.to_string(), + )) + })?; + let mut rng = rand::rngs::StdRng::seed_from_u64(19_760_110); + let mut sim_at: Vec> = vec![Vec::with_capacity(sim_n); max_count + 1]; + let mut curr = vec![0u32; n_features]; + for _ in 0..sim_n { + curr.fill(0); + let mut lp = 0f64; + sim_at[0].push(0.0); + #[allow(clippy::needless_range_loop)] // ic is both index and multinomial term + for ic in 1..=max_count { + let gi = nonzero[dist.sample(&mut rng)]; + curr[gi] += 1; + lp += amb_logp[gi] + (ic as f64).ln() - (curr[gi] as f64).ln(); + sim_at[ic].push(lp); + } + } + + // p-values + Benjamini-Hochberg. 
+ let mut pvals: Vec<(u32, f64)> = obs + .iter() + .map(|&(cb, total, o)| { + let lower = sim_at[total].iter().filter(|&&sp| sp < o).count(); + (cb, (1 + lower) as f64 / (1 + sim_n) as f64) + }) + .collect(); + pvals.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + let n = pvals.len() as f64; + let mut padj = vec![0f64; pvals.len()]; + for (rank, &(_, p)) in pvals.iter().enumerate() { + padj[rank] = (p * n / (rank + 1) as f64).min(1.0); + } + for i in (0..padj.len().saturating_sub(1)).rev() { + padj[i] = padj[i].min(padj[i + 1]); + } + let mut rescued = 0usize; + for (rank, &(cb, _)) in pvals.iter().enumerate() { + if padj[rank] <= fdr { + called.push(cb); + rescued += 1; + } + } + log::info!( + "EmptyDrops_CR: {n_simple} knee cells + {rescued} rescued (of {} candidates, FDR<={fdr})", + cand_cbs.len() + ); + called.sort_unstable(); + Ok(called) +} + +/// Median of an ascending-sorted slice (0 if empty). +fn median_sorted(sorted: &[u64]) -> u64 { + let n = sorted.len(); + if n == 0 { + 0 + } else if n % 2 == 1 { + sorted[n / 2] + } else { + u64::midpoint(sorted[n / 2 - 1], sorted[n / 2]) + } +} + +/// Write the raw gene-count matrix + `Summary.csv` for a finished solo run. +/// No-op (with a warning) when there is no explicit whitelist. +pub fn write_gene_matrix( + ctx: &SoloContext, + params: &crate::params::Parameters, + align_stats: &crate::stats::AlignmentStats, + sj_stats: Option<&crate::junction::SpliceJunctionStats>, + genome: &crate::genome::Genome, +) -> Result<(), Error> { + let CbWhitelist::List { sorted, .. 
} = &ctx.whitelist else { + log::warn!( + "STARsolo: --soloCBwhitelist None matrix output is not yet supported (Phase 14.4); skipping matrix" + ); + return Ok(()); + }; + + let method: UmiDedup = params + .solo_umi_dedup + .first() + .map_or("1MM_All", String::as_str) + .parse() + .unwrap_or(UmiDedup::OneMmAll); + let filtering: UmiFiltering = params + .solo_umi_filtering + .first() + .map_or("-", String::as_str) + .parse() + .unwrap_or(UmiFiltering::None); + // `*_pseudocounts` CB-match types add 1 to the posterior prior. + let pseudocount = if params.solo_cb_match_wl_type.contains("pseudocounts") { + 1.0 + } else { + 0.0 + }; + let umi_len = params.solo_umi_len as usize; + + let solo_dir = params + .solo_out_file_names + .first() + .cloned() + .unwrap_or_else(|| "Solo.out/".to_string()); + let features_name = params + .solo_out_file_names + .get(1) + .cloned() + .unwrap_or_else(|| "features.tsv".to_string()); + let barcodes_name = params + .solo_out_file_names + .get(2) + .cloned() + .unwrap_or_else(|| "barcodes.tsv".to_string()); + let matrix_name = params + .solo_out_file_names + .get(3) + .cloned() + .unwrap_or_else(|| "matrix.mtx".to_string()); + + // Global mapping funnel (shared across features). The region tallies are + // CellRanger-style positional bins over uniquely-mapped reads, populated only + // when both Gene and GeneFull run (otherwise the split is unavailable). 
+ use std::sync::atomic::Ordering; + let total_reads = align_stats.total_reads.load(Ordering::Relaxed); + let mapped_unique = align_stats.uniquely_mapped.load(Ordering::Relaxed); + let mapped_multi = align_stats.multi_mapped.load(Ordering::Relaxed); + let valid_barcodes = ctx.stats.yes_exact.load(Ordering::Relaxed) + + ctx.stats.yes_one_mm.load(Ordering::Relaxed) + + ctx.stats.yes_mult_mm.load(Ordering::Relaxed); + let reads_of = |f: crate::solo::SoloFeature| -> u64 { + ctx.features + .iter() + .position(|&x| x == f) + .map_or(0, |i| ctx.feature_reads[i].load(Ordering::Relaxed)) + }; + let have_funnel = ctx.features.contains(&crate::solo::SoloFeature::Gene) + && ctx.features.contains(&crate::solo::SoloFeature::GeneFull); + let region = have_funnel.then(|| RegionFunnel { + exonic: ctx.region_stats.exonic.load(Ordering::Relaxed), + intronic: ctx.region_stats.intronic.load(Ordering::Relaxed), + intergenic: ctx.region_stats.intergenic.load(Ordering::Relaxed), + antisense: ctx.region_stats.antisense.load(Ordering::Relaxed), + }); + + let gzip = matches!(params.solo_out_gzip.as_str(), "yes" | "Yes" | "true"); + let n_genes = ctx.gene_ann.gene_ids.len(); + let multi_methods = MultiMethod::parse_list(¶ms.solo_multi_mappers); + + // One {prefix}{soloOutFileNames[0]}/{raw,filtered}/ per feature. + for (feature, recorder) in ctx.features.iter().zip(&ctx.recorders) { + let feature_dir = params.output_path(&format!("{solo_dir}{}/", feature.dir_name())); + let raw_dir = feature_dir.join("raw"); + std::fs::create_dir_all(&raw_dir).map_err(|e| Error::io(e, &raw_dir))?; + + // Stream the deduplicated counts into a shared temp body, then finalize + // the raw matrix (and the filtered one below) from it. 
+ let (body, mstats) = build_matrix_body( + ctx, + recorder, + method, + filtering, + umi_len, + pseudocount, + &raw_dir, + n_genes, + )?; + write_features(&raw_dir.join(&features_name), &ctx.gene_ann.gene_ids, gzip)?; + write_barcodes( + &raw_dir.join(&barcodes_name), + &ctx.whitelist, + sorted.len(), + gzip, + )?; + finalize_matrix( + &body, + &raw_dir.join(&matrix_name), + gzip, + n_genes, + sorted.len(), + mstats.nnz, + None, + )?; + log::info!( + "STARsolo: wrote {}/raw matrix ({} genes × {} barcodes, {} entries){}", + feature.dir_name(), + n_genes, + sorted.len(), + mstats.nnz, + if gzip { " [gzip]" } else { "" }, + ); + + // Filtered (cell-called) matrix per --soloCellFilter. EmptyDrops_CR runs + // the Monte-Carlo rescue (needs the per-cell profiles in the body). + let called = if params + .solo_cell_filter + .first() + .is_some_and(|m| m == "EmptyDrops_CR") + { + Some(emptydrops_called( + &mstats.cells, + &body, + n_genes, + ¶ms.solo_cell_filter, + )?) + } else { + called_cells(&mstats.cells, ¶ms.solo_cell_filter) + }; + if let Some(cbs) = called + && !cbs.is_empty() + { + let filt_dir = feature_dir.join("filtered"); + std::fs::create_dir_all(&filt_dir).map_err(|e| Error::io(e, &filt_dir))?; + let remap: HashMap = cbs + .iter() + .enumerate() + .map(|(i, &cb)| (cb, i as u32 + 1)) + .collect(); + write_features(&filt_dir.join(&features_name), &ctx.gene_ann.gene_ids, gzip)?; + write_barcodes_subset(&filt_dir.join(&barcodes_name), &ctx.whitelist, &cbs, gzip)?; + let fnnz = finalize_matrix( + &body, + &filt_dir.join(&matrix_name), + gzip, + n_genes, + cbs.len(), + 0, + Some(&remap), + )?; + log::info!( + "STARsolo: wrote {}/filtered matrix ({} cells, {} entries)", + feature.dir_name(), + cbs.len(), + fnnz, + ); + } + + // --soloMultiMappers: UniqueAndMult-.mtx alongside raw. 
+ if !multi_methods.is_empty() { + let mg = recorder.multi_gene.lock().unwrap(); + build_multi_matrices( + &body, + &mg, + &multi_methods, + &raw_dir, + &matrix_name, + n_genes, + sorted.len(), + gzip, + )?; + log::info!( + "STARsolo: wrote {} UniqueAndMult matrices for {} ({} ambiguous reads)", + multi_methods.len(), + feature.dir_name(), + mg.len(), + ); + } + + write_summary( + &feature_dir.join("Summary.csv"), + feature.dir_name(), + &mstats, + total_reads, + valid_barcodes, + mapped_unique, + mapped_multi, + reads_of(*feature), + region, + )?; + log::info!("STARsolo: wrote {}/Summary.csv", feature.dir_name()); + } + + // SJ (splice-junction) feature: rows are the SJ.out.tab junctions. + if ctx.sj_enabled + && let Some(sjs) = sj_stats + { + let sj_dir = params.output_path(&format!("{solo_dir}SJ/raw/")); + std::fs::create_dir_all(&sj_dir).map_err(|e| Error::io(e, &sj_dir))?; + let order = sjs.sj_feature_order(params); // (intron_start, intron_end), row order + let row: HashMap<(u64, u64), u32> = order + .iter() + .enumerate() + .map(|(i, &k)| (k, i as u32)) + .collect(); + // features.tsv = the SJ.out.tab lines (same sorted order as the rows). + write_file(&sj_dir.join(&features_name), gzip, |w| { + sjs.write_sj_lines(w, genome, params).map(|_| ()) + })?; + write_barcodes( + &sj_dir.join(&barcodes_name), + &ctx.whitelist, + sorted.len(), + gzip, + )?; + let umi_len = params.solo_umi_len as usize; + let nnz = build_sj_matrix( + &ctx.sj_records.lock().unwrap(), + &row, + method, + umi_len, + &sj_dir.join(&matrix_name), + order.len(), + sorted.len(), + gzip, + )?; + log::info!( + "STARsolo: wrote SJ/raw matrix ({} junctions × {} barcodes, {} entries)", + order.len(), + sorted.len(), + nnz, + ); + } + + // Velocyto feature: spliced / unspliced / ambiguous gene×cell matrices. 
+ if ctx.velocyto_enabled { + let velo_dir = params.output_path(&format!("{solo_dir}Velocyto/raw/")); + std::fs::create_dir_all(&velo_dir).map_err(|e| Error::io(e, &velo_dir))?; + write_features(&velo_dir.join(&features_name), &ctx.gene_ann.gene_ids, gzip)?; + write_barcodes( + &velo_dir.join(&barcodes_name), + &ctx.whitelist, + sorted.len(), + gzip, + )?; + let umi_len = params.solo_umi_len as usize; + let nnz = build_velocyto_matrices( + &ctx.velocyto_records.lock().unwrap(), + method, + umi_len, + &velo_dir, + n_genes, + sorted.len(), + gzip, + )?; + log::info!( + "STARsolo: wrote Velocyto/raw matrices (spliced={} unspliced={} ambiguous={} entries)", + nnz[0], + nnz[1], + nnz[2], + ); + } + Ok(()) +} + +/// Build the SJ feature matrix from (cell, UMI, junction) records, mapping each +/// junction's absolute intron coords to its `SJ.out.tab` row and UMI-collapsing +/// per (cell, junction). Junctions not in `row` (filtered out of SJ.out.tab) are +/// dropped. Same MatrixMarket layout as the gene matrix (junctions are rows). +#[allow(clippy::too_many_arguments)] +fn build_sj_matrix( + records: &[crate::solo::SjCountRecord], + row: &HashMap<(u64, u64), u32>, + method: UmiDedup, + umi_len: usize, + matrix_path: &Path, + n_junctions: usize, + n_barcodes: usize, + gzip: bool, +) -> Result { + // Group by cell barcode (ascending column order). + let mut recs: Vec<&crate::solo::SjCountRecord> = records.iter().collect(); + recs.sort_unstable_by_key(|r| r.cb); + + let dir = matrix_path.parent().unwrap_or_else(|| Path::new(".")); + let mut body_tmp = tempfile::Builder::new() + .prefix(".sj_body") + .tempfile_in(dir) + .map_err(|e| Error::io(e, dir))?; + let mut nnz = 0usize; + { + let mut body = std::io::BufWriter::new(body_tmp.as_file_mut()); + let mut i = 0; + while i < recs.len() { + let cb = recs[i].cb; + // junction row → (umi → read count) for this cell. 
+ let mut sj_umis: HashMap> = HashMap::new(); + while i < recs.len() && recs[i].cb == cb { + let r = recs[i]; + if let Some(&rw) = row.get(&(r.intron_start, r.intron_end)) { + *sj_umis.entry(rw).or_default().entry(r.umi).or_insert(0) += 1; + } + i += 1; + } + let mut entries: Vec<(u32, u64)> = sj_umis + .into_iter() + .map(|(rw, umis)| (rw, dedup_count(&umis, method, umi_len))) + .filter(|&(_, c)| c > 0) + .collect(); + entries.sort_unstable_by_key(|&(rw, _)| rw); + for (rw, c) in entries { + writeln!(body, "{} {} {}", rw + 1, cb + 1, c).map_err(|e| Error::io(e, dir))?; + nnz += 1; + } + } + body.flush().map_err(|e| Error::io(e, dir))?; + } + + write_file(matrix_path, gzip, |w| { + writeln!(w, "%%MatrixMarket matrix coordinate integer general") + .map_err(|e| Error::io(e, matrix_path))?; + writeln!(w, "%").map_err(|e| Error::io(e, matrix_path))?; + writeln!(w, "{n_junctions} {n_barcodes} {nnz}").map_err(|e| Error::io(e, matrix_path))?; + let mut r = + std::fs::File::open(body_tmp.path()).map_err(|e| Error::io(e, body_tmp.path()))?; + std::io::copy(&mut r, w).map_err(|e| Error::io(e, matrix_path))?; + Ok(()) + })?; + Ok(nnz) +} + +/// Build the three `Velocyto` matrices (`spliced`/`unspliced`/`ambiguous`) from +/// (cell, UMI, gene, category) records. Per (cell, gene) each UMI is resolved to +/// one category (priority unspliced > spliced > ambiguous — any intron evidence +/// makes the molecule nascent), then UMI-deduplicated per category. Genes are +/// rows, cells columns — same layout as the Gene matrix, written as three files +/// scVelo/dynamo ingest directly. +#[allow(clippy::too_many_arguments)] +fn build_velocyto_matrices( + records: &[crate::solo::VelocytoRecord], + method: UmiDedup, + umi_len: usize, + dir: &Path, + n_genes: usize, + n_barcodes: usize, + gzip: bool, +) -> Result<[usize; 3], Error> { + use crate::solo::VelocytoCategory; + // Category → matrix index (file order) and resolution priority. 
+ let cat_idx = |c: VelocytoCategory| match c { + VelocytoCategory::Spliced => 0usize, + VelocytoCategory::Unspliced => 1, + VelocytoCategory::Ambiguous => 2, + }; + let priority = |c: VelocytoCategory| match c { + VelocytoCategory::Unspliced => 2u8, + VelocytoCategory::Spliced => 1, + VelocytoCategory::Ambiguous => 0, + }; + let names = ["spliced.mtx", "unspliced.mtx", "ambiguous.mtx"]; + + let mut recs: Vec<&crate::solo::VelocytoRecord> = records.iter().collect(); + recs.sort_unstable_by_key(|r| r.cb); + + let mut bodies: Vec = Vec::new(); + for _ in 0..3 { + bodies.push( + tempfile::Builder::new() + .prefix(".velo_body") + .tempfile_in(dir) + .map_err(|e| Error::io(e, dir))?, + ); + } + let mut nnz = [0usize; 3]; + { + let mut writers: Vec> = bodies + .iter_mut() + .map(|t| std::io::BufWriter::new(t.as_file_mut())) + .collect(); + let mut i = 0; + while i < recs.len() { + let cb = recs[i].cb; + // gene → umi → (resolved category, read count) + let mut gene_umi: HashMap> = HashMap::new(); + while i < recs.len() && recs[i].cb == cb { + let r = recs[i]; + let e = gene_umi + .entry(r.gene) + .or_default() + .entry(r.umi) + .or_insert((r.category, 0)); + e.1 += 1; + if priority(r.category) > priority(e.0) { + e.0 = r.category; + } + i += 1; + } + // Per gene, dedup UMIs within each resolved category, emit entries. 
+ let mut genes: Vec<&u32> = gene_umi.keys().collect(); + genes.sort_unstable(); + for &g in &genes { + let umis = &gene_umi[g]; + let mut by_cat: [HashMap; 3] = + [HashMap::new(), HashMap::new(), HashMap::new()]; + for (&umi, &(cat, rc)) in umis { + by_cat[cat_idx(cat)].insert(umi, rc); + } + for (k, w) in writers.iter_mut().enumerate() { + let c = dedup_count(&by_cat[k], method, umi_len); + if c > 0 { + writeln!(w, "{} {} {}", g + 1, cb + 1, c).map_err(|e| Error::io(e, dir))?; + nnz[k] += 1; + } + } + } + } + for w in &mut writers { + w.flush().map_err(|e| Error::io(e, dir))?; + } + } + + for (k, body) in bodies.iter().enumerate() { + let path = dir.join(names[k]); + write_file(&path, gzip, |w| { + writeln!(w, "%%MatrixMarket matrix coordinate integer general") + .map_err(|e| Error::io(e, &path))?; + writeln!(w, "%").map_err(|e| Error::io(e, &path))?; + writeln!(w, "{n_genes} {n_barcodes} {}", nnz[k]).map_err(|e| Error::io(e, &path))?; + let mut r = std::fs::File::open(body.path()).map_err(|e| Error::io(e, body.path()))?; + std::io::copy(&mut r, w).map_err(|e| Error::io(e, &path))?; + Ok(()) + })?; + } + Ok(nnz) +} + +/// CellRanger-style positional mapping bins over uniquely-mapped reads. +#[derive(Clone, Copy)] +struct RegionFunnel { + exonic: u64, + intronic: u64, + intergenic: u64, + antisense: u64, +} + +/// Write a CellRanger/STARsolo-style `Summary.csv` for one feature: the +/// sequencing/mapping funnel (genome → exonic → intronic → intergenic, antisense) +/// plus per-cell UMI/gene statistics over the CR2.2-knee-called cells. 
+#[allow(clippy::too_many_arguments)] +fn write_summary( + path: &Path, + feature_name: &str, + mstats: &MatrixStats, + total_reads: u64, + valid_barcodes: u64, + mapped_unique: u64, + mapped_multi: u64, + feature_mapped: u64, + region: Option, +) -> Result<(), Error> { + let frac = |num: u64| -> f64 { + if total_reads == 0 { + 0.0 + } else { + num as f64 / total_reads as f64 + } + }; + + // Cell calling: CR2.2 knee on per-barcode UMI totals. + let mut umis_desc: Vec = mstats.cells.iter().map(|c| c.n_umis).collect(); + umis_desc.sort_unstable_by(|a, b| b.cmp(a)); + let thr = knee_cr22(&umis_desc, 3000, 0.99, 10.0); + let cells: Vec<&CellStat> = mstats.cells.iter().filter(|c| c.n_umis >= thr).collect(); + let n_cells = cells.len(); + + // Totals across all barcodes (for sequencing saturation + fraction-in-cells). + let total_reads_counted: u64 = mstats.cells.iter().map(|c| c.n_reads).sum(); + let total_umis_all: u64 = mstats.cells.iter().map(|c| c.n_umis).sum(); + let saturation = if total_reads_counted > 0 { + 1.0 - total_umis_all as f64 / total_reads_counted as f64 + } else { + 0.0 + }; + + // Per-cell aggregates over called cells. 
+ let reads_in_cells: u64 = cells.iter().map(|c| c.n_reads).sum(); + let umis_in_cells: u64 = cells.iter().map(|c| c.n_umis).sum(); + let mut reads_sorted: Vec = cells.iter().map(|c| c.n_reads).collect(); + let mut umis_sorted: Vec = cells.iter().map(|c| c.n_umis).collect(); + let mut genes_sorted: Vec = cells.iter().map(|c| c.n_genes as u64).collect(); + reads_sorted.sort_unstable(); + umis_sorted.sort_unstable(); + genes_sorted.sort_unstable(); + let mean = |sum: u64| -> u64 { + if n_cells == 0 { + 0 + } else { + sum / n_cells as u64 + } + }; + + use std::fmt::Write as _; + let mut out = String::new(); + let mut row = |k: &str, v: String| { + let _ = writeln!(out, "{k},{v}"); + }; + row("Number of Reads", total_reads.to_string()); + row( + "Reads With Valid Barcodes", + format!("{:.6}", frac(valid_barcodes)), + ); + row("Sequencing Saturation", format!("{saturation:.6}")); + row( + "Reads Mapped to Genome: Unique+Multiple", + format!("{:.6}", frac(mapped_unique + mapped_multi)), + ); + row( + "Reads Mapped to Genome: Unique", + format!("{:.6}", frac(mapped_unique)), + ); + row( + &format!("Reads Mapped to {feature_name}: Unique {feature_name}"), + format!("{:.6}", frac(feature_mapped)), + ); + // CellRanger-style positional funnel over uniquely-mapped reads (each region + // counted by where the read falls, independent of strand; antisense is a + // separate orientation metric). Available only with Gene + GeneFull. 
+ if let Some(r) = region { + row( + "Reads Mapped Confidently to Exonic Regions", + format!("{:.6}", frac(r.exonic)), + ); + row( + "Reads Mapped Confidently to Intronic Regions", + format!("{:.6}", frac(r.intronic)), + ); + row( + "Reads Mapped Confidently to Intergenic Regions", + format!("{:.6}", frac(r.intergenic)), + ); + row( + "Reads Mapped Antisense to Gene", + format!("{:.6}", frac(r.antisense)), + ); + } + row("Estimated Number of Cells", n_cells.to_string()); + row( + &format!("Unique Reads in Cells Mapped to {feature_name}"), + reads_in_cells.to_string(), + ); + row( + "Fraction of Unique Reads in Cells", + format!( + "{:.6}", + if total_reads_counted > 0 { + reads_in_cells as f64 / total_reads_counted as f64 + } else { + 0.0 + } + ), + ); + row("Mean Reads per Cell", mean(reads_in_cells).to_string()); + row( + "Median Reads per Cell", + median_sorted(&reads_sorted).to_string(), + ); + row("UMIs in Cells", umis_in_cells.to_string()); + row("Mean UMI per Cell", mean(umis_in_cells).to_string()); + row( + "Median UMI per Cell", + median_sorted(&umis_sorted).to_string(), + ); + row( + &format!("Mean {feature_name} per Cell"), + mean(genes_sorted.iter().sum()).to_string(), + ); + row( + &format!("Median {feature_name} per Cell"), + median_sorted(&genes_sorted).to_string(), + ); + row( + &format!("Total {feature_name} Detected"), + mstats.genes_detected.to_string(), + ); + + std::fs::write(path, out).map_err(|e| Error::io(e, path))?; + Ok(()) +} + +/// `features.tsv`: `gene_id gene_name "Gene Expression"` (CellRanger +/// v3 layout). We have no gene names, so the id is repeated. +fn write_features(path: &Path, gene_ids: &[String], gzip: bool) -> Result<(), Error> { + write_file(path, gzip, |w| { + for id in gene_ids { + writeln!(w, "{id}\t{id}\tGene Expression").map_err(|e| Error::io(e, path))?; + } + Ok(()) + })?; + Ok(()) +} + +/// Unpack `cb` into `line` (with trailing newline) and write it. 
+fn write_one_barcode( + w: &mut dyn std::io::Write, + whitelist: &CbWhitelist, + cb: u32, + line: &mut Vec, + path: &Path, +) -> Result<(), Error> { + line.clear(); + whitelist.unpack_barcode_into(cb, line); + line.push(b'\n'); + w.write_all(line).map_err(|e| Error::io(e, path)) +} + +/// `barcodes.tsv`: full whitelist in sorted order (matches the raw matrix +/// columns). Lists millions of lines, so the writer is buffered and the barcode +/// is unpacked into a reused scratch buffer (no per-line allocation). +fn write_barcodes(path: &Path, whitelist: &CbWhitelist, n: usize, gzip: bool) -> Result<(), Error> { + let len = whitelist.barcode_len(); + write_file(path, gzip, |w| { + let mut line: Vec = Vec::with_capacity(len + 1); + for i in 0..n { + write_one_barcode(w, whitelist, i as u32, &mut line, path)?; + } + Ok(()) + })?; + Ok(()) +} + +/// `barcodes.tsv` for the filtered matrix: only the called-cell barcodes, in the +/// same (cb-ascending) order as the filtered matrix columns. +fn write_barcodes_subset( + path: &Path, + whitelist: &CbWhitelist, + cbs: &[u32], + gzip: bool, +) -> Result<(), Error> { + let len = whitelist.barcode_len(); + write_file(path, gzip, |w| { + let mut line: Vec = Vec::with_capacity(len + 1); + for &cb in cbs { + write_one_barcode(w, whitelist, cb, &mut line, path)?; + } + Ok(()) + })?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::io::fastq::encode_base; + use crate::solo::whitelist::pack_barcode; + + #[test] + fn median_sorted_odd_even_empty() { + assert_eq!(median_sorted(&[]), 0); + assert_eq!(median_sorted(&[5]), 5); + assert_eq!(median_sorted(&[1, 2, 3]), 2); + assert_eq!(median_sorted(&[10, 20, 30, 40]), 25); // midpoint(20,30) + } + + #[test] + fn distribute_multi_methods() { + // Unique counts: gene 0 has 4, gene 1 has none. One ambiguous molecule + // maps to {0,1}. + let u: HashMap = [(0u32, 4.0)].into_iter().collect(); + let mols = vec![vec![0u32, 1u32]]; + + // Uniform: +0.5 to each gene in the set. 
+ let uni = distribute_multi(MultiMethod::Uniform, &u, &mols); + assert!((uni[&0] - 4.5).abs() < 1e-9); + assert!((uni[&1] - 0.5).abs() < 1e-9); + + // PropUnique: all weight to gene 0 (gene 1 has 0 unique) → 5 / 0. + let pu = distribute_multi(MultiMethod::PropUnique, &u, &mols); + assert!((pu[&0] - 5.0).abs() < 1e-9); + assert!(pu.get(&1).copied().unwrap_or(0.0).abs() < 1e-9); + + // EM converges to all weight on gene 0 as well. + let em = distribute_multi(MultiMethod::Em, &u, &mols); + assert!((em[&0] - 5.0).abs() < 1e-6); + assert!(em.get(&1).copied().unwrap_or(0.0).abs() < 1e-6); + + // With no unique evidence, PropUnique falls back to uniform. + let empty: HashMap = HashMap::new(); + let pu0 = distribute_multi(MultiMethod::PropUnique, &empty, &mols); + assert!((pu0[&0] - 0.5).abs() < 1e-9 && (pu0[&1] - 0.5).abs() < 1e-9); + } + + #[test] + fn called_cells_methods() { + let mk = |cb, u| CellStat { + cb, + n_reads: u, + n_umis: u, + n_genes: 1, + }; + let cells = vec![mk(5, 1000), mk(2, 900), mk(8, 50), mk(1, 40)]; + let s = |v: &[&str]| v.iter().map(ToString::to_string).collect::>(); + + // TopCells 2: the two highest-UMI cells (cb 5, 2), returned cb-ascending. + assert_eq!( + called_cells(&cells, &s(&["TopCells", "2"])).unwrap(), + vec![2, 5] + ); + // None: no filtered output. + assert!(called_cells(&cells, &s(&["None"])).is_none()); + // CellRanger2.2: called cbs are sorted ascending. + let cr = called_cells(&cells, &s(&["CellRanger2.2", "3000", "0.99", "10"])).unwrap(); + assert!(cr.windows(2).all(|w| w[0] < w[1])); + // EmptyDrops_CR falls back to the same knee here. + assert_eq!( + called_cells(&cells, &s(&["EmptyDrops_CR", "3000", "0.99", "10"])), + Some(cr) + ); + } + + #[test] + fn knee_cr22_threshold() { + // 100 cells at 1000 UMI, then a long ambient tail at 10. 
+ let mut umis: Vec = vec![1000; 100]; + umis.extend(std::iter::repeat_n(10u64, 5000)); + umis.sort_unstable_by(|a, b| b.cmp(a)); + // robust max = umis[round(3000*0.01)] = umis[30] = 1000; thr = 1000/10 = 100. + let thr = knee_cr22(&umis, 3000, 0.99, 10.0); + assert_eq!(thr, 100); + let cells = umis.iter().filter(|&&u| u >= thr).count(); + assert_eq!(cells, 100); // the 100 real cells, none of the ambient tail + } + + fn umi(s: &str) -> u64 { + match pack_barcode(&s.bytes().map(encode_base).collect::>()) { + crate::solo::whitelist::PackResult::NoN(p) => p, + _ => panic!("N in test UMI"), + } + } + + fn counts(pairs: &[(&str, u32)]) -> HashMap { + pairs.iter().map(|&(s, c)| (umi(s), c)).collect() + } + + #[test] + fn dedup_method_parsing() { + assert_eq!("1MM_All".parse::().unwrap(), UmiDedup::OneMmAll); + assert_eq!("Exact".parse::().unwrap(), UmiDedup::Exact); + assert_eq!("NoDedup".parse::().unwrap(), UmiDedup::NoDedup); + assert!("bogus".parse::().is_err()); + } + + #[test] + fn exact_counts_distinct_umis() { + let c = counts(&[("AAAA", 3), ("AAAC", 1), ("TTTT", 5)]); + assert_eq!(dedup_count(&c, UmiDedup::Exact, 4), 3); + } + + #[test] + fn nodedup_sums_reads() { + let c = counts(&[("AAAA", 3), ("AAAC", 1), ("TTTT", 5)]); + assert_eq!(dedup_count(&c, UmiDedup::NoDedup, 4), 9); + } + + #[test] + fn one_mm_all_merges_neighbors() { + // AAAA–AAAC are Hamming-1 (one component); TTTT separate → 2 molecules. + let c = counts(&[("AAAA", 3), ("AAAC", 1), ("TTTT", 5)]); + assert_eq!(dedup_count(&c, UmiDedup::OneMmAll, 4), 2); + } + + #[test] + fn one_mm_all_transitive_chain() { + // AAAA–AAAC–AACC chain: all one component even though AAAA/AACC are 2 apart. + let c = counts(&[("AAAA", 1), ("AAAC", 1), ("AACC", 1)]); + assert_eq!(dedup_count(&c, UmiDedup::OneMmAll, 4), 1); + } + + #[test] + fn directional_absorbs_low_count_neighbor() { + // hub AAAA count 5 absorbs AAAC count 1 (5 >= 2*1+0); TTTT survives. 
+ let c = counts(&[("AAAA", 5), ("AAAC", 1), ("TTTT", 5)]); + assert_eq!(dedup_count(&c, UmiDedup::OneMmDirectional, 4), 2); + // Equal counts are NOT absorbed (5 >= 2*5 is false). + let c2 = counts(&[("AAAA", 5), ("AAAC", 5)]); + assert_eq!(dedup_count(&c2, UmiDedup::OneMmDirectional, 4), 2); + } + + #[test] + fn directional_umitools_threshold() { + // count_hub >= 2*leaf - 1: hub 3 absorbs leaf 2 (3 >= 3). Directional(0) + // would not (3 >= 4 false). + let c = counts(&[("AAAA", 3), ("AAAC", 2)]); + assert_eq!(dedup_count(&c, UmiDedup::OneMmDirectionalUmiTools, 4), 1); + assert_eq!(dedup_count(&c, UmiDedup::OneMmDirectional, 4), 2); + } + + #[test] + fn cellranger_1mm_collapses_neighbor() { + // AAAA (5) and AAAC (1) are 1MM → low-count corrected to high-count → + // 1 molecule. TTTT separate → 2 total. + let c = counts(&[("AAAA", 5), ("AAAC", 1), ("TTTT", 5)]); + assert_eq!(dedup_count(&c, UmiDedup::OneMmCr, 4), 2); + assert_eq!("1MM_CR".parse::().unwrap(), UmiDedup::OneMmCr); + } + + #[test] + fn cellranger_1mm_non_transitive() { + // Chain AAAA(1)–AAAC(2)–AACC(4): each corrects to its highest-count 1MM + // neighbor. AAAA→AAAC (only neighbor), AAAC→AACC, AACC→self. Corrected + // set {AAAC, AACC, AACC} → 2 molecules (NOT 1 like the transitive All). + let c = counts(&[("AAAA", 1), ("AAAC", 2), ("AACC", 4)]); + assert_eq!(dedup_count(&c, UmiDedup::OneMmCr, 4), 2); + assert_eq!(dedup_count(&c, UmiDedup::OneMmAll, 4), 1); + } + + #[test] + fn umi_filtering_parsing() { + assert_eq!("-".parse::().unwrap(), UmiFiltering::None); + assert_eq!( + "MultiGeneUMI_CR".parse::().unwrap(), + UmiFiltering::MultiGeneUmiCr + ); + assert!("bogus".parse::().is_err()); + } + + #[test] + fn multi_gene_umi_cr_keeps_top_gene() { + // UMI maps to gene 0 (3 reads) and gene 1 (1 read). CR keeps only gene 0. 
+ let mut genes = HashMap::new(); + genes.insert(0u32, 3u32); + genes.insert(1u32, 1u32); + let kept = filter_multi_gene_umi(&genes, UmiFiltering::MultiGeneUmiCr); + assert_eq!(kept.len(), 1); + assert_eq!(*kept[0].0, 0); + // Plain MultiGeneUMI with all-singletons drops the UMI entirely. + let mut single = HashMap::new(); + single.insert(0u32, 1u32); + single.insert(1u32, 1u32); + assert_eq!( + filter_multi_gene_umi(&single, UmiFiltering::MultiGeneUmi).len(), + 0 + ); + } + + #[test] + fn resolve_multi_prefers_higher_prior() { + use crate::solo::whitelist::CbCandidate; + let cands = vec![ + CbCandidate { + wl_index: 0, + mismatch_pos: 1, + mismatch_qual: b'I', + }, + CbCandidate { + wl_index: 1, + mismatch_pos: 2, + mismatch_qual: b'I', + }, + ]; + // Same quality → higher exact-count prior wins. + assert_eq!(resolve_multi_cb(&cands, &[10, 3], 0.0), Some(0)); + assert_eq!(resolve_multi_cb(&cands, &[3, 10], 0.0), Some(1)); + // No prior signal and no pseudocount → rejected. + assert_eq!(resolve_multi_cb(&cands, &[0, 0], 0.0), None); + // Pseudocount gives every candidate positive weight → argmax accepted. + assert!(resolve_multi_cb(&cands, &[0, 0], 1.0).is_some()); + } +} diff --git a/src/solo/gene.rs b/src/solo/gene.rs new file mode 100644 index 0000000..c4d45b2 --- /dev/null +++ b/src/solo/gene.rs @@ -0,0 +1,529 @@ +//! Per-read gene assignment for the STARsolo `Gene` feature (Phase 14.3). +//! +//! A read is assigned to a gene by intersecting the gene model with the read's +//! alignment(s). Following STARsolo's `Gene` feature under the default +//! `--soloMultiMappers Unique`, the read's gene set is the UNION of genes +//! concordant with any of its alignments (strand-filtered by `--soloStrand`): +//! exactly one gene → assigned; zero → no feature; more than one → ambiguous. +//! A multi-locus read whose loci all fall in the same gene is therefore still +//! gene-unique, unlike `--quantMode GeneCounts` which drops all multimappers. 
+ +use crate::align::transcript::Transcript; +use crate::quant::GeneAnnotation; +use std::cell::RefCell; +use std::str::FromStr; + +/// `--soloStrand`: orientation of the cDNA read relative to its gene. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum SoloStrand { + /// Read maps to the sense (same) strand as the gene (10x 3'/5', default). + #[default] + Forward, + /// Read maps to the antisense (opposite) strand. + Reverse, + /// Strand is ignored. + Unstranded, +} + +impl FromStr for SoloStrand { + type Err = String; + fn from_str(s: &str) -> Result { + match s { + "Forward" => Ok(Self::Forward), + "Reverse" => Ok(Self::Reverse), + "Unstranded" => Ok(Self::Unstranded), + _ => Err(format!( + "unknown soloStrand '{s}'; expected Forward, Reverse, or Unstranded" + )), + } + } +} + +/// A STARsolo `--soloFeatures` value that quantifies genes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SoloFeature { + /// Exonic counting: a read counts toward a gene only if it overlaps an exon. + Gene, + /// Full gene-body counting (CellRanger `include-introns`): a read counts if + /// it overlaps the gene locus, including purely intronic reads. + GeneFull, +} + +impl SoloFeature { + /// Output sub-directory name (`Solo.out//raw/`). + pub fn dir_name(self) -> &'static str { + match self { + SoloFeature::Gene => "Gene", + SoloFeature::GeneFull => "GeneFull", + } + } +} + +impl FromStr for SoloFeature { + type Err = String; + fn from_str(s: &str) -> Result { + match s { + "Gene" => Ok(Self::Gene), + "GeneFull" => Ok(Self::GeneFull), + _ => Err(format!( + "unsupported soloFeature '{s}'; supported: Gene, GeneFull" + )), + } + } +} + +/// Outcome of assigning a read to a gene. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum GeneAssignment { + /// Concordant with exactly one gene (the assigned gene index). + Gene(u32), + /// Mapped but overlaps no gene on the selected strand. + NoFeature, + /// Overlaps more than one gene → not uniquely assignable. 
+ Ambiguous, + /// Read did not map (no transcripts / too many loci). + Unmapped, +} + +/// Whether gene `g` is kept for read alignment `tr` under `strand`. +#[inline] +fn strand_keeps(strand: SoloStrand, gene_is_reverse: bool, read_is_reverse: bool) -> bool { + match strand { + SoloStrand::Unstranded => true, + SoloStrand::Forward => gene_is_reverse == read_is_reverse, + SoloStrand::Reverse => gene_is_reverse != read_is_reverse, + } +} + +/// RNA-velocity read category (Sullivan et al. 2025 mature/nascent/ambiguous, +/// reported as scVelo's spliced/unspliced/ambiguous). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VelocytoCategory { + /// Spans an exon–exon junction → processed (mature) mRNA. + Spliced, + /// No junction, but a block extends into an intron → nascent mRNA. + Unspliced, + /// No junction, all blocks wholly within exons → origin indistinguishable. + Ambiguous, +} + +/// Classify a uniquely-mapped read (assigned to gene `g` by gene-body overlap) +/// into its velocity category from the alignment: a splice in the CIGAR means +/// the read is mature; otherwise an aligned block that leaves the exons (into an +/// intron) means nascent; a wholly-exonic block is ambiguous. +pub fn velocyto_category( + transcripts: &[Transcript], + gene_ann: &GeneAnnotation, + g: u32, +) -> VelocytoCategory { + if transcripts.iter().any(|t| t.n_junction > 0) { + return VelocytoCategory::Spliced; + } + let g = g as usize; + let all_exonic = transcripts.iter().all(|t| { + t.exons + .iter() + .all(|e| gene_ann.block_is_exonic(g, e.genome_start, e.genome_end)) + }); + if all_exonic { + VelocytoCategory::Ambiguous + } else { + VelocytoCategory::Unspliced + } +} + +/// CellRanger-style positional region of a uniquely-mapped read (independent of +/// strand): which genomic region the read falls in. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Region { + /// Overlaps ≥1 annotated exon. + Exonic, + /// Overlaps a gene body but no exon (purely intronic). 
+ Intronic, + /// Overlaps no gene body. + Intergenic, +} + +/// Everything one read's alignment set tells us, computed in a single pass over +/// the gene model (the two overlap queries are shared between the per-feature +/// gene assignment and the region classification, so this costs no more than the +/// old two `assign_gene_se` calls). +#[derive(Debug, Clone)] +pub struct ReadClass { + /// Sense-strand exonic gene assignment (the `Gene` feature). `Unmapped` if + /// exon overlap was not requested. + pub gene: GeneAssignment, + /// Sense-strand gene-body assignment (the `GeneFull` feature). `Unmapped` if + /// body overlap was not requested. + pub gene_full: GeneAssignment, + /// Positional region (only when both exon + body overlap were computed). + pub region: Option, + /// Read maps to a gene body on the antisense strand and to none on the sense + /// strand (CellRanger's "Reads Mapped Antisense to Gene"). + pub antisense: bool, + /// Multi-gene set for the `Gene` feature (the sense exon genes), populated + /// only when `want_multi` and the read is gene-ambiguous (>1 gene). Used by + /// `--soloMultiMappers` to distribute the read; empty otherwise. + pub gene_multi: Vec, + /// Multi-gene set for the `GeneFull` feature (sense body genes). + pub gene_full_multi: Vec, +} + +fn assignment_of(sense_genes: &[usize]) -> GeneAssignment { + match sense_genes.len() { + 0 => GeneAssignment::NoFeature, + 1 => GeneAssignment::Gene(sense_genes[0] as u32), + _ => GeneAssignment::Ambiguous, + } +} + +/// Classify a read in one pass: sense-strand `Gene`/`GeneFull` assignments plus +/// the CellRanger-style positional region + antisense flag. `want_exon` / +/// `want_body` skip the corresponding overlap query when a feature is not needed. 
+pub fn classify_read( + transcripts: &[Transcript], + gene_ann: &GeneAnnotation, + strand: SoloStrand, + want_exon: bool, + want_body: bool, + want_multi: bool, +) -> ReadClass { + if transcripts.is_empty() { + return ReadClass { + gene: GeneAssignment::Unmapped, + gene_full: GeneAssignment::Unmapped, + region: None, + antisense: false, + gene_multi: Vec::new(), + gene_full_multi: Vec::new(), + }; + } + + thread_local! { + static RAW: RefCell> = const { RefCell::new(Vec::new()) }; + static EXON_S: RefCell> = const { RefCell::new(Vec::new()) }; + static BODY_S: RefCell> = const { RefCell::new(Vec::new()) }; + } + + RAW.with(|rb| { + EXON_S.with(|eb| { + BODY_S.with(|bb| { + let mut raw = rb.borrow_mut(); + let mut exon_s = eb.borrow_mut(); + let mut body_s = bb.borrow_mut(); + exon_s.clear(); + body_s.clear(); + // `*_any` track positional (either-strand) overlap for the region; + // `body_anti_any` tracks an antisense-only body hit. + let (mut exon_any, mut body_any, mut body_anti_any) = (false, false, false); + + for tr in transcripts { + if want_exon { + gene_ann.overlapping_genes_into(tr, &mut raw); + for &g in raw.iter() { + exon_any = true; + if strand_keeps(strand, gene_ann.gene_is_reverse[g], tr.is_reverse) { + exon_s.push(g); + } + } + } + if want_body { + gene_ann.overlapping_genes_full_into(tr, &mut raw); + for &g in raw.iter() { + body_any = true; + if strand_keeps(strand, gene_ann.gene_is_reverse[g], tr.is_reverse) { + body_s.push(g); + } else { + body_anti_any = true; + } + } + } + } + exon_s.sort_unstable(); + exon_s.dedup(); + body_s.sort_unstable(); + body_s.dedup(); + + let region = if want_exon && want_body { + Some(if exon_any { + Region::Exonic + } else if body_any { + Region::Intronic + } else { + Region::Intergenic + }) + } else { + None + }; + + // Capture the multi-gene sets only when requested and ambiguous, + // for --soloMultiMappers distribution. 
+ let gene_multi = if want_multi && want_exon && exon_s.len() > 1 { + exon_s.iter().map(|&g| g as u32).collect() + } else { + Vec::new() + }; + let gene_full_multi = if want_multi && want_body && body_s.len() > 1 { + body_s.iter().map(|&g| g as u32).collect() + } else { + Vec::new() + }; + + ReadClass { + gene: if want_exon { + assignment_of(&exon_s) + } else { + GeneAssignment::Unmapped + }, + gene_full: if want_body { + assignment_of(&body_s) + } else { + GeneAssignment::Unmapped + }, + region, + antisense: body_anti_any && body_s.is_empty(), + gene_multi, + gene_full_multi, + } + }) + }) + }) +} + +/// Assign a single-end (cDNA) read to a gene from its alignment set, using the +/// `Gene` (exonic) or `GeneFull` (gene-body, intron-inclusive) overlap basis. +/// Thin wrapper over [`classify_read`] for the single-feature case (and tests). +pub fn assign_gene_se( + transcripts: &[Transcript], + gene_ann: &GeneAnnotation, + strand: SoloStrand, + feature: SoloFeature, +) -> GeneAssignment { + let want_exon = feature == SoloFeature::Gene; + let class = classify_read(transcripts, gene_ann, strand, want_exon, !want_exon, false); + match feature { + SoloFeature::Gene => class.gene, + SoloFeature::GeneFull => class.gene_full, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::align::transcript::{Exon, Transcript}; + use crate::genome::Genome; + use crate::junction::gtf::GtfRecord; + use std::collections::HashMap; + + fn genome() -> Genome { + Genome { + sequence: vec![0u8; 2000].into(), + n_genome: 2000, + n_genome_real: 2000, + n_chr_real: 1, + chr_start: vec![0, 1000], + chr_length: vec![1000], + chr_name: vec!["chr1".to_string()], + } + } + + fn gtf_exon(start: u64, end: u64, strand: char, gene: &str) -> GtfRecord { + let mut attrs = HashMap::new(); + attrs.insert("gene_id".to_string(), gene.to_string()); + attrs.insert("transcript_id".to_string(), format!("{gene}_t1")); + GtfRecord { + seqname: "chr1".to_string(), + feature: "exon".to_string(), + start, + 
end, + strand, + attributes: attrs, + } + } + + /// G1 (+) at 100-200, G2 (-) at 300-400. + fn annotation() -> GeneAnnotation { + let exons = vec![gtf_exon(100, 200, '+', "G1"), gtf_exon(300, 400, '-', "G2")]; + GeneAnnotation::from_gtf_exons(&exons, &genome()) + } + + fn read_at(start: u64, end: u64, is_reverse: bool) -> Transcript { + Transcript { + chr_idx: 0, + genome_start: start, + genome_end: end, + is_reverse, + exons: vec![Exon { + genome_start: start, + genome_end: end, + read_start: 0, + read_end: (end - start) as usize, + i_frag: 0, + }], + cigar: Vec::new(), + score: 0, + n_mismatch: 0, + n_gap: 0, + n_junction: 0, + junction_motifs: Vec::new(), + junction_annotated: Vec::new(), + read_seq: Vec::new(), + } + } + + #[test] + fn unmapped_when_no_transcripts() { + let ann = annotation(); + assert_eq!( + assign_gene_se(&[], &ann, SoloStrand::Forward, SoloFeature::Gene), + GeneAssignment::Unmapped + ); + } + + #[test] + fn forward_sense_assigns_g1() { + let ann = annotation(); + // Read on + strand overlapping G1 (a + gene). + let tr = read_at(120, 180, false); + match assign_gene_se(&[tr], &ann, SoloStrand::Forward, SoloFeature::Gene) { + GeneAssignment::Gene(g) => assert_eq!(ann.gene_ids[g as usize], "G1"), + other => panic!("expected G1, got {other:?}"), + } + } + + #[test] + fn forward_antisense_is_no_feature() { + let ann = annotation(); + // Read on - strand overlapping G1 (+): wrong strand under Forward. + let tr = read_at(120, 180, true); + assert_eq!( + assign_gene_se(&[tr], &ann, SoloStrand::Forward, SoloFeature::Gene), + GeneAssignment::NoFeature + ); + } + + #[test] + fn reverse_strand_picks_antisense() { + let ann = annotation(); + // Read on - strand overlapping G1 (+): kept under Reverse. 
+ let tr = read_at(120, 180, true); + match assign_gene_se(&[tr], &ann, SoloStrand::Reverse, SoloFeature::Gene) { + GeneAssignment::Gene(g) => assert_eq!(ann.gene_ids[g as usize], "G1"), + other => panic!("expected G1 under Reverse, got {other:?}"), + } + } + + #[test] + fn no_overlap_is_no_feature() { + let ann = annotation(); + let tr = read_at(500, 600, false); + assert_eq!( + assign_gene_se(&[tr], &ann, SoloStrand::Unstranded, SoloFeature::Gene), + GeneAssignment::NoFeature + ); + } + + #[test] + fn multilocus_same_gene_is_unique() { + let ann = annotation(); + // Two loci both inside G1 → still gene-unique. + let a = read_at(110, 150, false); + let b = read_at(150, 190, false); + match assign_gene_se(&[a, b], &ann, SoloStrand::Forward, SoloFeature::Gene) { + GeneAssignment::Gene(g) => assert_eq!(ann.gene_ids[g as usize], "G1"), + other => panic!("expected G1, got {other:?}"), + } + } + + #[test] + fn two_genes_unstranded_is_ambiguous() { + let ann = annotation(); + // One locus in G1 (+), one in G2 (-); unstranded sees both. + let a = read_at(120, 180, false); + let b = read_at(320, 380, true); + assert_eq!( + assign_gene_se(&[a, b], &ann, SoloStrand::Unstranded, SoloFeature::Gene), + GeneAssignment::Ambiguous + ); + } + + #[test] + fn genefull_counts_intronic_read() { + // Two-exon gene G3 (+): exons [500,600) and [800,900) → gene body + // [500,900) with an intron at [600,800). + let g = genome(); + let exons = vec![gtf_exon(501, 600, '+', "G3"), gtf_exon(801, 900, '+', "G3")]; + let ann = GeneAnnotation::from_gtf_exons(&exons, &g); + // A read entirely inside the intron overlaps no exon... + assert_eq!( + assign_gene_se( + &[read_at(650, 700, false)], + &ann, + SoloStrand::Forward, + SoloFeature::Gene + ), + GeneAssignment::NoFeature + ); + // ...but does overlap the gene body, so GeneFull counts it. 
+ match assign_gene_se( + &[read_at(650, 700, false)], + &ann, + SoloStrand::Forward, + SoloFeature::GeneFull, + ) { + GeneAssignment::Gene(gi) => assert_eq!(ann.gene_ids[gi as usize], "G3"), + other => panic!("expected G3 under GeneFull, got {other:?}"), + } + } + + #[test] + fn classify_read_regions_and_antisense() { + // Ga (+): exons [100,200) and [400,500) → body [100,500), intron [200,400). + let g = genome(); + let exons = vec![gtf_exon(101, 200, '+', "Ga"), gtf_exon(401, 500, '+', "Ga")]; + let ann = GeneAnnotation::from_gtf_exons(&exons, &g); + let cls = |start, end, rev| { + classify_read( + &[read_at(start, end, rev)], + &ann, + SoloStrand::Forward, + true, + true, + false, + ) + }; + + // In an exon, sense strand → Exonic, not antisense. + let c = cls(120, 180, false); + assert_eq!(c.region, Some(Region::Exonic)); + assert!(!c.antisense); + assert!(matches!(c.gene, GeneAssignment::Gene(_))); + + // Entirely within the intron → Intronic (body but no exon). + assert_eq!(cls(250, 350, false).region, Some(Region::Intronic)); + + // Outside the gene → Intergenic. + assert_eq!(cls(700, 800, false).region, Some(Region::Intergenic)); + + // Exonic position but read on the opposite strand of a (+) gene: + // positionally Exonic, flagged antisense, no sense gene assignment. + let c = cls(120, 180, true); + assert_eq!(c.region, Some(Region::Exonic)); + assert!(c.antisense); + assert_eq!(c.gene, GeneAssignment::NoFeature); + + // No region computed when only one side requested. + assert_eq!( + classify_read( + &[read_at(120, 180, false)], + &ann, + SoloStrand::Forward, + true, + false, + false + ) + .region, + None + ); + } +} diff --git a/src/solo/mod.rs b/src/solo/mod.rs new file mode 100644 index 0000000..91fee76 --- /dev/null +++ b/src/solo/mod.rs @@ -0,0 +1,957 @@ +//! STARsolo single-cell support (Phase 14). +//! +//! Phase 14.1 covers barcode-read input plumbing: parsing the cell barcode (CB) +//! 
and unique molecular identifier (UMI) out of the barcode read for +//! `--soloType CB_UMI_Simple` (droplet 10x-style geometry). Whitelist +//! correction (14.2), gene assignment (14.3), UMI deduplication and matrix +//! output (14.4+) build on the structures defined here. +//! +//! The barcode read is the SECOND `--readFilesIn` file (STAR convention: +//! `--readFilesIn cDNA_read barcode_read`). It is never aligned — only parsed. + +pub mod count; +pub mod gene; +pub mod smartseq; +pub mod whitelist; + +pub use count::{UmiDedup, UmiFiltering, write_gene_matrix}; +pub use gene::{ + GeneAssignment, Region, SoloFeature, SoloStrand, VelocytoCategory, assign_gene_se, + classify_read, velocyto_category, +}; +pub use whitelist::{ + CbCandidate, CbMatch, CbMatchStats, CbMatchType, CbWhitelist, UmiCheck, check_umi, pack_barcode, +}; + +use crate::align::transcript::Transcript; +use crate::error::Error; +use crate::io::fastq::{EncodedRead, FastqReader, decode_base}; +use crate::params::{Parameters, SoloType}; +use crate::quant::GeneAnnotation; +use std::path::Path; +use std::sync::Mutex; +use std::sync::atomic::{AtomicU64, Ordering}; + +/// Cell-barcode + UMI read geometry. `Simple` is a single fixed-position CB + +/// UMI (`CB_UMI_Simple`); `Complex` assembles the CB from several fixed-position +/// segments (`CB_UMI_Complex`). All offsets are 0-based. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SoloBarcodeLayout { + Simple { + cb_start: usize, + cb_len: usize, + umi_start: usize, + umi_len: usize, + }, + /// Multi-segment CB: each `(start, len)` is one segment, concatenated in + /// order to form the cell barcode; `umi = (start, len)`. + Complex { + cb_segments: Vec<(usize, usize)>, + umi: (usize, usize), + }, +} + +/// Parse a `--soloCBposition`/`--soloUMIposition` spec +/// (`startAnchor_startDist_endAnchor_endDist`) into a 0-based `(start, len)`. +/// Only read-start anchoring (`anchor = 0`) is supported. 
+fn parse_position(spec: &str) -> Result<(usize, usize), Error> { + let f: Vec<&str> = spec.split('_').collect(); + if f.len() != 4 { + return Err(invalid_pos( + spec, + "expected startAnchor_startDist_endAnchor_endDist", + )); + } + let (sa, sd, ea, ed) = ( + f[0].parse::().ok(), + f[1].parse::().ok(), + f[2].parse::().ok(), + f[3].parse::().ok(), + ); + match (sa, sd, ea, ed) { + (Some(0), Some(sd), Some(0), Some(ed)) if sd >= 0 && ed >= sd => { + Ok((sd as usize, (ed - sd + 1) as usize)) + } + (Some(0), _, Some(0), _) => Err(invalid_pos(spec, "end < start")), + _ => Err(invalid_pos( + spec, + "only read-start anchoring (anchor=0) is supported", + )), + } +} + +fn invalid_pos(spec: &str, why: &str) -> Error { + Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("invalid position spec '{spec}': {why}"), + )) +} + +impl SoloBarcodeLayout { + /// Build the layout from CLI parameters. `CB_UMI_Complex` parses + /// `--soloCBposition`/`--soloUMIposition`; otherwise fixed Simple geometry. + pub fn from_params(params: &Parameters) -> Self { + if params.solo_type == SoloType::CbUmiComplex && !params.solo_cb_position.is_empty() { + let cb_segments = params + .solo_cb_position + .iter() + .filter_map(|s| parse_position(s).ok()) + .collect(); + let umi = parse_position(¶ms.solo_umi_position).unwrap_or((0, 0)); + return Self::Complex { cb_segments, umi }; + } + Self::Simple { + cb_start: (params.solo_cb_start.max(1) - 1) as usize, + cb_len: params.solo_cb_len as usize, + umi_start: (params.solo_umi_start.max(1) - 1) as usize, + umi_len: params.solo_umi_len as usize, + } + } + + /// Minimum barcode-read length required to extract the CB and UMI. 
+ pub fn min_read_len(&self) -> usize { + match self { + Self::Simple { + cb_start, + cb_len, + umi_start, + umi_len, + } => (cb_start + cb_len).max(umi_start + umi_len), + Self::Complex { cb_segments, umi } => cb_segments + .iter() + .map(|&(s, l)| s + l) + .chain(std::iter::once(umi.0 + umi.1)) + .max() + .unwrap_or(0), + } + } + + /// Extract the CB (concatenating segments for `Complex`) and UMI from one + /// barcode read. `None` if the read is shorter than [`Self::min_read_len`]. + pub fn extract(&self, barcode_read: &EncodedRead) -> Option { + let seq = &barcode_read.sequence; + let qual = &barcode_read.quality; + if seq.len() < self.min_read_len() { + return None; + } + match self { + Self::Simple { + cb_start, + cb_len, + umi_start, + umi_len, + } => Some(CellBarcode { + cb_seq: seq[*cb_start..cb_start + cb_len].to_vec(), + cb_qual: slice_or_empty(qual, *cb_start, *cb_len), + umi_seq: seq[*umi_start..umi_start + umi_len].to_vec(), + umi_qual: slice_or_empty(qual, *umi_start, *umi_len), + }), + Self::Complex { cb_segments, umi } => { + let mut cb_seq = Vec::new(); + let mut cb_qual = Vec::new(); + for &(s, l) in cb_segments { + cb_seq.extend_from_slice(&seq[s..s + l]); + cb_qual.extend_from_slice(&slice_or_empty(qual, s, l)); + } + Some(CellBarcode { + cb_seq, + cb_qual, + umi_seq: seq[umi.0..umi.0 + umi.1].to_vec(), + umi_qual: slice_or_empty(qual, umi.0, umi.1), + }) + } + } + } +} + +fn slice_or_empty(data: &[u8], start: usize, len: usize) -> Vec { + if start + len <= data.len() { + data[start..start + len].to_vec() + } else { + Vec::new() + } +} + +/// A cell barcode + UMI extracted from one barcode read. +/// +/// Sequences are stored in genome encoding (0=A, 1=C, 2=G, 3=T, 4=N) to match +/// the rest of the pipeline; qualities are raw Phred+33 ASCII bytes. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CellBarcode { + pub cb_seq: Vec, + pub cb_qual: Vec, + pub umi_seq: Vec, + pub umi_qual: Vec, +} + +impl CellBarcode { + /// True if the cell barcode contains an `N` (encoded 4) — such barcodes + /// cannot match a whitelist exactly. + pub fn cb_has_n(&self) -> bool { + self.cb_seq.contains(&4) + } + + /// True if the UMI contains an `N`. STARsolo discards reads whose UMI has + /// any ambiguous base. + pub fn umi_has_n(&self) -> bool { + self.umi_seq.contains(&4) + } + + /// Decode the cell barcode to an ASCII `ACGTN` string (for CB SAM tags and + /// `barcodes.tsv`). + pub fn cb_string(&self) -> String { + decode_seq(&self.cb_seq) + } + + /// Decode the UMI to an ASCII `ACGTN` string (for UB SAM tags). + pub fn umi_string(&self) -> String { + decode_seq(&self.umi_seq) + } +} + +fn decode_seq(encoded: &[u8]) -> String { + encoded.iter().map(|&b| decode_base(b) as char).collect() +} + +/// Reads cDNA reads and their paired barcode reads in lockstep from two FASTQ +/// files. The cDNA read flows into the normal alignment path; the barcode read +/// is parsed into a [`CellBarcode`] (or `None` when too short). +pub struct SoloReadReader { + cdna: FastqReader, + barcode: FastqReader, + layout: SoloBarcodeLayout, +} + +/// One cDNA read paired with its (optional) extracted barcode. +pub struct SoloRead { + pub cdna: EncodedRead, + /// `None` when the barcode read was too short to extract CB+UMI. + pub barcode: Option, +} + +impl SoloReadReader { + /// Open the cDNA and barcode FASTQ files for a solo run. + pub fn open( + cdna_path: &Path, + barcode_path: &Path, + layout: SoloBarcodeLayout, + decompress_cmd: Option<&str>, + ) -> Result { + Ok(Self { + cdna: FastqReader::open(cdna_path, decompress_cmd)?, + barcode: FastqReader::open(barcode_path, decompress_cmd)?, + layout, + }) + } + + /// Fetch the next paired (cDNA, barcode) read. Errors if the two files + /// have different lengths. 
+ pub fn next_read(&mut self) -> Result, Error> { + let cdna_opt = self.cdna.next_encoded()?; + let barcode_opt = self.barcode.next_encoded()?; + match (cdna_opt, barcode_opt) { + (Some(cdna), Some(bc)) => { + let barcode = self.layout.extract(&bc); + Ok(Some(SoloRead { cdna, barcode })) + } + (None, None) => Ok(None), + (Some(_), None) => Err(Error::from(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "solo: cDNA read file has more reads than the barcode read file", + ))), + (None, Some(_)) => Err(Error::from(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "solo: barcode read file has more reads than the cDNA read file", + ))), + } + } + + /// Read up to `batch_size` paired reads for parallel processing. + pub fn read_batch(&mut self, batch_size: usize) -> Result, Error> { + let mut batch = Vec::with_capacity(batch_size); + for _ in 0..batch_size { + match self.next_read()? { + Some(read) => batch.push(read), + None => break, + } + } + Ok(batch) + } +} + +/// Build a [`SoloReadReader`] from parameters, resolving the cDNA/barcode files +/// from `--readFilesIn`. Returns an error if solo is enabled but the read files +/// are missing (validation should have caught this earlier). 
+pub fn open_reader(params: &Parameters) -> Result { + debug_assert!(matches!( + params.solo_type, + SoloType::CbUmiSimple | SoloType::CbUmiComplex + )); + let cdna = params.cdna_read_file().ok_or_else(|| { + Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "solo: missing cDNA read file", + )) + })?; + let barcode = params.barcode_read_file().ok_or_else(|| { + Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "solo: missing barcode read file", + )) + })?; + let layout = SoloBarcodeLayout::from_params(params); + SoloReadReader::open(cdna, barcode, layout, params.read_files_command.as_deref()) +} + +// --------------------------------------------------------------------------- +// CellRanger4 adapter clipping (--clipAdapterType CellRanger4) +// --------------------------------------------------------------------------- + +/// The 10x template-switch oligo (TSO), clipped from the 5' of the cDNA read +/// under `--clipAdapterType CellRanger4`. Encoded 0=A,1=C,2=G,3=T. +const TSO_SEQ: &[u8] = b"AAGCAGTGGTATCAACGCAGAGTACATGGG"; + +/// Clip the 10x TSO from the 5' end and trim a 3' polyA tail of the cDNA read, +/// matching `--clipAdapterType CellRanger4`. Operates on encoded bases +/// (0=A..3=T,4=N) with parallel quality bytes. Returns the clipped read. +/// +/// Conservative thresholds (full-length TSO match ≤ 3 mismatches at the 5' +/// anchor; trailing polyA run ≥ 8) keep this a no-op on adapter-free reads. +pub fn clip_adapter_cr4(seq: &[u8], qual: &[u8]) -> (Vec, Vec) { + let mut start = 0usize; + let mut end = seq.len(); + + // 5' TSO: compare the read prefix against the full TSO; clip on a match. 
+ if seq.len() >= TSO_SEQ.len() { + let tso: Vec = TSO_SEQ + .iter() + .map(|&b| crate::io::fastq::encode_base(b)) + .collect(); + let mismatches = seq[..tso.len()] + .iter() + .zip(&tso) + .filter(|(a, b)| a != b) + .count(); + if mismatches <= 3 { + start = tso.len(); + } + } + + // 3' polyA: trim a trailing run of A (encoded 0) of length >= 8. + let mut run = 0usize; + while end > start && seq[end - 1] == 0 { + run += 1; + end -= 1; + } + if run < 8 { + end += run; // not a real polyA tail; keep those bases + } + + if start == 0 && end == seq.len() { + return (seq.to_vec(), qual.to_vec()); + } + ( + seq[start..end].to_vec(), + qual.get(start..end.min(qual.len())) + .map(<[u8]>::to_vec) + .unwrap_or_default(), + ) +} + +// --------------------------------------------------------------------------- +// Solo counting context + per-read processing (Phase 14.3) +// --------------------------------------------------------------------------- + +/// A fully-resolved per-read count record: one (cell, UMI, gene) observation. +/// These are collapsed by UMI per (cell, gene) into the count matrix (14.4). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SoloCountRecord { + /// Sorted whitelist index of the cell barcode. + pub cb: u32, + /// 2-bit packed UMI. + pub umi: u64, + /// Assigned gene index. + pub gene: u32, +} + +/// One (cell, UMI, splice-junction) observation for the `SJ` feature. The +/// junction is identified by its absolute intron coordinates; it is mapped to a +/// matrix row (the `SJ.out.tab` order) at output time. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SjCountRecord { + pub cb: u32, + pub umi: u64, + pub intron_start: u64, + pub intron_end: u64, +} + +/// One (cell, UMI, gene) observation for the `Velocyto` feature, tagged with the +/// read's spliced/unspliced/ambiguous category. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct VelocytoRecord { + pub cb: u32, + pub umi: u64, + pub gene: u32, + pub category: VelocytoCategory, +} + +/// A read whose cell barcode matched multiple whitelist entries by 1MM +/// (`1MM_multi`). Resolution to a single CB needs the global exact-count table +/// and is deferred to the collation stage (Phase 14.4). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SoloMultiRecord { + /// Candidate whitelist barcodes + mismatch quality. + pub candidates: Vec, + pub umi: u64, + pub gene: u32, +} + +/// A read that mapped to multiple genes (gene-ambiguous). Distributed across its +/// gene set by `--soloMultiMappers` into the `UniqueAndMult-*.mtx` matrices. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MultiGeneRecord { + pub cb: u32, + pub umi: u64, + pub genes: Vec, +} + +/// Thread-safe sink for the records produced during alignment. +#[derive(Default)] +pub struct SoloRecorder { + pub records: Mutex>, + pub multi_records: Mutex>, + /// Gene-ambiguous reads for `--soloMultiMappers` (resolved CB only). + pub multi_gene: Mutex>, +} + +impl SoloRecorder { + pub fn new() -> Self { + Self::default() + } + + /// Append a batch's records (called from the sequential write phase). + pub fn extend(&self, recs: Vec, multi: Vec) { + if !recs.is_empty() { + self.records.lock().unwrap().extend(recs); + } + if !multi.is_empty() { + self.multi_records.lock().unwrap().extend(multi); + } + } + + /// Number of fully-resolved count records collected so far. + pub fn n_records(&self) -> usize { + self.records.lock().unwrap().len() + } + + /// Number of deferred multi-CB records collected so far. + pub fn n_multi_records(&self) -> usize { + self.multi_records.lock().unwrap().len() + } +} + +/// Everything the alignment loop needs to quantify a solo run, shared as an +/// `Arc` across rayon threads. The gene model is built from `--sjdbGTFfile`; +/// the whitelist and stats are read concurrently (interior atomics). 
+pub struct SoloContext { + pub layout: SoloBarcodeLayout, + pub whitelist: CbWhitelist, + pub match_type: CbMatchType, + pub strand: SoloStrand, + pub gene_ann: GeneAnnotation, + pub stats: CbMatchStats, + /// Quantified features (`Gene`, `GeneFull`, …), each with its own recorder + /// and `Solo.out//raw/` output. Parallel to `recorders`. + pub features: Vec, + pub recorders: Vec, + /// Reads uniquely assigned to a gene per feature (parallel to `features`), + /// among valid-barcode reads — the STARsolo "Reads Mapped to : + /// Unique" metric. + pub feature_reads: Vec, + /// CellRanger-style positional mapping funnel over uniquely-mapped reads + /// (independent of barcode), populated only when both `Gene` and `GeneFull` + /// features run. + pub region_stats: RegionStats, + /// `--soloFeatures SJ`: collect per-cell splice-junction counts. + pub sj_enabled: bool, + /// (cell, UMI, junction) observations for the SJ feature. + pub sj_records: Mutex>, + /// `--soloFeatures Velocyto`: collect spliced/unspliced/ambiguous counts. + pub velocyto_enabled: bool, + /// (cell, UMI, gene, category) observations for the Velocyto feature. + pub velocyto_records: Mutex>, + /// `--soloMultiMappers` includes a non-`Unique` method → capture gene- + /// ambiguous reads for distribution into `UniqueAndMult-*.mtx`. + pub want_multi: bool, +} + +/// Per-region read tallies for the `Summary.csv` mapping funnel (uniquely-mapped +/// reads, mirroring CellRanger's "confidently mapped to ... regions"). +#[derive(Default)] +pub struct RegionStats { + pub exonic: AtomicU64, + pub intronic: AtomicU64, + pub intergenic: AtomicU64, + pub antisense: AtomicU64, +} + +/// What happened to one solo read — one `(record, multi)` per quantified +/// feature, parallel to [`SoloContext::features`]. 
+#[derive(Debug, Default)] +pub struct SoloReadOutcome { + pub per_feature: Vec, + /// SJ-feature records for this read (one per crossed junction); empty unless + /// `--soloFeatures SJ` and the read is uniquely mapped with a resolved CB. + pub sj: Vec, + /// Velocyto record for this read (resolved CB, gene-assigned), if enabled. + pub velocyto: Option, +} + +/// The record(s) one read produces for a single feature. +#[derive(Debug, Default)] +pub struct FeatureOutcome { + /// A resolved count record, if the read was fully assignable. + pub record: Option, + /// A deferred multi-CB record, if the CB was an unresolved 1MM_multi. + pub multi: Option, + /// A gene-ambiguous record (resolved CB), for `--soloMultiMappers`. + pub multi_gene: Option, +} + +impl SoloContext { + /// Build the solo context from parameters: load the whitelist and build the + /// gene model from `--sjdbGTFfile`. Call once before alignment. + pub fn build(params: &Parameters, genome: &crate::genome::Genome) -> Result { + let whitelist = if params.solo_type == SoloType::CbUmiComplex { + // One whitelist per CB segment → combined cartesian-product whitelist. + let paths: Vec = params + .solo_cb_whitelist + .iter() + .map(std::path::PathBuf::from) + .collect(); + log::info!( + "STARsolo CB_UMI_Complex: combining {} segment whitelists", + paths.len() + ); + let wl = CbWhitelist::load_complex(&paths)?; + log::info!("STARsolo: {} combined whitelist barcodes", wl.len()); + wl + } else { + match params.solo_cb_whitelist_path() { + Some(path) => { + log::info!( + "STARsolo: loading cell-barcode whitelist from {}", + path.display() + ); + let wl = CbWhitelist::load(&path)?; + log::info!("STARsolo: {} whitelist barcodes loaded", wl.len()); + wl + } + None => CbWhitelist::NoWhitelist { + len: params.solo_cb_len as usize, + }, + } + }; + + // Gene model from the GTF (validated to be present for Gene/GeneFull). 
+ let gtf_path = params.sjdb_gtf_file.as_ref().ok_or_else(|| { + Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "STARsolo Gene feature requires --sjdbGTFfile", + )) + })?; + let exons = crate::junction::gtf::parse_gtf_configured( + gtf_path, + ¶ms.sjdb_gtf_feature_exon, + ¶ms.sjdb_gtf_chr_prefix, + )?; + let gene_ann = GeneAnnotation::from_gtf_exons_configured( + &exons, + genome, + ¶ms.sjdb_gtf_tag_exon_parent_gene, + ); + log::info!( + "STARsolo: {} genes loaded from {}", + gene_ann.n_genes(), + gtf_path.display() + ); + + let strand: SoloStrand = params.solo_strand.parse().map_err(|e: String| { + Error::from(std::io::Error::new(std::io::ErrorKind::InvalidInput, e)) + })?; + + // Quantified gene features (Gene, GeneFull). Validation guarantees these + // parse; default to Gene if somehow empty. + let features: Vec = params + .solo_features + .iter() + .filter_map(|f| f.parse().ok()) + .collect(); + let features = if features.is_empty() { + vec![SoloFeature::Gene] + } else { + features + }; + let recorders = features.iter().map(|_| SoloRecorder::new()).collect(); + let feature_reads = features.iter().map(|_| AtomicU64::new(0)).collect(); + let sj_enabled = params.solo_features.iter().any(|f| f == "SJ"); + let velocyto_enabled = params.solo_features.iter().any(|f| f == "Velocyto"); + let want_multi = params.solo_multi_mappers.iter().any(|m| m != "Unique"); + + Ok(Self { + layout: SoloBarcodeLayout::from_params(params), + whitelist, + match_type: params.solo_cb_match_type(), + strand, + gene_ann, + stats: CbMatchStats::new(), + features, + recorders, + feature_reads, + region_stats: RegionStats::default(), + sj_enabled, + sj_records: Mutex::new(Vec::new()), + velocyto_enabled, + velocyto_records: Mutex::new(Vec::new()), + want_multi, + }) + } + + /// Process one solo read: match the cell barcode, validate the UMI, assign + /// a gene, and (on success) produce a count record. 
Stats are recorded + /// here; the returned records are appended to the recorder by the caller. + pub fn process_read( + &self, + cdna_transcripts: &[Transcript], + barcode: Option<&CellBarcode>, + junctions: &[(u64, u64)], + ) -> SoloReadOutcome { + let mut out = SoloReadOutcome::default(); + + // One-pass classification: the two overlap queries are shared between the + // per-feature gene assignment and the CellRanger-style mapping funnel, so + // this is no more work than the old per-feature `assign_gene_se` calls. + let want_exon = self.features.contains(&SoloFeature::Gene); + // Velocyto assigns its gene by gene-body overlap, so it needs `want_body`. + let want_body = self.features.contains(&SoloFeature::GeneFull) || self.velocyto_enabled; + let class = classify_read( + cdna_transcripts, + &self.gene_ann, + self.strand, + want_exon, + want_body, + self.want_multi, + ); + + // Mapping funnel: count uniquely-mapped reads by region (CellRanger's + // "confidently mapped" = MAPQ 255 ≈ a single alignment), independent of + // barcode validity. + if cdna_transcripts.len() == 1 { + match class.region { + Some(Region::Exonic) => { + self.region_stats.exonic.fetch_add(1, Ordering::Relaxed); + } + Some(Region::Intronic) => { + self.region_stats.intronic.fetch_add(1, Ordering::Relaxed); + } + Some(Region::Intergenic) => { + self.region_stats.intergenic.fetch_add(1, Ordering::Relaxed); + } + None => {} + } + if class.antisense { + self.region_stats.antisense.fetch_add(1, Ordering::Relaxed); + } + } + + // No barcode read (too short) → nothing to count (region already tallied). + let Some(bc) = barcode else { + return out; + }; + + // Cell-barcode match. 
+ let cb_match = self + .whitelist + .match_cb(&bc.cb_seq, &bc.cb_qual, self.match_type); + self.stats.record_cb(&cb_match); + + let cb_resolved: Option = match &cb_match { + CbMatch::Exact(idx) | CbMatch::Corrected(idx) => Some(*idx), + CbMatch::Multi(_) => None, // deferred to collation + CbMatch::NoMatch | CbMatch::NinCb | CbMatch::MultMatchRejected => return out, + }; + + // UMI validity. + let umi = match check_umi(&bc.umi_seq) { + UmiCheck::Ok(packed) => { + self.stats.record_umi(&UmiCheck::Ok(packed)); + packed + } + rejected => { + self.stats.record_umi(&rejected); + return out; + } + }; + + // SJ feature: record (cell, UMI, junction) for each crossed junction. + // Only for resolved CBs (1MM_multi deferral is not applied to SJ). + if self.sj_enabled + && !junctions.is_empty() + && let Some(cb) = cb_resolved + { + out.sj = junctions + .iter() + .map(|&(intron_start, intron_end)| SjCountRecord { + cb, + umi, + intron_start, + intron_end, + }) + .collect(); + } + + // Velocyto feature: gene from gene-body overlap, then classify the read + // spliced/unspliced/ambiguous. Resolved CB only. + if self.velocyto_enabled + && let Some(cb) = cb_resolved + && let GeneAssignment::Gene(gene) = class.gene_full + { + out.velocyto = Some(VelocytoRecord { + cb, + umi, + gene, + category: velocyto_category(cdna_transcripts, &self.gene_ann, gene), + }); + } + + // The CB match + UMI are shared across features; reuse the cached + // per-feature gene assignment from `classify_read`. One outcome/feature. + out.per_feature = self + .features + .iter() + .enumerate() + .map(|(fi, &feature)| { + let mut fo = FeatureOutcome::default(); + let assignment = match feature { + SoloFeature::Gene => class.gene, + SoloFeature::GeneFull => class.gene_full, + }; + let gene = match assignment { + GeneAssignment::Gene(g) => g, + GeneAssignment::Ambiguous => { + // Gene-ambiguous read: record its gene set for + // --soloMultiMappers distribution (resolved CB only). 
+ if let Some(cb) = cb_resolved { + let genes = match feature { + SoloFeature::Gene => &class.gene_multi, + SoloFeature::GeneFull => &class.gene_full_multi, + }; + if !genes.is_empty() { + fo.multi_gene = Some(MultiGeneRecord { + cb, + umi, + genes: genes.clone(), + }); + } + } + return fo; + } + GeneAssignment::NoFeature | GeneAssignment::Unmapped => return fo, + }; + // Reads uniquely mapped to a gene under this feature, among + // valid-barcode reads (STARsolo "Reads Mapped to "). + self.feature_reads[fi].fetch_add(1, Ordering::Relaxed); + match (cb_resolved, &cb_match) { + (Some(cb), _) => fo.record = Some(SoloCountRecord { cb, umi, gene }), + (None, CbMatch::Multi(cands)) => { + fo.multi = Some(SoloMultiRecord { + candidates: cands.clone(), + umi, + gene, + }); + } + (None, _) => unreachable!("non-multi unresolved CB returned early"), + } + fo + }) + .collect(); + out + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::io::fastq::encode_base; + + fn encoded_read(name: &str, seq: &str, qual: &str) -> EncodedRead { + EncodedRead { + name: name.to_string(), + sequence: seq.bytes().map(encode_base).collect(), + quality: qual.bytes().collect(), + } + } + + fn v2_layout() -> SoloBarcodeLayout { + // 10x v2: CB at 1..16 (16 bp), UMI at 17..26 (10 bp). 
+ SoloBarcodeLayout::Simple { + cb_start: 0, + cb_len: 16, + umi_start: 16, + umi_len: 10, + } + } + + #[test] + fn layout_from_params_converts_to_zero_based() { + let params = Parameters::try_parse_from([ + "rustar-aligner", + "--soloType", + "CB_UMI_Simple", + "--readFilesIn", + "cdna.fq", + "bc.fq", + "--sjdbGTFfile", + "genes.gtf", + "--soloCBwhitelist", + "wl.txt", + ]) + .unwrap(); + let layout = SoloBarcodeLayout::from_params(¶ms); + assert_eq!( + layout, + SoloBarcodeLayout::Simple { + cb_start: 0, + cb_len: 16, + umi_start: 16, + umi_len: 10, + } + ); + assert_eq!(layout.min_read_len(), 26); + } + + #[test] + fn complex_layout_assembles_segments() { + // Two CB segments [0..2] + [4..6] (skipping a 2bp linker), UMI [6..8]. + let layout = SoloBarcodeLayout::Complex { + cb_segments: vec![(0, 2), (4, 2)], + umi: (6, 2), + }; + let read = encoded_read("r", "AACCGGTT", "IIIIIIII"); + let bc = layout.extract(&read).unwrap(); + // CB = bases [0,1] ++ [4,5] = "AA" ++ "GG"; UMI = [6,7] = "TT". + assert_eq!( + bc.cb_seq, + "AAGG".bytes().map(encode_base).collect::>() + ); + assert_eq!( + bc.umi_seq, + "TT".bytes().map(encode_base).collect::>() + ); + } + + #[test] + fn parse_position_read_start() { + assert_eq!(parse_position("0_0_0_7").unwrap(), (0, 8)); + assert_eq!(parse_position("0_8_0_15").unwrap(), (8, 8)); + assert!(parse_position("2_0_2_7").is_err()); // adapter anchor unsupported + assert!(parse_position("0_5_0_2").is_err()); // end < start + } + + #[test] + fn extract_v2_barcode() { + let layout = v2_layout(); + // 16bp CB = AAAAAAAACCCCCCCC, 10bp UMI = GGGGGTTTTT. 
+ let read = encoded_read( + "bc1", + "AAAAAAAACCCCCCCCGGGGGTTTTT", + "IIIIIIIIIIIIIIIIJJJJJJJJJJ", + ); + let bc = layout.extract(&read).expect("should extract"); + assert_eq!(bc.cb_string(), "AAAAAAAACCCCCCCC"); + assert_eq!(bc.umi_string(), "GGGGGTTTTT"); + assert_eq!(bc.cb_qual.len(), 16); + assert_eq!(bc.umi_qual.len(), 10); + assert!(!bc.cb_has_n()); + assert!(!bc.umi_has_n()); + } + + #[test] + fn extract_too_short_returns_none() { + let layout = v2_layout(); + let read = encoded_read("short", "AAAAAAAACCCC", "IIIIIIIIIIII"); + assert!(layout.extract(&read).is_none()); + } + + #[test] + fn detects_n_in_cb_and_umi() { + let layout = v2_layout(); + let read = encoded_read( + "bcN", + "AAAAAAAANCCCCCCCGGGGGTTTTN", + "IIIIIIIIIIIIIIIIJJJJJJJJJJ", + ); + let bc = layout.extract(&read).unwrap(); + assert!(bc.cb_has_n()); + assert!(bc.umi_has_n()); + } + + #[test] + fn reader_pairs_cdna_and_barcode() { + use std::io::Write; + use tempfile::NamedTempFile; + + let mut cdna = NamedTempFile::new().unwrap(); + writeln!(cdna, "@r1\nACGTACGTAC\n+\nIIIIIIIIII").unwrap(); + writeln!(cdna, "@r2\nTTTTGGGGCC\n+\nIIIIIIIIII").unwrap(); + cdna.flush().unwrap(); + + let mut bc = NamedTempFile::new().unwrap(); + writeln!( + bc, + "@r1\nAAAAAAAACCCCCCCCGGGGGTTTTT\n+\nIIIIIIIIIIIIIIIIJJJJJJJJJJ" + ) + .unwrap(); + writeln!( + bc, + "@r2\nGGGGGGGGTTTTTTTTACGTACGTAC\n+\nIIIIIIIIIIIIIIIIJJJJJJJJJJ" + ) + .unwrap(); + bc.flush().unwrap(); + + let mut reader = SoloReadReader::open(cdna.path(), bc.path(), v2_layout(), None).unwrap(); + let batch = reader.read_batch(10).unwrap(); + assert_eq!(batch.len(), 2); + assert_eq!(batch[0].cdna.name, "r1"); + assert_eq!( + batch[0].barcode.as_ref().unwrap().cb_string(), + "AAAAAAAACCCCCCCC" + ); + assert_eq!( + batch[1].barcode.as_ref().unwrap().umi_string(), + "ACGTACGTAC" + ); + } + + #[test] + fn reader_length_mismatch_errors() { + use std::io::Write; + use tempfile::NamedTempFile; + + let mut cdna = NamedTempFile::new().unwrap(); + 
writeln!(cdna, "@r1\nACGT\n+\nIIII").unwrap(); + writeln!(cdna, "@r2\nTTTT\n+\nIIII").unwrap(); + cdna.flush().unwrap(); + + let mut bc = NamedTempFile::new().unwrap(); + writeln!( + bc, + "@r1\nAAAAAAAACCCCCCCCGGGGGTTTTT\n+\nIIIIIIIIIIIIIIIIJJJJJJJJJJ" + ) + .unwrap(); + bc.flush().unwrap(); + + let mut reader = SoloReadReader::open(cdna.path(), bc.path(), v2_layout(), None).unwrap(); + assert!(reader.read_batch(10).is_err()); + } +} diff --git a/src/solo/smartseq.rs b/src/solo/smartseq.rs new file mode 100644 index 0000000..207468b --- /dev/null +++ b/src/solo/smartseq.rs @@ -0,0 +1,138 @@ +//! `--soloType SmartSeq` — plate-based full-length protocols (Smart-seq2). +//! +//! There are no cell barcodes or UMIs in the reads. Each plate well is a +//! separate library given by a `--readFilesManifest` line +//! (`read1 read2 cellID`); the cell identity is the manifest cellID, +//! and a gene's count for a cell is the number of its uniquely-gene-assigned +//! reads (no UMI deduplication). Output mirrors the droplet path: +//! `Solo.out/Gene/raw/{matrix.mtx, barcodes.tsv (cell IDs), features.tsv}`. +//! +//! Supports both single-end manifests (`read2 = -`, read counts) and paired-end +//! (`read2` = mate-2 file, fragment counts via paired alignment). + +use crate::error::Error; +use std::path::{Path, PathBuf}; +use std::sync::Mutex; + +/// One plate-well cell from the manifest. +pub struct SmartSeqCell { + pub read1: PathBuf, + /// Mate-2 file for paired-end SmartSeq; `None` for single-end (`read2 = -`). + pub read2: Option, + pub cell_id: String, +} + +/// Parse a `--readFilesManifest` TSV into per-cell entries. Lines are +/// `read1 read2 cellID`; blank lines and `#` comments are skipped. +/// `read2 = -` is single-end; any other value is the mate-2 file (paired-end). 
+pub fn parse_manifest(path: &Path) -> Result, Error> { + let text = std::fs::read_to_string(path).map_err(|e| Error::io(e, path))?; + let mut cells = Vec::new(); + for (lineno, line) in text.lines().enumerate() { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { + continue; + } + let f: Vec<&str> = line.split('\t').collect(); + if f.len() < 3 { + return Err(invalid(format!( + "readFilesManifest line {}: expected 'read1read2cellID', got {:?}", + lineno + 1, + line + ))); + } + cells.push(SmartSeqCell { + read1: PathBuf::from(f[0]), + read2: (f[1] != "-").then(|| PathBuf::from(f[1])), + cell_id: f[2].to_string(), + }); + } + if cells.is_empty() { + return Err(invalid(format!( + "readFilesManifest {} has no cell entries", + path.display() + ))); + } + Ok(cells) +} + +fn invalid(msg: String) -> Error { + Error::from(std::io::Error::new(std::io::ErrorKind::InvalidInput, msg)) +} + +/// Per-cell, per-gene read counts for a SmartSeq run. `cells` is the manifest +/// order (the matrix column order); `counts[cell]` maps gene → read count. +pub struct SmartSeqCounts { + pub cell_ids: Vec, + pub counts: Vec>>, + pub n_genes: usize, +} + +impl SmartSeqCounts { + pub fn new(cell_ids: Vec, n_genes: usize) -> Self { + let counts = (0..cell_ids.len()) + .map(|_| Mutex::new(std::collections::HashMap::new())) + .collect(); + Self { + cell_ids, + counts, + n_genes, + } + } + + /// Add `+1` to (cell, gene) for one uniquely-assigned read. + pub fn add(&self, cell: usize, gene: u32) { + *self.counts[cell].lock().unwrap().entry(gene).or_insert(0) += 1; + } + + /// Write `Solo.out/Gene/raw/{matrix.mtx, barcodes.tsv, features.tsv}` — + /// genes × cells, integer read counts. `gzip` appends `.gz`. + pub fn write_matrix( + &self, + raw_dir: &Path, + gene_ids: &[String], + gzip: bool, + ) -> Result { + std::fs::create_dir_all(raw_dir).map_err(|e| Error::io(e, raw_dir))?; + + // features.tsv (CellRanger v3 layout: id, name, "Gene Expression"). 
+ crate::solo::count::write_file(&raw_dir.join("features.tsv"), gzip, |w| { + for id in gene_ids { + writeln!(w, "{id}\t{id}\tGene Expression").map_err(|e| Error::io(e, raw_dir))?; + } + Ok(()) + })?; + // barcodes.tsv = the manifest cell IDs (one per matrix column). + crate::solo::count::write_file(&raw_dir.join("barcodes.tsv"), gzip, |w| { + for cid in &self.cell_ids { + writeln!(w, "{cid}").map_err(|e| Error::io(e, raw_dir))?; + } + Ok(()) + })?; + + // matrix.mtx — collect entries cell-ascending, gene-ascending. + let mut nnz = 0usize; + let path = raw_dir.join("matrix.mtx"); + // Pre-count nnz. + for c in &self.counts { + nnz += c.lock().unwrap().len(); + } + crate::solo::count::write_file(&path, gzip, |w| { + writeln!(w, "%%MatrixMarket matrix coordinate integer general") + .map_err(|e| Error::io(e, &path))?; + writeln!(w, "%").map_err(|e| Error::io(e, &path))?; + writeln!(w, "{} {} {}", self.n_genes, self.cell_ids.len(), nnz) + .map_err(|e| Error::io(e, &path))?; + for (ci, cell) in self.counts.iter().enumerate() { + let map = cell.lock().unwrap(); + let mut entries: Vec<(u32, u64)> = map.iter().map(|(&g, &c)| (g, c)).collect(); + entries.sort_unstable_by_key(|&(g, _)| g); + for (g, c) in entries { + writeln!(w, "{} {} {}", g + 1, ci + 1, c).map_err(|e| Error::io(e, &path))?; + } + } + Ok(()) + })?; + Ok(nnz) + } +} diff --git a/src/solo/whitelist.rs b/src/solo/whitelist.rs new file mode 100644 index 0000000..4023836 --- /dev/null +++ b/src/solo/whitelist.rs @@ -0,0 +1,740 @@ +//! Cell-barcode whitelist loading and read-stage CB/UMI matching (Phase 14.2). +//! +//! Faithful port of STAR's `SoloReadBarcode_getCBandUMI.cpp` read stage: +//! barcodes are 2-bit packed (seq[0] in the high bits) into a `u64` and the +//! whitelist is a sorted array searched by binary search. Exact match, +//! single-N correction, and 1-mismatch (1MM / 1MM_multi) correction follow +//! STAR's enumeration exactly. +//! +//! 
The 1MM_multi *posterior* resolution (count + quality weighted) is a +//! collation-stage concern and is deferred to Phase 14.4 — here a multi-match +//! read records all candidate whitelist indices plus the mismatch-position +//! quality, exactly as STAR's `cbMatchString`. + +use crate::error::Error; +use crate::io::fastq::{decode_base, encode_base}; +use flate2::read::GzDecoder; +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::path::Path; +use std::str::FromStr; +use std::sync::atomic::{AtomicU64, Ordering}; + +/// Maximum barcode length representable in a `u64` (32 × 2-bit bases). +pub const CB_LEN_MAX: usize = 32; + +// --------------------------------------------------------------------------- +// Barcode packing +// --------------------------------------------------------------------------- + +/// Result of packing an encoded barcode into a `u64`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PackResult { + /// No ambiguous bases; the packed value. + NoN(u64), + /// Exactly one `N`; `packed` has `A` (0) at the N position. + OneN { packed: u64, pos: usize }, + /// More than one `N` — uncorrectable. + ManyN, +} + +/// 2-bit pack an encoded barcode (`0=A,1=C,2=G,3=T,4=N`) with `seq[0]` in the +/// high bits, matching STAR's `convertNuclStrToInt64`. +pub fn pack_barcode(seq: &[u8]) -> PackResult { + let len = seq.len(); + let mut packed: u64 = 0; + let mut n_pos: Option = None; + let mut n_count = 0usize; + for (i, &b) in seq.iter().enumerate() { + let shift = 2 * (len - 1 - i); + if b >= 4 { + n_count += 1; + if n_count > 1 { + return PackResult::ManyN; + } + n_pos = Some(i); + // leave 0 (A) at this position; correction substitutes all 4 bases + } else { + packed |= (b as u64) << shift; + } + } + match n_pos { + None => PackResult::NoN(packed), + Some(pos) => PackResult::OneN { packed, pos }, + } +} + +/// Unpack a `u64` of `len` 2-bit bases back to an ASCII `ACGT` string +/// (`seq[0]` from the high bits). 
+pub fn unpack_barcode(packed: u64, len: usize) -> String { + (0..len) + .map(|i| { + let shift = 2 * (len - 1 - i); + decode_base(((packed >> shift) & 0b11) as u8) as char + }) + .collect() +} + +/// Bit shift for the base at sequence index `pos` in a `len`-base packing. +#[inline] +fn shift_for(pos: usize, len: usize) -> u32 { + (2 * (len - 1 - pos)) as u32 +} + +// --------------------------------------------------------------------------- +// Match-type configuration (--soloCBmatchWLtype) +// --------------------------------------------------------------------------- + +/// Flags decoded from `--soloCBmatchWLtype`. Mirrors STAR's `CBmatchWL` +/// boolean fields one-for-one, so the multiple bools are intentional. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(clippy::struct_excessive_bools)] +pub struct CbMatchType { + /// Allow a single mismatch to the whitelist. + pub mm1: bool, + /// Keep multiple 1MM candidates for posterior resolution. + pub mm1_multi: bool, + /// Allow multiple matches for the N-substitution path. + pub mm1_multi_nbase: bool, + /// Add pseudocounts in posterior resolution (collation stage). 
+ pub pseudocounts: bool, +} + +impl FromStr for CbMatchType { + type Err = String; + fn from_str(s: &str) -> Result { + match s { + "Exact" => Ok(Self { + mm1: false, + mm1_multi: false, + mm1_multi_nbase: false, + pseudocounts: false, + }), + "1MM" => Ok(Self { + mm1: true, + mm1_multi: false, + mm1_multi_nbase: false, + pseudocounts: false, + }), + "1MM_multi" => Ok(Self { + mm1: true, + mm1_multi: true, + mm1_multi_nbase: false, + pseudocounts: false, + }), + "1MM_multi_pseudocounts" => Ok(Self { + mm1: true, + mm1_multi: true, + mm1_multi_nbase: false, + pseudocounts: true, + }), + "1MM_multi_Nbase_pseudocounts" => Ok(Self { + mm1: true, + mm1_multi: true, + mm1_multi_nbase: true, + pseudocounts: true, + }), + _ => Err(format!( + "unknown soloCBmatchWLtype '{s}'; expected Exact, 1MM, 1MM_multi, 1MM_multi_pseudocounts, or 1MM_multi_Nbase_pseudocounts" + )), + } + } +} + +// --------------------------------------------------------------------------- +// Match result +// --------------------------------------------------------------------------- + +/// One candidate whitelist barcode reachable by a single edit, plus the quality +/// of the mismatched base (for posterior resolution at collation). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct CbCandidate { + /// Index into the sorted whitelist. + pub wl_index: u32, + /// 0-based mismatch position in the read barcode. + pub mismatch_pos: usize, + /// Raw Phred+33 quality byte at the mismatch position. + pub mismatch_qual: u8, +} + +/// Outcome of matching one cell barcode to the whitelist. The negative STAR +/// `cbMatch` codes map to the rejection variants. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CbMatch { + /// Exact whitelist hit (cbMatch=0); carries the sorted whitelist index. + Exact(u32), + /// Unambiguous single-edit correction (cbMatch=1). + Corrected(u32), + /// Multiple 1MM candidates kept for later posterior resolution (cbMatch>1). 
+ Multi(Vec), + /// No whitelist match within one edit (cbMatch=-1). + NoMatch, + /// More than one `N` in the barcode (cbMatch=-2). + NinCb, + /// >1 whitelist match but `mm1_multi` not enabled (cbMatch=-3). + MultMatchRejected, +} + +// --------------------------------------------------------------------------- +// UMI validity (matches STAR umiCheck=-23 / -24) +// --------------------------------------------------------------------------- + +/// Outcome of validating a UMI. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum UmiCheck { + /// Valid UMI; carries the packed value. + Ok(u64), + /// Contains an `N` (cbMatch=-23). + NinUmi, + /// Exact homopolymer, e.g. all-A (cbMatch=-24). + Homopolymer, +} + +/// Validate a UMI: reject any `N`, then reject exact homopolymers. +pub fn check_umi(umi_seq: &[u8]) -> UmiCheck { + match pack_barcode(umi_seq) { + PackResult::ManyN | PackResult::OneN { .. } => UmiCheck::NinUmi, + PackResult::NoN(packed) => { + if is_homopolymer(umi_seq) { + UmiCheck::Homopolymer + } else { + UmiCheck::Ok(packed) + } + } + } +} + +fn is_homopolymer(seq: &[u8]) -> bool { + match seq.first() { + None => false, + Some(&first) => seq.iter().all(|&b| b == first), + } +} + +// --------------------------------------------------------------------------- +// Whitelist +// --------------------------------------------------------------------------- + +/// Cell-barcode whitelist. `List` is an explicit, sorted, de-duplicated set of +/// packed barcodes; `NoWhitelist` corresponds to `--soloCBwhitelist None`. +pub enum CbWhitelist { + List { + /// Sorted unique packed barcodes (binary-search target). + sorted: Vec, + /// `orig_index[k]` = line number of `sorted[k]` in the whitelist file, + /// for `barcodes.tsv` column ordering (Phase 14.4). + orig_index: Vec, + /// Per-whitelist-index exact-match read counts (posterior prior). + exact_counts: Vec, + /// Barcode length in bases. 
+ len: usize, + }, + /// `--soloCBwhitelist None`: keep every valid (N-free) barcode as observed. + NoWhitelist { len: usize }, +} + +impl CbWhitelist { + /// Number of whitelist barcodes (0 for `NoWhitelist`). + pub fn len(&self) -> usize { + match self { + Self::List { sorted, .. } => sorted.len(), + Self::NoWhitelist { .. } => 0, + } + } + + /// True if the whitelist has no barcodes. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Barcode length in bases. + pub fn barcode_len(&self) -> usize { + match self { + Self::List { len, .. } | Self::NoWhitelist { len } => *len, + } + } + + /// Decode the whitelist barcode at sorted index `idx` to an ASCII string. + pub fn barcode_string(&self, idx: u32) -> Option { + match self { + Self::List { sorted, len, .. } => { + sorted.get(idx as usize).map(|&p| unpack_barcode(p, *len)) + } + Self::NoWhitelist { .. } => None, + } + } + + /// Append the ASCII `ACGT` barcode at sorted index `idx` to `out` without + /// allocating a `String` — used when writing the full whitelist to + /// `barcodes.tsv` (millions of lines). Appends nothing for an out-of-range + /// index or `NoWhitelist`. + pub fn unpack_barcode_into(&self, idx: u32, out: &mut Vec) { + if let Self::List { sorted, len, .. } = self + && let Some(&packed) = sorted.get(idx as usize) + { + for i in 0..*len { + let shift = 2 * (*len - 1 - i); + out.push(decode_base(((packed >> shift) & 0b11) as u8)); + } + } + } + + /// Load a whitelist from a file (plain or gzip). One barcode per line; + /// blank lines ignored. Barcodes are encoded, packed, sorted, de-duplicated. + pub fn load(path: &Path) -> Result { + let (packed, len) = Self::load_packed(path)?; + Ok(Self::from_packed_list(packed, len)) + } + + /// Build a `List` whitelist from packed barcodes (sort + dedup + index). 
+ pub fn from_packed_list(packed: Vec, len: usize) -> Self { + let mut indexed: Vec<(u64, u32)> = packed + .into_iter() + .enumerate() + .map(|(i, p)| (p, i as u32)) + .collect(); + indexed.sort_unstable_by_key(|&(p, _)| p); + indexed.dedup_by_key(|&mut (p, _)| p); + let sorted: Vec = indexed.iter().map(|&(p, _)| p).collect(); + let orig_index: Vec = indexed.iter().map(|&(_, i)| i).collect(); + let exact_counts = (0..sorted.len()).map(|_| AtomicU64::new(0)).collect(); + Self::List { + sorted, + orig_index, + exact_counts, + len, + } + } + + /// `CB_UMI_Complex`: combine per-segment whitelists into one whitelist of + /// concatenated barcodes (the cartesian product, segment order = file order). + /// Matching the assembled CB against this is equivalent to STARsolo's + /// per-segment matching for both Exact and 1MM (a 1MM in the concatenation is + /// a 1MM in exactly one segment). Errors if the combined length exceeds 32. + pub fn load_complex(paths: &[std::path::PathBuf]) -> Result { + let segs: Vec<(Vec, usize)> = paths + .iter() + .map(|p| Self::load_packed(p)) + .collect::>()?; + let total_len: usize = segs.iter().map(|(_, l)| l).sum(); + if total_len == 0 || total_len > CB_LEN_MAX { + return Err(Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("combined CB length {total_len} out of range (1..={CB_LEN_MAX})"), + ))); + } + let n_combos: usize = segs.iter().map(|(p, _)| p.len()).product(); + if n_combos > 100_000_000 { + return Err(Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("CB_UMI_Complex whitelist product is {n_combos} barcodes (too large)"), + ))); + } + let mut combined: Vec = vec![0]; + for (packed, len) in &segs { + let mut next = Vec::with_capacity(combined.len() * packed.len()); + for &c in &combined { + for &p in packed { + next.push((c << (2 * len)) | p); + } + } + combined = next; + } + Ok(Self::from_packed_list(combined, total_len)) + } + + /// Read a whitelist file into raw packed 
barcodes + barcode length. + fn load_packed(path: &Path) -> Result<(Vec, usize), Error> { + let reader = open_maybe_gzip(path)?; + let mut packed: Vec = Vec::new(); + let mut len: usize = 0; + for (lineno, line) in reader.lines().enumerate() { + let line = line.map_err(Error::from)?; + let bc = line.trim(); + if bc.is_empty() { + continue; + } + // STARsolo whitelists may carry a second column (e.g. translated + // barcodes for multi-ome); take the first whitespace token. + let bc = bc.split_whitespace().next().unwrap_or(""); + if bc.is_empty() { + continue; + } + if len == 0 { + len = bc.len(); + if len == 0 || len > CB_LEN_MAX { + return Err(Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("whitelist barcode length {len} out of range (1..={CB_LEN_MAX})"), + ))); + } + } else if bc.len() != len { + return Err(Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!( + "whitelist barcode on line {} has length {} (expected {len})", + lineno + 1, + bc.len() + ), + ))); + } + let encoded: Vec = bc.bytes().map(encode_base).collect(); + match pack_barcode(&encoded) { + PackResult::NoN(p) => packed.push(p), + _ => { + return Err(Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("whitelist barcode '{bc}' on line {} contains N", lineno + 1), + ))); + } + } + } + if packed.is_empty() { + return Err(Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "whitelist is empty", + ))); + } + Ok((packed, len)) + } + + /// Binary-search the sorted whitelist for `packed`; returns the sorted index. + fn search(&self, packed: u64) -> Option { + match self { + Self::List { sorted, .. } => sorted.binary_search(&packed).ok().map(|i| i as u32), + Self::NoWhitelist { .. } => None, + } + } + + /// Increment the exact-match count for sorted whitelist index `idx`. + fn bump_exact(&self, idx: u32) { + if let Self::List { exact_counts, .. 
} = self { + exact_counts[idx as usize].fetch_add(1, Ordering::Relaxed); + } + } + + /// Snapshot of exact-match counts per sorted whitelist index (for the + /// Phase 14.4 posterior). Empty for `NoWhitelist`. + pub fn exact_count_snapshot(&self) -> Vec { + match self { + Self::List { exact_counts, .. } => exact_counts + .iter() + .map(|c| c.load(Ordering::Relaxed)) + .collect(), + Self::NoWhitelist { .. } => Vec::new(), + } + } + + /// Match one cell barcode against the whitelist following STAR's read stage. + /// + /// `cb_seq` is encoded (`0..=4`); `cb_qual` is raw Phred+33 (parallel to + /// `cb_seq`). On an exact hit the whitelist's exact-count is incremented. + pub fn match_cb(&self, cb_seq: &[u8], cb_qual: &[u8], match_type: CbMatchType) -> CbMatch { + let len = cb_seq.len(); + match self { + Self::NoWhitelist { .. } => match pack_barcode(cb_seq) { + // No whitelist: every N-free barcode is its own "cell". We + // cannot return a stable index without a whitelist, so callers + // treat NoWhitelist specially; report NoMatch for N-containing. + PackResult::NoN(_) => CbMatch::Exact(0), + _ => CbMatch::NinCb, + }, + Self::List { .. } => match pack_barcode(cb_seq) { + PackResult::ManyN => CbMatch::NinCb, + PackResult::NoN(packed) => { + if let Some(idx) = self.search(packed) { + self.bump_exact(idx); + return CbMatch::Exact(idx); + } + if !match_type.mm1 { + return CbMatch::NoMatch; + } + // 1MM: every position × the 3 alternate bases. 
+ let mut candidates: Vec = Vec::new(); + for pos in 0..len { + let shift = shift_for(pos, len); + let orig = (packed >> shift) & 0b11; + for alt in 0u64..4 { + if alt == orig { + continue; + } + let cand = (packed & !(0b11 << shift)) | (alt << shift); + if let Some(idx) = self.search(cand) { + candidates.push(CbCandidate { + wl_index: idx, + mismatch_pos: pos, + mismatch_qual: qual_at(cb_qual, pos), + }); + } + } + } + Self::resolve(candidates, match_type.mm1_multi) + } + PackResult::OneN { packed, pos } => { + if !match_type.mm1 { + return CbMatch::NoMatch; + } + // Substitute all 4 bases at the single N position. + let shift = shift_for(pos, len); + let mut candidates: Vec = Vec::new(); + for base in 0u64..4 { + let cand = (packed & !(0b11 << shift)) | (base << shift); + if let Some(idx) = self.search(cand) { + candidates.push(CbCandidate { + wl_index: idx, + mismatch_pos: pos, + mismatch_qual: qual_at(cb_qual, pos), + }); + } + } + Self::resolve(candidates, match_type.mm1_multi_nbase) + } + }, + } + } + + /// Turn a candidate list into a [`CbMatch`], honoring the multi flag. + fn resolve(candidates: Vec, allow_multi: bool) -> CbMatch { + match candidates.len() { + 0 => CbMatch::NoMatch, + 1 => CbMatch::Corrected(candidates[0].wl_index), + _ => { + if allow_multi { + CbMatch::Multi(candidates) + } else { + CbMatch::MultMatchRejected + } + } + } + } +} + +#[inline] +fn qual_at(qual: &[u8], pos: usize) -> u8 { + qual.get(pos).copied().unwrap_or(b'!') // '!' = Phred 0 +} + +/// Open a file, transparently decompressing `.gz`. 
+fn open_maybe_gzip(path: &Path) -> Result, Error> { + let file = File::open(path).map_err(|e| Error::io(e, path))?; + let is_gz = path + .extension() + .is_some_and(|e| e.eq_ignore_ascii_case("gz")); + if is_gz { + Ok(Box::new(BufReader::new(GzDecoder::new(file)))) + } else { + Ok(Box::new(BufReader::new(file))) + } +} + +// --------------------------------------------------------------------------- +// Stats (STAR cbMatch categories) +// --------------------------------------------------------------------------- + +/// Per-run barcode-matching statistics, mirroring STAR's `SoloReadBarcodeStats`. +#[derive(Debug, Default)] +pub struct CbMatchStats { + pub yes_exact: AtomicU64, + pub yes_one_mm: AtomicU64, + pub yes_mult_mm: AtomicU64, + pub no_match: AtomicU64, + pub n_in_cb: AtomicU64, + pub mult_rejected: AtomicU64, + pub n_in_umi: AtomicU64, + pub umi_homopolymer: AtomicU64, +} + +impl CbMatchStats { + pub fn new() -> Self { + Self::default() + } + + /// Record one CB match outcome. + pub fn record_cb(&self, m: &CbMatch) { + let c = match m { + CbMatch::Exact(_) => &self.yes_exact, + CbMatch::Corrected(_) => &self.yes_one_mm, + CbMatch::Multi(_) => &self.yes_mult_mm, + CbMatch::NoMatch => &self.no_match, + CbMatch::NinCb => &self.n_in_cb, + CbMatch::MultMatchRejected => &self.mult_rejected, + }; + c.fetch_add(1, Ordering::Relaxed); + } + + /// Record one UMI check outcome (only the rejection cases are counted). 
+ pub fn record_umi(&self, u: &UmiCheck) { + match u { + UmiCheck::NinUmi => { + self.n_in_umi.fetch_add(1, Ordering::Relaxed); + } + UmiCheck::Homopolymer => { + self.umi_homopolymer.fetch_add(1, Ordering::Relaxed); + } + UmiCheck::Ok(_) => {} + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use tempfile::NamedTempFile; + + fn enc(s: &str) -> Vec { + s.bytes().map(encode_base).collect() + } + + fn write_wl(barcodes: &[&str]) -> NamedTempFile { + let mut f = NamedTempFile::new().unwrap(); + for b in barcodes { + writeln!(f, "{b}").unwrap(); + } + f.flush().unwrap(); + f + } + + #[test] + fn pack_roundtrip() { + let s = "ACGTACGT"; + match pack_barcode(&enc(s)) { + PackResult::NoN(p) => assert_eq!(unpack_barcode(p, 8), s), + _ => panic!("should pack cleanly"), + } + } + + #[test] + fn pack_detects_one_and_many_n() { + assert!(matches!( + pack_barcode(&enc("ACNT")), + PackResult::OneN { pos: 2, .. } + )); + assert_eq!(pack_barcode(&enc("ANNT")), PackResult::ManyN); + } + + #[test] + fn exact_match_and_count() { + let f = write_wl(&["AAAA", "ACGT", "TTTT"]); + let wl = CbWhitelist::load(f.path()).unwrap(); + let t = CbMatchType::from_str("1MM_multi").unwrap(); + let m = wl.match_cb(&enc("ACGT"), b"IIII", t); + match m { + CbMatch::Exact(idx) => assert_eq!(wl.barcode_string(idx).unwrap(), "ACGT"), + other => panic!("expected exact, got {other:?}"), + } + let counts = wl.exact_count_snapshot(); + assert_eq!(counts.iter().sum::(), 1); + } + + #[test] + fn single_mismatch_correction() { + let f = write_wl(&["AAAA", "ACGT", "TTTT"]); + let wl = CbWhitelist::load(f.path()).unwrap(); + let t = CbMatchType::from_str("1MM").unwrap(); + // ACGA differs from ACGT at last position only. 
+ let m = wl.match_cb(&enc("ACGA"), b"IIII", t); + match m { + CbMatch::Corrected(idx) => assert_eq!(wl.barcode_string(idx).unwrap(), "ACGT"), + other => panic!("expected corrected, got {other:?}"), + } + } + + #[test] + fn ambiguous_multi_match_behavior() { + // AAAA and CAAA both within 1MM of NAAA-ish read "GAAA"? Use TAAA read: + // candidates AAAA (pos0 T->A) and CAAA (pos0 T->C). Both in WL. + let f = write_wl(&["AAAA", "CAAA"]); + let wl = CbWhitelist::load(f.path()).unwrap(); + + // 1MM (no multi): rejected as ambiguous. + let rej = wl.match_cb(&enc("TAAA"), b"IIII", CbMatchType::from_str("1MM").unwrap()); + assert_eq!(rej, CbMatch::MultMatchRejected); + + // 1MM_multi: both candidates kept for later resolution. + let multi = wl.match_cb( + &enc("TAAA"), + b"IIII", + CbMatchType::from_str("1MM_multi").unwrap(), + ); + match multi { + CbMatch::Multi(c) => assert_eq!(c.len(), 2), + other => panic!("expected multi, got {other:?}"), + } + } + + #[test] + fn no_match_when_too_far() { + let f = write_wl(&["AAAA", "TTTT"]); + let wl = CbWhitelist::load(f.path()).unwrap(); + let t = CbMatchType::from_str("1MM_multi").unwrap(); + // GGGG is >1 edit from both. + assert_eq!(wl.match_cb(&enc("GGGG"), b"IIII", t), CbMatch::NoMatch); + } + + #[test] + fn n_correction_single() { + let f = write_wl(&["AAAA", "ACGT"]); + let wl = CbWhitelist::load(f.path()).unwrap(); + let t = CbMatchType::from_str("1MM_multi").unwrap(); + // ACGN → only ACGT matches among the 4 substitutions. 
+ let m = wl.match_cb(&enc("ACGN"), b"IIII", t); + match m { + CbMatch::Corrected(idx) => assert_eq!(wl.barcode_string(idx).unwrap(), "ACGT"), + other => panic!("expected corrected, got {other:?}"), + } + } + + #[test] + fn many_n_rejected() { + let f = write_wl(&["AAAA"]); + let wl = CbWhitelist::load(f.path()).unwrap(); + let t = CbMatchType::from_str("1MM_multi").unwrap(); + assert_eq!(wl.match_cb(&enc("NNAA"), b"IIII", t), CbMatch::NinCb); + } + + #[test] + fn exact_only_mode_no_correction() { + let f = write_wl(&["ACGT"]); + let wl = CbWhitelist::load(f.path()).unwrap(); + let t = CbMatchType::from_str("Exact").unwrap(); + assert_eq!(wl.match_cb(&enc("ACGA"), b"IIII", t), CbMatch::NoMatch); + } + + #[test] + fn umi_checks() { + assert!(matches!(check_umi(&enc("ACGTAC")), UmiCheck::Ok(_))); + assert_eq!(check_umi(&enc("ACGTNC")), UmiCheck::NinUmi); + assert_eq!(check_umi(&enc("AAAAAA")), UmiCheck::Homopolymer); + assert_eq!(check_umi(&enc("TTTTTT")), UmiCheck::Homopolymer); + } + + #[test] + fn whitelist_length_mismatch_errors() { + let f = write_wl(&["AAAA", "TTT"]); + assert!(CbWhitelist::load(f.path()).is_err()); + } + + #[test] + fn whitelist_gzip_load() { + use flate2::Compression; + use flate2::write::GzEncoder; + let f = tempfile::Builder::new().suffix(".gz").tempfile().unwrap(); + let mut enc = GzEncoder::new(f.as_file(), Compression::default()); + writeln!(enc, "AAAA\nACGT\nTTTT").unwrap(); + enc.finish().unwrap(); + let wl = CbWhitelist::load(f.path()).unwrap(); + assert_eq!(wl.len(), 3); + } + + #[test] + fn match_type_parsing() { + assert!(!CbMatchType::from_str("Exact").unwrap().mm1); + assert!(CbMatchType::from_str("1MM").unwrap().mm1); + assert!(!CbMatchType::from_str("1MM").unwrap().mm1_multi); + assert!(CbMatchType::from_str("1MM_multi").unwrap().mm1_multi); + let n = CbMatchType::from_str("1MM_multi_Nbase_pseudocounts").unwrap(); + assert!(n.mm1_multi_nbase && n.pseudocounts); + assert!(CbMatchType::from_str("bogus").is_err()); + } +} diff 
--git a/test/Dockerfile.bench b/test/Dockerfile.bench new file mode 100644 index 0000000..f6397e0 --- /dev/null +++ b/test/Dockerfile.bench @@ -0,0 +1,15 @@ +# amd64 Linux image to benchmark CellRanger vs STARsolo vs rustar-aligner in a +# consistent environment. CellRanger is x86_64-only, so everything runs under +# linux/amd64 (Rosetta-accelerated on Apple Silicon) for a fair comparison. +# +# CellRanger itself is mounted at runtime (not baked in) from the extracted +# cellranger-10.0.0/ directory. +FROM --platform=linux/amd64 rust:1-bookworm + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + rna-star python3 samtools procps time pigz ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +RUN STAR --version && cargo --version && python3 --version +WORKDIR /work diff --git a/test/Dockerfile.solodiff b/test/Dockerfile.solodiff new file mode 100644 index 0000000..f6a5822 --- /dev/null +++ b/test/Dockerfile.solodiff @@ -0,0 +1,19 @@ +# Linux environment to run the STARsolo CellRanger differential test in a +# consistent way (real STAR works on Linux; the macOS build has a read bug). +# +# Build: docker build -f test/Dockerfile.solodiff -t rustar-solodiff . +# Run: docker run --rm -v "$PWD":/work -w /work \ +# -e CARGO_TARGET_DIR=/tmp/ct rustar-solodiff \ +# bash -c "cargo build --release && \ +# python3 test/solo_cellranger_diff.py \ +# --star \$(which STAR) --rustar /tmp/ct/release/rustar-aligner" +FROM rust:1-bookworm + +RUN apt-get update \ + && apt-get install -y --no-install-recommends rna-star python3 ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Report tool versions at build time for the record. +RUN STAR --version && cargo --version && python3 --version + +WORKDIR /work diff --git a/test/solo_bench.py b/test/solo_bench.py new file mode 100644 index 0000000..7f35c07 --- /dev/null +++ b/test/solo_bench.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +"""Runtime + output-stats benchmark: CellRanger vs STARsolo vs rustar-aligner. 
+ +Runs inside the amd64 benchmark container (test/Dockerfile.bench) so all three +tools run in one consistent Linux/x86_64 environment. Mouse GRCm39-2024-A +reference (built from the CellRanger refdata fasta+gtf for STAR/rust; CellRanger +uses the refdata directly), 5' GEM-X chemistry. + +Each step is wall-clock + peak-RSS timed via /usr/bin/time -v. Output stats are +read from each tool's raw matrix (+ CellRanger metrics_summary.csv). + +Usage (inside container): + python3 test/solo_bench.py \ + --fasta REF/genome.fa --gtf REF/genes.gtf \ + --whitelist WL.txt --r1 R1.fq --r2 R2.fq \ + --cellranger /work/bench/cellranger-10.0.0/cellranger \ + --transcriptome /work/bench/refdata-gex-GRCm39-2024-A \ + --sample 5k_Mouse_PBMCs_5p_gem-x_GEX --fastqdir /work/bench/gex \ + --rustar /work/target-linux/release/rustar-aligner \ + --star $(which STAR) --threads 14 --mem-gb 36 --out /work/bench/results +""" +import argparse +import csv +import gzip +import json +import os +import re +import subprocess +import sys +import time + +# CellRanger 4/5-matching solo flags (3' clip omitted; 5' chemistry). 
+SOLO_COMMON = [ + "--soloType", "CB_UMI_Simple", + "--soloCBstart", "1", "--soloCBlen", "16", + "--soloUMIstart", "17", "--soloUMIlen", "12", + "--soloFeatures", "Gene", + "--soloStrand", "Reverse", # 5' GEX (SC5P-R2 strandedness "-") + "--soloCBmatchWLtype", "1MM_multi_Nbase_pseudocounts", + "--soloUMIfiltering", "MultiGeneUMI_CR", + "--soloUMIdedup", "1MM_CR", +] + +TIME = ["/usr/bin/time", "-v"] + + +def timed(cmd, logpath, env=None, cwd=None): + """Run cmd under /usr/bin/time -v; return (seconds, peak_rss_gb, ok).""" + print(" $", " ".join(str(c) for c in cmd), flush=True) + t0 = time.time() + with open(logpath, "w") as lf: + r = subprocess.run(TIME + list(map(str, cmd)), stdout=lf, stderr=subprocess.STDOUT, env=env, cwd=cwd) + wall = time.time() - t0 + peak = None + with open(logpath) as lf: + txt = lf.read() + m = re.search(r"Maximum resident set size \(kbytes\):\s*(\d+)", txt) + if m: + peak = int(m.group(1)) / 1024 / 1024 # KB -> GB (GNU time reports KB) + if r.returncode != 0: + print(f" !! 
exit {r.returncode}; tail:\n" + "\n".join(txt.splitlines()[-15:])) + return wall, peak, r.returncode == 0 + + +def index_built(idx_dir): + """True if a genome index already exists in idx_dir (skip rebuild/reuse).""" + return os.path.exists(os.path.join(idx_dir, "Genome")) or os.path.exists( + os.path.join(idx_dir, "SA") + ) + + +def opener(path): + return gzip.open(path, "rt") if path.endswith(".gz") else open(path) + + +def matrix_stats(raw_dir): + """Read a MatrixMarket raw dir -> {n_barcodes_with_counts, total_umi, n_genes_detected}.""" + mtx = None + for name in ("matrix.mtx.gz", "matrix.mtx"): + p = os.path.join(raw_dir, name) + if os.path.exists(p): + mtx = p + break + if not mtx: + return None + cells, genes, total = set(), set(), 0 + with opener(mtx) as f: + header_done = False + for line in f: + if line.startswith("%"): + continue + if not header_done: + header_done = True # dims line + continue + parts = line.split() + if len(parts) < 3: + continue + g, c, v = int(parts[0]), int(parts[1]), int(float(parts[2])) + if v > 0: + genes.add(g) + cells.add(c) + total += v + return {"n_barcodes_with_counts": len(cells), "n_genes_detected": len(genes), "total_umi": total} + + +def cellranger_metrics(outs_dir): + p = os.path.join(outs_dir, "metrics_summary.csv") + if not os.path.exists(p): + return {} + with open(p) as f: + rows = list(csv.reader(f)) + if len(rows) >= 2: + return dict(zip(rows[0], rows[1])) + return {} + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--fasta", required=True) + ap.add_argument("--gtf", required=True) + ap.add_argument("--whitelist", required=True) + ap.add_argument("--r1", required=True) + ap.add_argument("--r2", required=True) + ap.add_argument("--cellranger", required=True) + ap.add_argument("--transcriptome", required=True) + ap.add_argument("--sample", required=True) + ap.add_argument("--fastqdir", required=True) + ap.add_argument("--rustar", required=True) + ap.add_argument("--star", default="STAR") + 
ap.add_argument("--threads", type=int, default=14) + ap.add_argument("--mem-gb", type=int, default=36) + ap.add_argument("--out", required=True) + ap.add_argument("--sa-nbases", default="14") + ap.add_argument("--chemistry", default="auto", help="CellRanger --chemistry") + ap.add_argument("--rust-temp-dir", default=None, + help="rustar --tempDir (caps-sa scratch; point at a disk with space)") + ap.add_argument("--skip", default="", help="comma list: cellranger,star,rustar") + args = ap.parse_args() + + os.makedirs(args.out, exist_ok=True) + logs = os.path.join(args.out, "logs") + os.makedirs(logs, exist_ok=True) + skip = set(s.strip() for s in args.skip.split(",") if s.strip()) + results = {} + + # ---- STARsolo ------------------------------------------------------- + if "star" not in skip: + print("\n===== STARsolo =====") + star_idx = os.path.join(args.out, "star_idx") + os.makedirs(star_idx, exist_ok=True) + if index_built(star_idx): + print(" (STAR index already present — skipping genomeGenerate)") + s_gen, s_gen_rss, ok = 0.0, 0.0, True + else: + s_gen, s_gen_rss, ok = timed( + [args.star, "--runMode", "genomeGenerate", "--genomeDir", star_idx, + "--genomeFastaFiles", args.fasta, "--sjdbGTFfile", args.gtf, + "--sjdbOverhang", "89", "--genomeSAindexNbases", args.sa_nbases, + "--runThreadN", args.threads], + os.path.join(logs, "star_genomeGenerate.log")) + star_out = os.path.join(args.out, "star_out") + "/" + os.makedirs(star_out, exist_ok=True) + gz = ["--readFilesCommand", "zcat"] if args.r1.endswith(".gz") else [] + s_run, s_run_rss, ok2 = timed( + [args.star, "--genomeDir", star_idx, "--readFilesIn", args.r2, args.r1, + "--runThreadN", args.threads, "--outSAMtype", "None"] + gz + + ["--soloCBwhitelist", args.whitelist, "--outFileNamePrefix", star_out] + + SOLO_COMMON, + os.path.join(logs, "star_solo.log")) + raw = os.path.join(star_out, "Solo.out", "Gene", "raw") + results["STARsolo"] = { + "index_build_s": round(s_gen, 1), "index_build_rss_gb": 
round(s_gen_rss or 0, 2), + "count_s": round(s_run, 1), "count_rss_gb": round(s_run_rss or 0, 2), + "stats": matrix_stats(raw), "ok": ok and ok2, + } + + # ---- rustar-aligner ------------------------------------------------- + if "rustar" not in skip: + print("\n===== rustar-aligner =====") + rust_idx = os.path.join(args.out, "rust_idx") + os.makedirs(rust_idx, exist_ok=True) + if index_built(rust_idx): + print(" (rustar index already present — skipping genomeGenerate)") + r_gen, r_gen_rss, ok = 0.0, 0.0, True + else: + tmp = ["--tempDir", args.rust_temp_dir] if args.rust_temp_dir else [] + r_gen, r_gen_rss, ok = timed( + [args.rustar, "--runMode", "genomeGenerate", "--genomeDir", rust_idx, + "--genomeFastaFiles", args.fasta, "--sjdbGTFfile", args.gtf, + "--sjdbOverhang", "89", "--genomeSAindexNbases", args.sa_nbases, + "--runThreadN", args.threads] + tmp, + os.path.join(logs, "rustar_genomeGenerate.log")) + rust_out = os.path.join(args.out, "rust_out") + "/" + os.makedirs(rust_out, exist_ok=True) + r_run, r_run_rss, ok2 = timed( + [args.rustar, "--genomeDir", rust_idx, "--readFilesIn", args.r2, args.r1, + "--sjdbGTFfile", args.gtf, "--runThreadN", args.threads, + "--outSAMtype", "SAM", + "--soloCBwhitelist", args.whitelist, "--outFileNamePrefix", rust_out] + + SOLO_COMMON, + os.path.join(logs, "rustar_solo.log")) + raw = os.path.join(rust_out, "Solo.out", "Gene", "raw") + results["rustar-aligner"] = { + "index_build_s": round(r_gen, 1), "index_build_rss_gb": round(r_gen_rss or 0, 2), + "count_s": round(r_run, 1), "count_rss_gb": round(r_run_rss or 0, 2), + "stats": matrix_stats(raw), "ok": ok and ok2, + } + + # ---- CellRanger ----------------------------------------------------- + if "cellranger" not in skip: + print("\n===== CellRanger =====") + cr_dir = os.path.join(args.out, "cr") + # cellranger count writes to ./; run in args.out + if os.path.exists(os.path.join(args.out, "cr_run")): + subprocess.run(["rm", "-rf", os.path.join(args.out, "cr_run")]) + c_run, 
c_rss, ok = timed( + [args.cellranger, "count", "--id", "cr_run", + "--transcriptome", args.transcriptome, + "--fastqs", args.fastqdir, "--sample", args.sample, + "--chemistry", args.chemistry, + "--create-bam", "false", "--nosecondary", + "--localcores", str(args.threads), "--localmem", str(args.mem_gb)], + os.path.join(logs, "cellranger_count.log"), + env={**os.environ}, cwd=args.out) + outs = os.path.join(args.out, "cr_run", "outs") + raw = os.path.join(outs, "raw_feature_bc_matrix") + results["CellRanger"] = { + "count_s": round(c_run, 1), "count_rss_gb": round(c_rss or 0, 2), + "stats": matrix_stats(raw), + "metrics": cellranger_metrics(outs), "ok": ok, + } + + # ---- report --------------------------------------------------------- + with open(os.path.join(args.out, "benchmark.json"), "w") as f: + json.dump(results, f, indent=2) + + print("\n================ BENCHMARK SUMMARY ================") + hdr = f"{'tool':<16}{'idx build(s)':>14}{'count(s)':>11}{'peak RSS(GB)':>14}{'barcodes':>10}{'genes':>8}{'total UMI':>12}" + print(hdr) + print("-" * len(hdr)) + for tool, r in results.items(): + st = r.get("stats") or {} + idx = r.get("index_build_s", "-") + peak = max(r.get("index_build_rss_gb", 0) or 0, r.get("count_rss_gb", 0) or 0) + print(f"{tool:<16}{str(idx):>14}{str(r.get('count_s','-')):>11}{peak:>14.2f}" + f"{str(st.get('n_barcodes_with_counts','-')):>10}" + f"{str(st.get('n_genes_detected','-')):>8}{str(st.get('total_umi','-')):>12}") + if "CellRanger" in results and results["CellRanger"].get("metrics"): + m = results["CellRanger"]["metrics"] + keys = ["Estimated Number of Cells", "Mean Reads per Cell", "Median Genes per Cell", + "Median UMI Counts per Cell", "Reads Mapped Confidently to Transcriptome"] + print("\nCellRanger reported metrics:") + for k in keys: + if k in m: + print(f" {k}: {m[k]}") + print(f"\nFull results: {os.path.join(args.out, 'benchmark.json')}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git 
a/test/solo_cellranger_diff.py b/test/solo_cellranger_diff.py new file mode 100644 index 0000000..c66270e --- /dev/null +++ b/test/solo_cellranger_diff.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 +"""Differential test: rustar-aligner STARsolo vs real STAR, CellRanger-style run. + +Generates a small synthetic 10x-style dataset (genome + GTF + whitelist + cDNA +read + barcode read), runs BOTH STAR and rustar-aligner with the +CellRanger-4/5-matching solo flags from +https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#matching-cellranger-4xx-and-5xx-results +and compares the raw Gene count matrices decoded to {(barcode, gene_id): count}. + +Usage: + python3 test/solo_cellranger_diff.py [--star /path/to/STAR] [--rustar /path/to/rustar-aligner] [--keep] + +Exit code 0 = matrices match, 1 = mismatch / error. +""" +import argparse +import os +import random +import shutil +import subprocess +import sys +import tempfile + +# CellRanger 4.x/5.x matching flags (STARsolo.md). +CELLRANGER_FLAGS = [ + "--clipAdapterType", "CellRanger4", + "--outFilterScoreMin", "30", + "--soloCBmatchWLtype", "1MM_multi_Nbase_pseudocounts", + "--soloUMIfiltering", "MultiGeneUMI_CR", + "--soloUMIdedup", "1MM_CR", +] + +CB_LEN = 16 +UMI_LEN = 12 +READ_LEN = 90 +BASES = "ACGT" + + +def rand_seq(rng, n): + return "".join(rng.choice(BASES) for _ in range(n)) + + +# Two-exon gene layout (0-based): exon1 [s, s+150), intron [s+150, s+400) with +# canonical GT..AG, exon2 [s+400, s+550). Multi-exon genes give STAR a non-empty +# splice-junction DB, which it needs to set up the solo Transcriptome directory. 
+GENE_A_START = 10000 +GENE_B_START = 30000 + + +def _plant_gene(g, s, rng): + g[s : s + 150] = list(rand_seq(rng, 150)) # exon1 + g[s + 150 : s + 400] = list(rand_seq(rng, 250)) # intron body + g[s + 150], g[s + 151] = "G", "T" # donor + g[s + 398], g[s + 399] = "A", "G" # acceptor + g[s + 400 : s + 550] = list(rand_seq(rng, 150)) # exon2 + + +def build_genome(rng, length=50000): + g = list(rand_seq(rng, length)) + _plant_gene(g, GENE_A_START, rng) + _plant_gene(g, GENE_B_START, rng) + return "".join(g) + + +def pick_window(genome, exon_start): + """Pick a READ_LEN window inside exon1 ending in a non-A base (so the + CellRanger4 polyA trim is a guaranteed no-op for both tools). The window + stays inside the 150 bp exon1, so reads never span the junction.""" + a = exon_start + 20 + while genome[a + READ_LEN - 1] == "A": + a += 1 + return genome[a : a + READ_LEN] + + +def write_files(d, genome): + fa = os.path.join(d, "genome.fa") + with open(fa, "w") as f: + f.write(">chr1\n") + for i in range(0, len(genome), 70): + f.write(genome[i : i + 70] + "\n") + + gtf = os.path.join(d, "genes.gtf") + with open(gtf, "w") as f: + # Two exons per gene (1-based inclusive), matching the planted layout. + f.write('chr1\tsrc\texon\t10001\t10150\t.\t+\t.\tgene_id "GENEA"; transcript_id "GENEA.1"; gene_name "GeneA";\n') + f.write('chr1\tsrc\texon\t10401\t10550\t.\t+\t.\tgene_id "GENEA"; transcript_id "GENEA.1"; gene_name "GeneA";\n') + f.write('chr1\tsrc\texon\t30001\t30150\t.\t+\t.\tgene_id "GENEB"; transcript_id "GENEB.1"; gene_name "GeneB";\n') + f.write('chr1\tsrc\texon\t30401\t30550\t.\t+\t.\tgene_id "GENEB"; transcript_id "GENEB.1"; gene_name "GeneB";\n') + + wl = os.path.join(d, "whitelist.txt") + cbs = ["AAAACCCCGGGGTTTT", "ACACACACGTGTGTGT", "TTTTGGGGCCCCAAAA", "GTGTGTGTACACACAC"] + with open(wl, "w") as f: + f.write("\n".join(cbs) + "\n") + + readA = pick_window(genome, 10000) + readB = pick_window(genome, 30000) + + # (cell, gene-read, umi, n_reads). 
Designed to exercise: + # - exact CB match (all CBs in whitelist) + # - 1MM_CR UMI collapse: ACGTACGTACGT (5) + ACGTACGTACGA (1) -> 1 molecule + # - distinct molecules counted, two genes, two cells. + plan = [ + (cbs[0], readA, "ACGTACGTACGT", 5), + (cbs[0], readA, "ACGTACGTACGA", 1), # 1MM neighbor of the above + (cbs[0], readA, "TGCATGCATGCA", 3), # separate molecule + (cbs[0], readB, "GGGGTTTTAACC", 2), # GeneB, cell0 + (cbs[1], readA, "CATGCATGCATG", 4), # GeneA, cell1 + ] + # Expected decoded matrix. + expected = { + ("AAAACCCCGGGGTTTT", "GENEA"): 2, # two molecules (1MM pair collapses) + ("AAAACCCCGGGGTTTT", "GENEB"): 1, + ("ACACACACGTGTGTGT", "GENEA"): 1, + } + + cdna = os.path.join(d, "cdna.fq") + bc = os.path.join(d, "barcode.fq") + ci = 0 + with open(cdna, "w") as cf, open(bc, "w") as bf: + for (cb, read, umi, n) in plan: + for _ in range(n): + name = f"read{ci}" + ci += 1 + cf.write(f"@{name}\n{read}\n+\n{'I' * READ_LEN}\n") + barcode = cb + umi + bf.write(f"@{name}\n{barcode}\n+\n{'I' * len(barcode)}\n") + return fa, gtf, wl, cdna, bc, expected + + +def run(cmd, **kw): + print(" $", " ".join(str(c) for c in cmd)) + r = subprocess.run(cmd, capture_output=True, text=True, **kw) + if r.returncode != 0: + print(r.stdout[-2000:]) + print(r.stderr[-4000:]) + raise SystemExit(f"command failed ({r.returncode}): {cmd[0]}") + return r + + +def run_star(star, d, fa, gtf, wl, cdna, bc): + # Generate WITH the GTF so geneInfo.tab lands in the index, then reset the + # recorded sjdbGTFfile to "-" in genomeParameters.txt. STAR's solo + # Transcriptome uses `trInfoDir = sjdbGTFfile=="-" ? genomeDir : sjdbInsert.outDir` + # (Transcriptome.cpp:18); with the path still recorded it points at an empty + # insert dir and fails with "/geneInfo.tab". Resetting to "-" makes it read + # geneInfo.tab from the genome dir. (The gene model is intact in the index.) 
+ idx = os.path.join(d, "star_index") + os.makedirs(idx, exist_ok=True) + run([star, "--runMode", "genomeGenerate", "--genomeDir", idx, + "--genomeFastaFiles", fa, "--sjdbGTFfile", gtf, + "--genomeSAindexNbases", "7", "--sjdbOverhang", "89"]) + gp = os.path.join(idx, "genomeParameters.txt") + lines = open(gp).read().splitlines() + with open(gp, "w") as f: + for ln in lines: + if ln.startswith("sjdbGTFfile\t"): + f.write("sjdbGTFfile\t-\n") + else: + f.write(ln + "\n") + + out = os.path.join(d, "star_out") + os.sep + run([star, "--genomeDir", idx, "--readFilesIn", cdna, bc, + "--soloType", "CB_UMI_Simple", "--soloCBwhitelist", wl, + "--soloCBstart", "1", "--soloCBlen", str(CB_LEN), + "--soloUMIstart", str(CB_LEN + 1), "--soloUMIlen", str(UMI_LEN), + "--soloFeatures", "Gene", "--outSAMtype", "SAM", + "--outFileNamePrefix", out] + CELLRANGER_FLAGS) + # Guard against a STAR binary that silently reads 0 reads (broken bottle); compare the parsed count after "|" so totals such as 10 or 200 are not mistaken for 0. + log = os.path.join(out, "Log.final.out") + if os.path.exists(log): + for ln in open(log): + if "Number of input reads" in ln and ln.split("|")[-1].strip() == "0": + raise SystemExit( + "STAR processed 0 input reads — the STAR binary appears broken " + "on this machine (immediate EOF on FASTQ input). Install a working " + "STAR and re-run with --star /path/to/STAR." 
+ ) + return os.path.join(out, "Solo.out", "Gene", "raw") + + +def run_rustar(rustar, d, fa, gtf, wl, cdna, bc): + idx = os.path.join(d, "rustar_index") + os.makedirs(idx, exist_ok=True) + run([rustar, "--runMode", "genomeGenerate", "--genomeDir", idx, + "--genomeFastaFiles", fa, "--sjdbGTFfile", gtf, + "--genomeSAindexNbases", "7", "--sjdbOverhang", "89"]) + out = os.path.join(d, "rustar_out") + os.sep + run([rustar, "--genomeDir", idx, "--readFilesIn", cdna, bc, + "--soloType", "CB_UMI_Simple", "--soloCBwhitelist", wl, + "--soloCBstart", "1", "--soloCBlen", str(CB_LEN), + "--soloUMIstart", str(CB_LEN + 1), "--soloUMIlen", str(UMI_LEN), + "--soloFeatures", "Gene", "--sjdbGTFfile", gtf, + "--outSAMtype", "SAM", + "--outFileNamePrefix", out] + CELLRANGER_FLAGS) + return os.path.join(out, "Solo.out", "Gene", "raw") + + +def decode_matrix(raw_dir): + """Decode raw/{matrix.mtx,barcodes.tsv,features.tsv} -> {(barcode, gene_id): count}.""" + feats = [] + with open(os.path.join(raw_dir, "features.tsv")) as f: + for line in f: + feats.append(line.rstrip("\n").split("\t")[0]) + barcodes = [] + with open(os.path.join(raw_dir, "barcodes.tsv")) as f: + for line in f: + barcodes.append(line.strip()) + out = {} + with open(os.path.join(raw_dir, "matrix.mtx")) as f: + lines = [l for l in f if not l.startswith("%")] + # first non-% line is dims + for entry in lines[1:]: + parts = entry.split() + if len(parts) < 3: + continue + row, col, cnt = int(parts[0]), int(parts[1]), int(float(parts[2])) + out[(barcodes[col - 1], feats[row - 1])] = cnt + return out + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--star", default=shutil.which("STAR") or "/opt/homebrew/bin/STAR") + ap.add_argument("--rustar", default=None) + ap.add_argument("--keep", action="store_true") + ap.add_argument("--seed", type=int, default=20260612) + args = ap.parse_args() + + repo = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + if args.rustar: + # Honor an explicit path exactly — 
never silently fall back to a + # different (possibly foreign-arch) binary. + rustar = args.rustar + if not os.path.exists(rustar): + raise SystemExit(f"--rustar binary not found: {rustar}") + else: + rustar = os.path.join(repo, "target", "release", "rustar-aligner") + if not os.path.exists(rustar): + rustar = os.path.join(repo, "target", "debug", "rustar-aligner") + if not os.path.exists(rustar): + raise SystemExit( + "rustar-aligner binary not found — build it first (cargo build [--release]) " + "or pass --rustar /path/to/rustar-aligner" + ) + if not (args.star and os.path.exists(args.star)): + raise SystemExit(f"STAR binary not found: {args.star}") + + d = tempfile.mkdtemp(prefix="solo_diff_") + print(f"workdir: {d}") + print(f"STAR: {args.star}") + print(f"rustar: {rustar}") + rng = random.Random(args.seed) + try: + genome = build_genome(rng) + fa, gtf, wl, cdna, bc, expected = write_files(d, genome) + + print("\n== rustar-aligner ==") + rustar_raw = run_rustar(rustar, d, fa, gtf, wl, cdna, bc) + rustar_m = decode_matrix(rustar_raw) + + print("\n== expected (hand-computed CellRanger result) ==") + for k, v in sorted(expected.items()): + print(f" {k} = {v}") + print("== rustar matrix ==") + for k, v in sorted(rustar_m.items()): + print(f" {k} = {v}") + + # Core guarantee: rustar's CellRanger-style matrix matches the expectation. + if rustar_m != expected: + print("\nFAIL: rustar matrix does not match the expected CellRanger result:") + for k in sorted(set(rustar_m) | set(expected)): + if rustar_m.get(k) != expected.get(k): + print(f" {k}: rustar={rustar_m.get(k)} expected={expected.get(k)}") + return 1 + print("\nrustar matrix matches the expected CellRanger result.") + + # Live comparison against the real STAR binary, when it works on this host. 
+ print("\n== STAR ==") + try: + star_raw = run_star(args.star, d, fa, gtf, wl, cdna, bc) + star_m = decode_matrix(star_raw) + except SystemExit as e: + print(f"\nSTAR could not run a live comparison on this host: {e}") + print("PASS (rustar validated against the CellRanger expectation; " + "run on a host with a working STAR for the live diff).") + return 0 + print("== STAR matrix ==") + for k, v in sorted(star_m.items()): + print(f" {k} = {v}") + if star_m == rustar_m: + print("\nPASS: rustar-aligner matrix matches real STARsolo exactly.") + return 0 + print("\nFAIL: rustar vs STAR mismatch:") + for k in sorted(set(star_m) | set(rustar_m)): + if star_m.get(k) != rustar_m.get(k): + print(f" {k}: STAR={star_m.get(k)} rustar={rustar_m.get(k)}") + return 1 + finally: + if args.keep: + print(f"(kept workdir {d})") + else: + shutil.rmtree(d, ignore_errors=True) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/solo_compare_h5ad.py b/test/solo_compare_h5ad.py new file mode 100644 index 0000000..99b3b9f --- /dev/null +++ b/test/solo_compare_h5ad.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +"""Knee-call + compare CellRanger / STARsolo / rustar-aligner raw matrices. + +For a fair comparison that isolates *counting* differences from *cell-calling* +differences, the SAME knee filter (CellRanger 2.2 — STARsolo's default +--soloCellFilter) is applied to each tool's RAW matrix. Each filtered result is +written as an .h5ad (AnnData, cells x genes) and the three are compared: +n cells, median UMI/genes per cell, barcode overlap, per-cell UMI correlation on +shared barcodes, and gene-level pseudobulk correlation. 
+ +Usage: + .venv/bin/python test/solo_compare_h5ad.py \ + --cellranger \ + --starsolo \ + --rustar \ + --out +""" +import argparse +import gzip +import json +import os +import sys + +import anndata as ad +import numpy as np +import pandas as pd +import scipy.io +import scipy.sparse as sp + + +def _find(d, base): + for c in (base, base + ".gz"): + p = os.path.join(d, c) + if os.path.exists(p): + return p + raise FileNotFoundError(f"{base}[.gz] not found in {d}") + + +def _open_text(p): + return gzip.open(p, "rt") if p.endswith(".gz") else open(p) + + +def load_raw(d): + """Load a 10x/STARsolo raw matrix dir -> (X cells x genes CSR, barcodes, gene_ids).""" + mp = _find(d, "matrix.mtx") + handle = gzip.open(mp, "rb") if mp.endswith(".gz") else open(mp, "rb") + with handle: + m = scipy.io.mmread(handle) # features x barcodes + X = sp.csr_matrix(m).T.tocsr() # -> barcodes (cells) x features (genes) + barcodes = np.array([l.split("\t")[0].strip() for l in _open_text(_find(d, "barcodes.tsv"))]) + genes = np.array([l.split("\t")[0].strip() for l in _open_text(_find(d, "features.tsv"))]) + return X, barcodes, genes + + +def norm_bc(bc): + """Strip 10x '-1' gem-group suffix so barcodes are comparable across tools.""" + return np.array([b.split("-")[0] for b in bc]) + + +def revcomp(s): + t = str.maketrans("ACGT", "TGCA") + return s.translate(t)[::-1] + + +def knee_cr22(totals, n_expected=3000, max_pct=0.99, max_min_ratio=10): + """CellRanger-2.2 knee threshold on per-barcode totals (STARsolo default).""" + counts = np.sort(totals[totals > 0])[::-1] + if counts.size == 0: + return 0.0 + idx = min(int(round(n_expected * (1 - max_pct))), counts.size - 1) + robust_max = counts[idx] + return robust_max / max_min_ratio + + +def load_cell_set(path): + """Load an EmptyDrops cells.txt (one barcode/line) -> normalized set, or None.""" + if not path or not os.path.exists(path): + return None + with _open_text(path) as fh: + return set(l.split("\t")[0].split("-")[0].strip() for l in fh 
if l.strip()) + + +def build_filtered(name, raw_dir, rc_barcodes=False, cells=None): + """Filter a raw matrix to called cells. If `cells` (a normalized barcode set, + e.g. from EmptyDrops) is given, keep exactly those; otherwise CR2.2 knee.""" + X, bc, genes = load_raw(raw_dir) + bc = norm_bc(bc) + if rc_barcodes: + bc = np.array([revcomp(b) for b in bc]) + totals = np.asarray(X.sum(axis=1)).ravel() + if cells is not None: + thr = -1.0 + keep = np.array([b in cells for b in bc]) + else: + thr = knee_cr22(totals) + keep = totals >= thr + Xf = X[keep] + bcf = bc[keep] + A = ad.AnnData(X=Xf, obs=pd.DataFrame(index=bcf), var=pd.DataFrame(index=genes)) + A.obs["n_umi"] = np.asarray(Xf.sum(axis=1)).ravel() + A.obs["n_genes"] = np.asarray((Xf > 0).sum(axis=1)).ravel() + return A, thr + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--cellranger", required=True) + ap.add_argument("--starsolo", required=True) + ap.add_argument("--rustar", required=True) + ap.add_argument("--out", required=True) + # Optional EmptyDrops cells.txt per tool; when given, filter by these calls + # instead of the CR2.2 knee (CellRanger uses its own filtered barcodes). + ap.add_argument("--rustar-cells") + ap.add_argument("--starsolo-cells") + ap.add_argument("--cellranger-cells") + args = ap.parse_args() + os.makedirs(args.out, exist_ok=True) + + r_cells = load_cell_set(args.rustar_cells) + s_cells = load_cell_set(args.starsolo_cells) + c_cells = load_cell_set(args.cellranger_cells) + + # Build STARsolo / rustar first; detect whether CellRanger barcodes need RC + # (some 5' chemistries report the reverse complement). 
+ star, star_thr = build_filtered("STARsolo", args.starsolo, cells=s_cells) + rust, rust_thr = build_filtered("rustar", args.rustar, cells=r_cells) + + cr_plain, _ = build_filtered("CellRanger", args.cellranger, rc_barcodes=False, cells=c_cells) + ov_plain = len(set(cr_plain.obs_names) & set(star.obs_names)) + cr_rc, _ = build_filtered("CellRanger", args.cellranger, rc_barcodes=True, cells=c_cells) + ov_rc = len(set(cr_rc.obs_names) & set(star.obs_names)) + cr = cr_rc if ov_rc > ov_plain else cr_plain + cr_orient = "reverse-complement" if ov_rc > ov_plain else "as-reported" + + objs = {"CellRanger": cr, "STARsolo": star, "rustar-aligner": rust} + for name, A in objs.items(): + path = os.path.join(args.out, f"{name.replace('-aligner','')}.filtered.h5ad") + A.write_h5ad(path) + + print(f"\nCellRanger barcode orientation vs STARsolo: {cr_orient} " + f"(overlap as-reported={ov_plain}, rc={ov_rc})") + + # ---- per-tool summary ---- + print("\n================ filtered (CR2.2 knee) summary ================") + hdr = f"{'tool':<16}{'cells':>8}{'median UMI/cell':>17}{'median genes/cell':>19}{'genes detected':>16}{'total UMI':>12}" + print(hdr); print("-" * len(hdr)) + rows = {} + for name, A in objs.items(): + med_umi = int(np.median(A.obs["n_umi"])) if A.n_obs else 0 + med_g = int(np.median(A.obs["n_genes"])) if A.n_obs else 0 + genes_det = int((np.asarray(A.X.sum(axis=0)).ravel() > 0).sum()) + tot = int(A.X.sum()) + rows[name] = dict(cells=A.n_obs, median_umi=med_umi, median_genes=med_g, + genes_detected=genes_det, total_umi=tot) + print(f"{name:<16}{A.n_obs:>8}{med_umi:>17}{med_g:>19}{genes_det:>16}{tot:>12}") + + # ---- barcode overlap (called-cell sets) ---- + sets = {n: set(A.obs_names) for n, A in objs.items()} + names = list(objs) + print("\n================ called-cell barcode overlap ================") + allc = sets[names[0]] & sets[names[1]] & sets[names[2]] + print(f"shared by all 3: {len(allc)}") + for i in range(len(names)): + for j in range(i + 1, 
len(names)): + a, b = names[i], names[j] + inter = len(sets[a] & sets[b]); uni = len(sets[a] | sets[b]) + print(f" {a} ∩ {b}: {inter} (Jaccard {inter/uni:.3f})") + + # ---- correlations on shared cells & genes ---- + print("\n================ agreement on shared cells/genes ================") + shared_genes = list(set(cr.var_names) & set(star.var_names) & set(rust.var_names)) + common_cells = sorted(allc) + corr = {} + if common_cells and shared_genes: + # per-cell total UMI vectors (aligned to common cells) + def cell_totals(A): + idx = [A.obs_names.get_loc(c) for c in common_cells] + return np.asarray(A[idx].X.sum(axis=1)).ravel() + tot = {n: cell_totals(A) for n, A in objs.items()} + # pseudobulk per gene (sum over shared cells), aligned to shared genes + def pseudobulk(A): + idx = [A.obs_names.get_loc(c) for c in common_cells] + gi = [A.var_names.get_loc(g) for g in shared_genes] + return np.asarray(A[idx][:, gi].X.sum(axis=0)).ravel() + pb = {n: pseudobulk(A) for n, A in objs.items()} + for i in range(len(names)): + for j in range(i + 1, len(names)): + a, b = names[i], names[j] + rc_cell = np.corrcoef(tot[a], tot[b])[0, 1] + rc_gene = np.corrcoef(pb[a], pb[b])[0, 1] + corr[f"{a} vs {b}"] = dict(per_cell_umi_r=round(float(rc_cell), 4), + pseudobulk_gene_r=round(float(rc_gene), 4)) + print(f" {a} vs {b}: per-cell UMI r={rc_cell:.4f}, gene pseudobulk r={rc_gene:.4f} " + f"(n_cells={len(common_cells)}, n_genes={len(shared_genes)})") + + out = dict(threshold=dict(STARsolo=star_thr, rustar=rust_thr), + cellranger_orientation=cr_orient, summary=rows, correlations=corr, + shared_all3_cells=len(allc)) + with open(os.path.join(args.out, "compare.json"), "w") as f: + json.dump(out, f, indent=2) + print(f"\nWrote {len(objs)} h5ad files + compare.json to {args.out}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/solo_diff_docker.sh b/test/solo_diff_docker.sh new file mode 100755 index 0000000..dc0c921 --- /dev/null +++ 
b/test/solo_diff_docker.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Run the STARsolo CellRanger differential test (rustar-aligner vs real STAR) in +# a consistent Linux container, so the comparison works regardless of the host +# (the macOS STAR build has a FASTQ-read bug; Linux STAR works). +# +# Requires a Docker-compatible runtime. On macOS without Docker Desktop: +# brew install colima docker && colima start +# +# Usage: test/solo_diff_docker.sh [N_RUNS] +set -euo pipefail + +cd "$(dirname "$0")/.." +RUNS="${1:-1}" +IMAGE=rustar-solodiff + +docker build -f test/Dockerfile.solodiff -t "$IMAGE" . >/dev/null + +# Build rustar for Linux into a host-mounted dir (persisted across runs), then +# run the harness against the Linux STAR + Linux rustar binary. +docker run --rm -v "$PWD":/work -w /work -e CARGO_TARGET_DIR=/work/target-linux "$IMAGE" bash -c ' + set -e + cargo build --release 2>&1 | tail -1 + RUSTAR=/work/target-linux/release/rustar-aligner + STARBIN=$(which STAR) + for i in $(seq 1 '"$RUNS"'); do + echo "===== differential run $i =====" + python3 test/solo_cellranger_diff.py --star "$STARBIN" --rustar "$RUSTAR" + done +' diff --git a/test/solo_genefull_compare.py b/test/solo_genefull_compare.py new file mode 100644 index 0000000..ff1b5c7 --- /dev/null +++ b/test/solo_genefull_compare.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +"""Compare GeneFull (intron-inclusive) quantification across rustar / STARsolo / +CellRanger, plus the EmptyDrops-filtered cell sets. + +Part A (raw count parity): load each tool's raw matrix, report total UMIs, genes +detected, cells with >0 UMI, and the per-cell UMI-total correlation between +rustar-GeneFull and STARsolo-GeneFull (they should match closely) and each vs +CellRanger (whose default raw matrix is intron-inclusive). 
+ +Part B (filtered h5 parity): given EmptyDrops-called barcode lists for each tool +(from the `emptydrops` Rust binary) and CellRanger's own filtered barcodes, +report cell-set overlap (Jaccard) and per-cell UMI agreement on shared cells. + +Usage: + solo_genefull_compare.py \ + --rustar \ + --starsolo \ + --cellranger \ + [--rustar-cells f.txt --starsolo-cells f.txt --cr-cells f.txt] \ + --out compare_genefull.json +""" +import argparse +import gzip +import json +import os +import sys + + +def _open(p): + return gzip.open(p, "rt") if p.endswith(".gz") else open(p) + + +def _find(d, base): + for c in (base, base + ".gz"): + p = os.path.join(d, c) + if os.path.exists(p): + return p + raise FileNotFoundError(f"{base}[.gz] not in {d}") + + +def load_raw(d): + """Return (barcodes list, dict 0-based cell_idx->total_umi, n_genes in features.tsv, total_umi, n_genes with a nonzero count). + + Barcodes are normalized (10x '-1' gem-group suffix stripped) so they are + comparable across tools (CellRanger appends '-1', STARsolo/rustar do not).""" + bcs = [l.split("\t")[0].split("-")[0].strip() for l in _open(_find(d, "barcodes.tsv"))] + genes = [l.split("\t")[0].strip() for l in _open(_find(d, "features.tsv"))] + totals = {} + total_umi = 0 + genes_seen = set() + with _open(_find(d, "matrix.mtx")) as fh: + for line in fh: + if line.startswith("%"): + continue + break # first non-% line is the dims header; skip it + for line in fh: + g, c, v = line.split()[:3] + v = int(float(v)) + if v == 0: + continue + ci = int(c) - 1 + totals[ci] = totals.get(ci, 0) + v + total_umi += v + genes_seen.add(int(g) - 1) + return bcs, totals, len(genes), total_umi, len(genes_seen) + + +def summarize(name, d): + bcs, totals, n_genes, total_umi, genes_detected = load_raw(d) + cells = sum(1 for v in totals.values() if v > 0) + print(f"[{name}] cells>0={cells:,} total_UMI={total_umi:,} " + f"genes_detected={genes_detected:,}/{n_genes:,}") + return {"name": name, "barcodes": bcs, "totals": totals, + "cells_gt0": cells, "total_umi": total_umi, + 
"genes_detected": genes_detected, "n_genes": n_genes} + + +def pearson(xs, ys): + n = len(xs) + if n < 2: + return float("nan") + mx = sum(xs) / n + my = sum(ys) / n + sxy = sum((x - mx) * (y - my) for x, y in zip(xs, ys)) + sxx = sum((x - mx) ** 2 for x in xs) + syy = sum((y - my) ** 2 for y in ys) + if sxx == 0 or syy == 0: + return float("nan") + return sxy / (sxx ** 0.5 * syy ** 0.5) + + +def per_cell_corr(a, b): + """Per-cell UMI-total correlation over the shared barcode set.""" + a_by_bc = {a["barcodes"][i]: t for i, t in a["totals"].items()} + b_by_bc = {b["barcodes"][i]: t for i, t in b["totals"].items()} + shared = sorted(set(a_by_bc) & set(b_by_bc)) + xs = [a_by_bc[bc] for bc in shared] + ys = [b_by_bc[bc] for bc in shared] + r = pearson(xs, ys) + exact = sum(1 for x, y in zip(xs, ys) if x == y) + return {"shared_cells": len(shared), "pearson_r": r, + "exact_total_match": exact, + "exact_frac": exact / len(shared) if shared else float("nan")} + + +def read_cells(p): + if not p or not os.path.exists(p): + return None + return set(l.split("\t")[0].split("-")[0].strip() for l in _open(p) if l.strip()) # normalize like load_raw: drop the 10x '-1' suffix, skip blank lines + + +def jaccard(a, b): + if a is None or b is None: + return None + inter = len(a & b) + union = len(a | b) + return {"a": len(a), "b": len(b), "intersection": inter, + "jaccard": inter / union if union else float("nan"), + "a_only": len(a - b), "b_only": len(b - a)} + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--rustar", required=True) + ap.add_argument("--starsolo", required=True) + ap.add_argument("--cellranger", required=True) + ap.add_argument("--rustar-cells") + ap.add_argument("--starsolo-cells") + ap.add_argument("--cr-cells") + ap.add_argument("--out", default="compare_genefull.json") + a = ap.parse_args() + + print("=== Part A: GeneFull raw count parity ===") + R = summarize("rustar-GeneFull", a.rustar) + S = summarize("STARsolo-GeneFull", a.starsolo) + C = summarize("CellRanger-raw", a.cellranger) + + print("\n=== per-cell UMI-total correlation ===") + 
rs = per_cell_corr(R, S) + print(f"rustar vs STARsolo : shared={rs['shared_cells']:,} r={rs['pearson_r']:.6f} " + f"exact_total={rs['exact_frac']:.4%}") + rc = per_cell_corr(R, C) + print(f"rustar vs CellRgr : shared={rc['shared_cells']:,} r={rc['pearson_r']:.6f}") + sc = per_cell_corr(S, C) + print(f"STAR vs CellRgr : shared={sc['shared_cells']:,} r={sc['pearson_r']:.6f}") + + out = { + "raw": {k: {kk: v[kk] for kk in ("cells_gt0", "total_umi", + "genes_detected", "n_genes")} + for k, v in (("rustar", R), ("starsolo", S), ("cellranger", C))}, + "corr": {"rustar_vs_starsolo": rs, "rustar_vs_cr": rc, "starsolo_vs_cr": sc}, + } + + rcells = read_cells(a.rustar_cells) + scells = read_cells(a.starsolo_cells) + ccells = read_cells(a.cr_cells) + if rcells or ccells: + print("\n=== Part B: filtered cell-set overlap (EmptyDrops / CellRanger) ===") + out["filtered"] = {} + if rcells and ccells: + j = jaccard(rcells, ccells) + print(f"rustar-ED vs CR-filtered : rustar={j['a']:,} CR={j['b']:,} " + f"shared={j['intersection']:,} jaccard={j['jaccard']:.4f}") + out["filtered"]["rustar_vs_cr"] = j + if scells and ccells: + j = jaccard(scells, ccells) + print(f"STAR-ED vs CR-filtered : star={j['a']:,} CR={j['b']:,} " + f"shared={j['intersection']:,} jaccard={j['jaccard']:.4f}") + out["filtered"]["starsolo_vs_cr"] = j + if rcells and scells: + j = jaccard(rcells, scells) + print(f"rustar-ED vs STAR-ED : jaccard={j['jaccard']:.4f}") + out["filtered"]["rustar_vs_starsolo"] = j + + with open(a.out, "w") as fh: + json.dump(out, fh, indent=2, default=str) + print(f"\nwrote {a.out}") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/solo_genefull_h5_compare.py b/test/solo_genefull_h5_compare.py new file mode 100644 index 0000000..d75e60b --- /dev/null +++ b/test/solo_genefull_h5_compare.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +"""GeneFull intron-gap + EmptyDrops-filtered h5 comparison (rustar vs CellRanger). 
+ +Loads matrices one at a time (memory-careful), reports: + A. intron effect — rustar Gene vs GeneFull total UMI (same cells); + B. raw-count parity — rustar GeneFull vs CellRanger raw total UMI / genes; + C. cell-set agreement — rustar EmptyDrops cells vs CellRanger native filtered, + and rustar-ED vs CellRanger-raw+same-EmptyDrops (isolates algorithm); + D. per-cell UMI correlation on the shared filtered cells; + writes rustar.GeneFull.emptydrops.h5ad + CellRanger.filtered.h5ad. +""" +import argparse, gzip, json, os, sys +import numpy as np, scipy.io, scipy.sparse as sp, anndata as ad, pandas as pd + + +def _find(d, base): + for c in (base, base + ".gz"): + p = os.path.join(d, c) + if os.path.exists(p): + return p + raise FileNotFoundError(f"{base}[.gz] in {d}") + + +def _open(p): + return gzip.open(p, "rt") if p.endswith(".gz") else open(p) + + +def load(d): + mp = _find(d, "matrix.mtx") + h = gzip.open(mp, "rb") if mp.endswith(".gz") else open(mp, "rb") + with h: + X = sp.csr_matrix(scipy.io.mmread(h)).T.tocsr() # cells x genes + bc = np.array([l.split("\t")[0].split("-")[0].strip() for l in _open(_find(d, "barcodes.tsv"))]) + genes = np.array([l.split("\t")[0].strip() for l in _open(_find(d, "features.tsv"))]) + return X, bc, genes + + +def cellset(p): + with _open(p) as fh: + return set(l.split("\t")[0].split("-")[0].strip() for l in fh if l.strip()) + + +def revcomp(s): + return s.translate(str.maketrans("ACGT", "TGCA"))[::-1] + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--rustar-gene", required=True) + ap.add_argument("--rustar-genefull", required=True) + ap.add_argument("--cellranger-raw", required=True) + ap.add_argument("--rustar-ed-cells", required=True) + ap.add_argument("--cr-ed-cells", required=True) + ap.add_argument("--cr-native-cells", required=True) + ap.add_argument("--out", required=True) + a = ap.parse_args() + os.makedirs(a.out, exist_ok=True) + out = {} + + # ---- A.
intron effect: rustar Gene vs GeneFull ---- + Xg, bcg, _ = load(a.rustar_gene) + tot_gene = int(Xg.sum()) + g_by_bc = dict(zip(bcg, np.asarray(Xg.sum(1)).ravel())) + del Xg + Xf, bcf, genes_f = load(a.rustar_genefull) + tot_genefull = int(Xf.sum()) + f_by_bc = dict(zip(bcf, np.asarray(Xf.sum(1)).ravel())) + print(f"[A] intron effect (all barcodes):") + print(f" rustar Gene total UMI = {tot_gene:,}") + print(f" rustar GeneFull total UMI = {tot_genefull:,} " + f"(+{100*(tot_genefull-tot_gene)/tot_gene:.1f}%)") + out["intron_effect"] = {"gene_total_umi": tot_gene, "genefull_total_umi": tot_genefull, + "pct_increase": round(100*(tot_genefull-tot_gene)/tot_gene, 2)} + + # ---- B. raw parity: rustar GeneFull vs CellRanger raw ---- + Xc, bcc, genes_c = load(a.cellranger_raw) + # 5' chemistry: CellRanger may report RC barcodes — detect against rustar. + rust_set = set(bcf) + ov_plain = len(set(bcc) & rust_set) + bcc_rc = np.array([revcomp(b) for b in bcc]) + ov_rc = len(set(bcc_rc) & rust_set) + if ov_rc > ov_plain: + bcc = bcc_rc + cr_orient = "reverse-complement" + else: + cr_orient = "as-reported" + print(f"\n[B] CellRanger barcode orientation vs rustar: {cr_orient} " + f"(overlap plain={ov_plain:,} rc={ov_rc:,})") + tot_cr = int(Xc.sum()) + print(f" rustar GeneFull raw total UMI = {tot_genefull:,}, genes={ (np.asarray(Xf.sum(0)).ravel()>0).sum():,}") + print(f" CellRanger raw total UMI = {tot_cr:,}, genes={ (np.asarray(Xc.sum(0)).ravel()>0).sum():,}") + c_by_bc = dict(zip(bcc, np.asarray(Xc.sum(1)).ravel())) + out["raw_parity"] = {"rustar_genefull_total_umi": tot_genefull, "cellranger_total_umi": tot_cr, + "cr_orientation": cr_orient} + + # ---- C. 
cell-set agreement ---- + r_ed = cellset(a.rustar_ed_cells) + cr_ed = cellset(a.cr_ed_cells) + cr_nat = cellset(a.cr_native_cells) + if cr_orient == "reverse-complement": + cr_ed = {revcomp(b) for b in cr_ed} + cr_nat = {revcomp(b) for b in cr_nat} + + def jac(x, y): + i, u = len(x & y), len(x | y) + return {"a": len(x), "b": len(y), "shared": i, "jaccard": round(i/u, 4) if u else None, + "a_only": len(x - y), "b_only": len(y - x)} + + print("\n[C] cell-set agreement:") + out["cell_sets"] = {} + for label, x, y in [("rustar-ED vs CR-raw-ED (same algo)", r_ed, cr_ed), + ("rustar-ED vs CR-native-filtered", r_ed, cr_nat), + ("CR-raw-ED vs CR-native-filtered", cr_ed, cr_nat)]: + j = jac(x, y) + print(f" {label:<38}: a={j['a']:,} b={j['b']:,} shared={j['shared']:,} " + f"jaccard={j['jaccard']}") + out["cell_sets"][label] = j + + # ---- D. per-cell UMI correlation on shared (rustar-ED ∩ CR-native) ---- + shared = sorted(r_ed & cr_nat) + xs = [f_by_bc.get(b, 0) for b in shared] + ys = [c_by_bc.get(b, 0) for b in shared] + if len(shared) > 2: + r = float(np.corrcoef(xs, ys)[0, 1]) + print(f"\n[D] per-cell UMI corr (rustar GeneFull vs CR raw) on {len(shared):,} shared " + f"filtered cells: r={r:.4f}") + out["per_cell_corr"] = {"shared_cells": len(shared), "pearson_r": round(r, 4)} + + # ---- write filtered h5ad ---- + def write_h5ad(name, X, bc, genes, keep_set): + keep = np.array([b in keep_set for b in bc]) + Xk = X[keep] + A = ad.AnnData(X=Xk, obs=pd.DataFrame(index=bc[keep]), var=pd.DataFrame(index=genes)) + A.obs["n_umi"] = np.asarray(Xk.sum(1)).ravel() + A.obs["n_genes"] = np.asarray((Xk > 0).sum(1)).ravel() + p = os.path.join(a.out, f"{name}.h5ad") + A.write_h5ad(p) + print(f" wrote {p} ({A.n_obs:,} cells)") + return A.n_obs + + print("\n[E] writing EmptyDrops-filtered h5ad:") + out["h5ad"] = { + "rustar_genefull_ed": write_h5ad("rustar.GeneFull.emptydrops", Xf, bcf, genes_f, r_ed), + "cellranger_native": write_h5ad("CellRanger.filtered", Xc, bcc, genes_c, cr_nat), + 
} + + with open(os.path.join(a.out, "genefull_h5_compare.json"), "w") as fh: + json.dump(out, fh, indent=2, default=str) + print(f"\nwrote {a.out}/genefull_h5_compare.json") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/solo_sj_multi_compare.py b/test/solo_sj_multi_compare.py new file mode 100644 index 0000000..75fd9b5 --- /dev/null +++ b/test/solo_sj_multi_compare.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +"""Diff rustar vs STARsolo SJ-feature and --soloMultiMappers matrices. + +Both tools index barcodes by the same sorted whitelist (columns align directly) +and genes by the same GTF order (Gene rows align). SJ junctions differ per tool +(each has its own SJ.out.tab), so SJ rows are matched by (chr,start,end). + +Reports, per matrix: shared rows/cols, total counts, Pearson r over shared +entries, and the fraction of shared entries that match exactly. + +Usage: + solo_sj_multi_compare.py --rustar <rustar Solo.out dir> --starsolo <STARsolo Solo.out dir> +""" +import argparse +import gzip +import os +import sys + +import numpy as np +import scipy.io +import scipy.sparse as sp + + +def _open(p): + return gzip.open(p, "rb") if p.endswith(".gz") else open(p, "rb") + + +def _find(d, base): + for c in (base, base + ".gz"): + p = os.path.join(d, c) + if os.path.exists(p): + return p + return None + + +def load_mtx(d, name="matrix.mtx"): + p = _find(d, name) + if p is None: + return None + with _open(p) as fh: + m = scipy.io.mmread(fh).tocsr() # features x barcodes + return m + + +def load_features_keys(d): + """SJ features.tsv → list of (chr, start, end) per row. STARsolo symlinks + features.tsv → SJ.out.tab (run root); fall back to that if the symlink is + broken (it points at the in-container path).""" + p = _find(d, "features.tsv") + if p is None or not os.path.exists(p): + # d = .../Solo.out/SJ/raw → run root is three levels up.
+ alt = os.path.join(d, "..", "..", "..", "SJ.out.tab") + p = alt if os.path.exists(alt) else None + keys = [] + op = gzip.open(p, "rt") if p.endswith(".gz") else open(p) + with op as fh: + for line in fh: + f = line.rstrip("\n").split("\t") + keys.append((f[0], f[1], f[2])) + return keys + + +def compare_aligned(name, A, B): + """A, B are features×barcodes with identical row+col indexing.""" + if A is None or B is None: + print(f"[{name}] missing matrix"); return + r = min(A.shape[0], B.shape[0]) + c = min(A.shape[1], B.shape[1]) + A = A[:r, :c] + B = B[:r, :c] + da = np.asarray(A.sum()); db = np.asarray(B.sum()) + # union of nonzero coords + U = (A != 0).astype(np.int8) + (B != 0).astype(np.int8) + coo = U.tocoo() + av = np.asarray(A[coo.row, coo.col]).ravel() + bv = np.asarray(B[coo.row, coo.col]).ravel() + rr = np.corrcoef(av, bv)[0, 1] if len(av) > 1 else float("nan") + exact = np.mean(np.isclose(av, bv, atol=1e-4)) if len(av) else float("nan") + print(f"[{name}] rustar_total={float(da):,.1f} star_total={float(db):,.1f} " + f"shared_entries={len(av):,} r={rr:.5f} exact={exact:.4%}") + + +def compare_sj(rdir, sdir): + ra = load_mtx(rdir); sa = load_mtx(sdir) + if ra is None or sa is None: + print("[SJ] missing matrix"); return + rk = load_features_keys(rdir); sk = load_features_keys(sdir) + print(f"[SJ] rustar junctions={len(rk):,} star junctions={len(sk):,}") + sidx = {k: i for i, k in enumerate(sk)} + shared = [(i, sidx[k]) for i, k in enumerate(rk) if k in sidx] + print(f"[SJ] shared junctions (by chr/start/end) = {len(shared):,} " + f"({len(shared)/max(len(rk),1):.1%} of rustar)") + if not shared: + return + rrows = [i for i, _ in shared] + srows = [j for _, j in shared] + c = min(ra.shape[1], sa.shape[1]) + Rm = ra[rrows, :c] + Sm = sa[srows, :c] + U = (Rm != 0).astype(np.int8) + (Sm != 0).astype(np.int8) + coo = U.tocoo() + av = np.asarray(Rm[coo.row, coo.col]).ravel() + bv = np.asarray(Sm[coo.row, coo.col]).ravel() + rr = np.corrcoef(av, bv)[0, 1] if 
len(av) > 1 else float("nan") + exact = np.mean(av == bv) if len(av) else float("nan") + print(f"[SJ] on shared junctions: rustar_total={float(Rm.sum()):,} " + f"star_total={float(Sm.sum()):,} shared_entries={len(av):,} " + f"r={rr:.5f} exact={exact:.4%}") + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--rustar", required=True, help="rustar Solo.out dir") + ap.add_argument("--starsolo", required=True, help="STARsolo Solo.out dir") + a = ap.parse_args() + + rg = os.path.join(a.rustar, "Gene", "raw") + sg = os.path.join(a.starsolo, "Gene", "raw") + print("=== Gene (unique) matrix sanity ===") + compare_aligned("Gene", load_mtx(rg), load_mtx(sg)) + + print("\n=== UniqueAndMult (--soloMultiMappers) ===") + for method in ("Uniform", "PropUnique", "Rescue", "EM"): + fn = f"UniqueAndMult-{method}.mtx" + compare_aligned(method, load_mtx(rg, fn), load_mtx(sg, fn)) + + print("\n=== SJ feature ===") + compare_sj(os.path.join(a.rustar, "SJ", "raw"), + os.path.join(a.starsolo, "SJ", "raw")) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/solo_summary_compare.py b/test/solo_summary_compare.py new file mode 100644 index 0000000..4b880b0 --- /dev/null +++ b/test/solo_summary_compare.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +"""Cross-compare CellRanger-style summary metrics across rustar / STARsolo / +CellRanger. + +rustar and STARsolo emit `Solo.out/<feature>/Summary.csv` (key,value with +fractions in [0,1]); CellRanger emits `metrics_summary.csv` (one header row + one +value row, percentages like "53.5%" and comma-grouped integers). This pulls the +shared metrics into one table — genome/exon/intron/intergenic mapping rates plus +per-cell UMI/gene stats.
+ +Usage: + solo_summary_compare.py \ + --rustar <rustar Summary.csv> \ + --starsolo <STARsolo Summary.csv> \ + --cellranger <CellRanger metrics_summary.csv> +""" +import argparse +import csv +import sys + + +def load_summary_csv(path): + """rustar/STARsolo Summary.csv -> {key: float-or-int}.""" + d = {} + with open(path) as fh: + for line in fh: + if "," not in line: + continue + k, v = line.rstrip("\n").split(",", 1) + try: + d[k] = float(v) if "." in v else int(v) + except ValueError: + d[k] = v + return d + + +def load_cr_metrics(path): + """CellRanger metrics_summary.csv -> {key: float} (percents -> fraction).""" + with open(path) as fh: + rows = list(csv.reader(fh)) + keys, vals = rows[0], rows[1] + out = {} + for k, v in zip(keys, vals): + v = v.strip() + if v.endswith("%"): + out[k] = float(v[:-1]) / 100.0 + else: + try: + out[k] = float(v.replace(",", "")) + except ValueError: + out[k] = v + return out + + +def fmt_pct(x): + return f"{x*100:.1f}%" if isinstance(x, (int, float)) else str(x) + + +def fmt_int(x): + return f"{int(x):,}" if isinstance(x, (int, float)) else str(x) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--rustar", required=True, help="rustar GeneFull Summary.csv") + ap.add_argument("--starsolo", required=True, help="STARsolo GeneFull Summary.csv") + ap.add_argument("--cellranger", required=True, help="CellRanger metrics_summary.csv") + ap.add_argument("--feature", default="GeneFull") + a = ap.parse_args() + + R = load_summary_csv(a.rustar) + S = load_summary_csv(a.starsolo) + C = load_cr_metrics(a.cellranger) + f = a.feature + + # (label, rustar key, starsolo key, cellranger key, formatter) + pct = fmt_pct + intg = fmt_int + rows = [ + ("Valid barcodes", "Reads With Valid Barcodes", "Reads With Valid Barcodes", "Valid Barcodes", pct), + ("Sequencing saturation", "Sequencing Saturation", "Sequencing Saturation", "Sequencing Saturation", pct), + ("Reads mapped to genome (U+M)", "Reads Mapped to Genome: Unique+Multiple", "Reads Mapped to Genome: Unique+Multiple", "Reads Mapped to Genome", pct),
+ (" ... exonic", "Reads Mapped Confidently to Exonic Regions", None, "Reads Mapped Confidently to Exonic Regions", pct), + (" ... intronic", "Reads Mapped Confidently to Intronic Regions", None, "Reads Mapped Confidently to Intronic Regions", pct), + (" ... intergenic", "Reads Mapped Confidently to Intergenic Regions", None, "Reads Mapped Confidently to Intergenic Regions", pct), + ("Reads antisense to gene", "Reads Mapped Antisense to Gene", None, "Reads Mapped Antisense to Gene", pct), + ("Estimated number of cells", "Estimated Number of Cells", "Estimated Number of Cells", "Estimated Number of Cells", intg), + ("Mean reads / cell", "Mean Reads per Cell", "Mean Reads per Cell", "Mean Reads per Cell", intg), + (f"Median genes / cell", f"Median {f} per Cell", f"Median {f} per Cell", "Median Genes per Cell", intg), + ("Median UMI / cell", "Median UMI per Cell", "Median UMI per Cell", "Median UMI Counts per Cell", intg), + ("Total genes detected", f"Total {f} Detected", f"Total {f} Detected", "Total Genes Detected", intg), + ("Fraction reads in cells", "Fraction of Unique Reads in Cells", "Fraction of Unique Reads in Cells", "Fraction Reads in Cells", pct), + ] + + w = 34 + print(f"\nCross-tool summary ({f} for rustar/STARsolo; CellRanger raw is intron-inclusive)\n") + print(f"{'metric':<{w}}{'rustar':>14}{'STARsolo':>14}{'CellRanger':>14}") + print("-" * (w + 42)) + for label, rk, sk, ck, fn in rows: + rv = fn(R.get(rk)) if rk and rk in R else "—" + sv = fn(S.get(sk)) if sk and sk in S else "—" + cv = fn(C.get(ck)) if ck and ck in C else "—" + print(f"{label:<{w}}{rv:>14}{sv:>14}{cv:>14}") + print() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/alignment_features.rs b/tests/alignment_features.rs index 27e3a8d..28525c6 100644 --- a/tests/alignment_features.rs +++ b/tests/alignment_features.rs @@ -879,3 +879,792 @@ fn test_bare_dot_prefix_is_literal_string() { } assert!(count >= 1, "expected at least 1 BAM record, got {count}"); } + +// 
--------------------------------------------------------------------------- +// Test 9 — STARsolo (Phase 14.1–14.4): barcode parse, CB match, gene assign, +// UMI dedup, raw count-matrix output +// --------------------------------------------------------------------------- + +#[test] +fn test_starsolo_gene_matrix() { + let tmpdir = TempDir::new().unwrap(); + let genome = build_genome(); + let fasta = write_fasta(&tmpdir, &genome); + let gtf = write_gtf(&tmpdir); + + let genome_dir = tmpdir.path().join("genome"); + build_index(&fasta, &genome_dir, "7", Some(&gtf)); + + // cDNA reads (R2): 50 bp from Exon1 of gene G1 (genome[10000..10050]), + // so each maps uniquely on the + strand inside G1 → Forward sense. + let cdna_path = tmpdir.path().join("cdna.fq"); + let barcode_path = tmpdir.path().join("barcode.fq"); + let wl_path = tmpdir.path().join("whitelist.txt"); + + let cb = "AAAACCCCGGGGTTTT"; // 16 bp, sorts first in the whitelist + // 8 reads, one cell, two well-separated UMI clouds (Hamming distance 10 + // apart, 4 reads each) → 1MM_All collapses each cloud to 1 molecule → 2.
+ let umi_a = "ACGTACGTAC"; + let umi_b = "TGCATGCATG"; + let n_reads = 8usize; + { + let mut cf = fs::File::create(&cdna_path).unwrap(); + let mut bf = fs::File::create(&barcode_path).unwrap(); + let exon1 = &genome[10000..10050]; + for i in 0..n_reads { + writeln!(cf, "@read{i}").unwrap(); + cf.write_all(exon1).unwrap(); + writeln!(cf, "\n+\n{}", "I".repeat(50)).unwrap(); + + let umi = if i < 4 { umi_a } else { umi_b }; + writeln!(bf, "@read{i}").unwrap(); + writeln!(bf, "{cb}{umi}").unwrap(); + writeln!(bf, "+\n{}", "I".repeat(26)).unwrap(); + } + } + { + let mut wf = fs::File::create(&wl_path).unwrap(); + writeln!(wf, "{cb}").unwrap(); + writeln!(wf, "CCCCGGGGTTTTAAAA").unwrap(); // decoys + writeln!(wf, "GGGGTTTTAAAACCCC").unwrap(); + } + + let output_dir = tmpdir.path().join("out_solo"); + fs::create_dir_all(&output_dir).unwrap(); + let prefix = format!("{}/", output_dir.display()); + + let assert = cargo_bin_cmd!("rustar-aligner") + .env("RUST_LOG", "info") + .args([ + "--runMode", + "alignReads", + "--genomeDir", + genome_dir.to_str().unwrap(), + "--readFilesIn", + cdna_path.to_str().unwrap(), + barcode_path.to_str().unwrap(), + "--soloType", + "CB_UMI_Simple", + "--soloCBwhitelist", + wl_path.to_str().unwrap(), + "--soloFeatures", + "Gene", + "--sjdbGTFfile", + gtf.to_str().unwrap(), + "--outFileNamePrefix", + &prefix, + ]) + .assert() + .success(); + + // cDNA alignments are emitted like a normal SE run. + let sam_path = output_dir.join("Aligned.out.sam"); + assert!(sam_path.exists(), "Aligned.out.sam not found"); + assert!( + count_sam_records(&sam_path) >= n_reads, + "expected >= {n_reads} cDNA alignment records" + ); + + // 8 reads collected, all exact CB matches. 
+ let stderr = String::from_utf8_lossy(&assert.get_output().stderr).to_string(); + assert!( + stderr.contains("collected 8 resolved"), + "expected 8 resolved solo records in log, stderr was:\n{stderr}" + ); + assert!( + stderr.contains("exact=8"), + "expected 8 exact CB matches in log, stderr was:\n{stderr}" + ); + + // Raw matrix output. + let raw = output_dir.join("Solo.out").join("Gene").join("raw"); + let features = fs::read_to_string(raw.join("features.tsv")).unwrap(); + let barcodes = fs::read_to_string(raw.join("barcodes.tsv")).unwrap(); + let matrix = fs::read_to_string(raw.join("matrix.mtx")).unwrap(); + + // One gene G1 with a name column + feature type. + assert_eq!(features.lines().count(), 1); + assert!( + features.starts_with("G1\tG1\tGene Expression"), + "unexpected features.tsv:\n{features}" + ); + // Three whitelist barcodes; the assayed CB sorts first. + assert_eq!(barcodes.lines().count(), 3); + assert_eq!(barcodes.lines().next().unwrap(), cb); + + // MatrixMarket: header, dims "1 3 1" (1 gene × 3 barcodes, 1 entry), + // single entry "1 1 2" (gene 1, cell 1, 2 deduped molecules). + let mtx_lines: Vec<&str> = matrix.lines().collect(); + assert!( + mtx_lines[0].starts_with("%%MatrixMarket matrix coordinate integer general"), + "unexpected mtx banner: {}", + mtx_lines[0] + ); + let dims = mtx_lines.iter().find(|l| !l.starts_with('%')).unwrap(); + assert_eq!(*dims, "1 3 1", "unexpected matrix dimensions"); + let entry = mtx_lines.last().unwrap(); + assert_eq!( + *entry, "1 1 2", + "expected 2 deduped molecules for G1 in cell 1" + ); + + // The default --soloCellFilter (CellRanger2.2) also writes a filtered/ matrix + // containing only the called cell (the one assayed barcode), column-renumbered. 
+ let filt = output_dir.join("Solo.out").join("Gene").join("filtered"); + let f_barcodes = fs::read_to_string(filt.join("barcodes.tsv")).unwrap(); + assert_eq!(f_barcodes.lines().count(), 1, "expected 1 filtered cell"); + assert_eq!(f_barcodes.lines().next().unwrap(), cb); + let f_matrix = fs::read_to_string(filt.join("matrix.mtx")).unwrap(); + let f_dims = f_matrix.lines().find(|l| !l.starts_with('%')).unwrap(); + assert_eq!(f_dims, "1 1 1", "unexpected filtered matrix dimensions"); + assert_eq!(f_matrix.lines().last().unwrap(), "1 1 2"); + + // A CellRanger-style summary is written per feature. + let summary = + fs::read_to_string(output_dir.join("Solo.out").join("Gene").join("Summary.csv")).unwrap(); + assert!( + summary.contains("Estimated Number of Cells,1"), + "summary:\n{summary}" + ); +} + +// --------------------------------------------------------------------------- +// Test 9b — STARsolo SJ (splice-junction) feature +// +// Spliced cDNA reads (last 25 bp of Exon1 + first 25 bp of Exon2) cross the +// planted GT-AG intron, producing one junction. --soloFeatures SJ must write a +// Solo.out/SJ/raw matrix whose features.tsv equals SJ.out.tab and whose single +// junction row carries the deduped molecule count for the one cell. +// --------------------------------------------------------------------------- +#[test] +fn test_starsolo_sj_feature() { + let tmpdir = TempDir::new().unwrap(); + let genome = build_genome(); + let fasta = write_fasta(&tmpdir, &genome); + let gtf = write_gtf(&tmpdir); + let genome_dir = tmpdir.path().join("genome"); + build_index(&fasta, &genome_dir, "7", Some(&gtf)); + + let cdna_path = tmpdir.path().join("cdna.fq"); + let barcode_path = tmpdir.path().join("barcode.fq"); + let wl_path = tmpdir.path().join("whitelist.txt"); + let cb = "AAAACCCCGGGGTTTT"; + let umi = "ACGTACGTAC"; + // Spliced read: 25 bp from end of Exon1 + 25 bp from start of Exon2, which + // aligns across the intron [10050,10250) → one GT-AG junction.
+ let mut spliced = genome[10025..10050].to_vec(); + spliced.extend_from_slice(&genome[10250..10275]); + { + let mut cf = fs::File::create(&cdna_path).unwrap(); + let mut bf = fs::File::create(&barcode_path).unwrap(); + for i in 0..6 { + writeln!(cf, "@r{i}").unwrap(); + cf.write_all(&spliced).unwrap(); + writeln!(cf, "\n+\n{}", "I".repeat(50)).unwrap(); + writeln!(bf, "@r{i}\n{cb}{umi}\n+\n{}", "I".repeat(26)).unwrap(); + } + let mut wf = fs::File::create(&wl_path).unwrap(); + writeln!(wf, "{cb}\nCCCCGGGGTTTTAAAA\nGGGGTTTTAAAACCCC").unwrap(); + } + + let output_dir = tmpdir.path().join("out_sj"); + fs::create_dir_all(&output_dir).unwrap(); + let prefix = format!("{}/", output_dir.display()); + cargo_bin_cmd!("rustar-aligner") + .args([ + "--runMode", + "alignReads", + "--genomeDir", + genome_dir.to_str().unwrap(), + "--readFilesIn", + cdna_path.to_str().unwrap(), + barcode_path.to_str().unwrap(), + "--soloType", + "CB_UMI_Simple", + "--soloCBwhitelist", + wl_path.to_str().unwrap(), + "--soloFeatures", + "Gene", + "SJ", + "--soloStrand", + "Forward", + "--sjdbGTFfile", + gtf.to_str().unwrap(), + "--outFileNamePrefix", + &prefix, + ]) + .assert() + .success(); + + let sj_raw = output_dir.join("Solo.out").join("SJ").join("raw"); + let features = fs::read_to_string(sj_raw.join("features.tsv")).unwrap(); + let sj_tab = fs::read_to_string(output_dir.join("SJ.out.tab")).unwrap(); + // SJ feature file mirrors SJ.out.tab and contains exactly the one junction. + assert_eq!(features, sj_tab, "SJ features.tsv must equal SJ.out.tab"); + assert_eq!(features.lines().count(), 1, "expected one junction"); + assert!( + features.starts_with("chr1\t10051\t10250\t"), + "unexpected junction: {features}" + ); + // Matrix: 1 junction × 3 barcodes, single entry "1 1 1" (one deduped molecule + // — all 6 reads share one UMI in one cell). 
+ let matrix = fs::read_to_string(sj_raw.join("matrix.mtx")).unwrap(); + let dims = matrix.lines().find(|l| !l.starts_with('%')).unwrap(); + assert_eq!(dims, "1 3 1", "unexpected SJ matrix dims"); + assert_eq!(matrix.lines().last().unwrap(), "1 1 1"); +} + +// --------------------------------------------------------------------------- +// Test 9c — STARsolo --soloMultiMappers (gene-ambiguous distribution) +// +// G1 and G3 share Exon1 (so a read there is ambiguous {G1,G3}); G2 has Exon2. +// One cell has a unique G2 molecule + one ambiguous {G1,G3} molecule. The unique +// matrix counts only G2; UniqueAndMult-Uniform spreads the ambiguous molecule +// 0.5/0.5 to G1 and G3 while keeping G2 at 1. +// --------------------------------------------------------------------------- +#[test] +fn test_starsolo_multimappers() { + let tmpdir = TempDir::new().unwrap(); + let genome = build_genome(); + let fasta = write_fasta(&tmpdir, &genome); + // GTF order: G1, G3 (both Exon1), G2 (Exon2) → gene indices 0,1,2. + let gtf = tmpdir.path().join("multi.gtf"); + { + let mut f = fs::File::create(&gtf).unwrap(); + for g in ["G1", "G3"] { + writeln!( + f, + "chr1\tt\texon\t10001\t10050\t.\t+\t.\tgene_id \"{g}\"; transcript_id \"{g}t\";" + ) + .unwrap(); + } + writeln!( + f, + "chr1\tt\texon\t10251\t10300\t.\t+\t.\tgene_id \"G2\"; transcript_id \"G2t\";" + ) + .unwrap(); + } + let genome_dir = tmpdir.path().join("genome"); + build_index(&fasta, &genome_dir, "7", Some(&gtf)); + + let cdna_path = tmpdir.path().join("cdna.fq"); + let barcode_path = tmpdir.path().join("barcode.fq"); + let wl_path = tmpdir.path().join("whitelist.txt"); + let cb = "AAAACCCCGGGGTTTT"; + { + let mut cf = fs::File::create(&cdna_path).unwrap(); + let mut bf = fs::File::create(&barcode_path).unwrap(); + // 4 reads in Exon2 → unique G2 (UMI a); 4 reads in Exon1 → ambiguous (UMI b).
+ let exon2 = &genome[10250..10300]; + let exon1 = &genome[10000..10050]; + for (i, (seq, umi)) in [(exon2, "ACGTACGTAC"), (exon1, "TGCATGCATG")] + .iter() + .flat_map(|x| std::iter::repeat_n(*x, 4)) + .enumerate() + { + writeln!(cf, "@r{i}").unwrap(); + cf.write_all(seq).unwrap(); + writeln!(cf, "\n+\n{}", "I".repeat(50)).unwrap(); + writeln!(bf, "@r{i}\n{cb}{umi}\n+\n{}", "I".repeat(26)).unwrap(); + } + let mut wf = fs::File::create(&wl_path).unwrap(); + writeln!(wf, "{cb}\nCCCCGGGGTTTTAAAA\nGGGGTTTTAAAACCCC").unwrap(); + } + + let output_dir = tmpdir.path().join("out_mm"); + fs::create_dir_all(&output_dir).unwrap(); + let prefix = format!("{}/", output_dir.display()); + cargo_bin_cmd!("rustar-aligner") + .args([ + "--runMode", + "alignReads", + "--genomeDir", + genome_dir.to_str().unwrap(), + "--readFilesIn", + cdna_path.to_str().unwrap(), + barcode_path.to_str().unwrap(), + "--soloType", + "CB_UMI_Simple", + "--soloCBwhitelist", + wl_path.to_str().unwrap(), + "--soloFeatures", + "Gene", + "--soloStrand", + "Forward", + "--soloMultiMappers", + "Uniform", + "--sjdbGTFfile", + gtf.to_str().unwrap(), + "--outFileNamePrefix", + &prefix, + ]) + .assert() + .success(); + + let raw = output_dir.join("Solo.out").join("Gene").join("raw"); + // Unique matrix: only G2 (gene index 2 → row 3), count 1. + let matrix = fs::read_to_string(raw.join("matrix.mtx")).unwrap(); + assert_eq!( + matrix.lines().last().unwrap(), + "3 1 1", + "unique matrix:\n{matrix}" + ); + // UniqueAndMult-Uniform: G1=0.5, G3=0.5, G2=1. 
+ let um = fs::read_to_string(raw.join("UniqueAndMult-Uniform.mtx")).unwrap(); + assert!(um.contains("coordinate real general"), "um header:\n{um}"); + let rows: Vec<&str> = um.lines().filter(|l| !l.starts_with('%')).skip(1).collect(); + assert!(rows.contains(&"1 1 0.50000"), "expected G1 0.5, got:\n{um}"); + assert!(rows.contains(&"2 1 0.50000"), "expected G3 0.5, got:\n{um}"); + assert!(rows.contains(&"3 1 1"), "expected G2 1, got:\n{um}"); +} + +// --------------------------------------------------------------------------- +// Test 9d — STARsolo SmartSeq (plate-based, manifest, no UMI) +// +// Two "cells" (manifest entries) of Exon1 reads → gene G1. With no UMIs each read +// is a count, so the matrix is G1 × {CellA,CellB} = read counts (5, 3). +// --------------------------------------------------------------------------- +#[test] +fn test_starsolo_smartseq() { + let tmpdir = TempDir::new().unwrap(); + let genome = build_genome(); + let fasta = write_fasta(&tmpdir, &genome); + let gtf = write_gtf(&tmpdir); + let genome_dir = tmpdir.path().join("genome"); + build_index(&fasta, &genome_dir, "7", Some(&gtf)); + + let exon1 = &genome[10000..10050]; + let write_cell = |name: &str, n: usize| -> PathBuf { + let p = tmpdir.path().join(name); + let mut f = fs::File::create(&p).unwrap(); + for i in 0..n { + writeln!(f, "@{name}_{i}").unwrap(); + f.write_all(exon1).unwrap(); + writeln!(f, "\n+\n{}", "I".repeat(50)).unwrap(); + } + p + }; + let a = write_cell("cellA.fq", 5); + let b = write_cell("cellB.fq", 3); + let manifest = tmpdir.path().join("manifest.tsv"); + fs::write( + &manifest, + format!("{}\t-\tCellA\n{}\t-\tCellB\n", a.display(), b.display()), + ) + .unwrap(); + + let output_dir = tmpdir.path().join("out_ss"); + fs::create_dir_all(&output_dir).unwrap(); + let prefix = format!("{}/", output_dir.display()); + cargo_bin_cmd!("rustar-aligner") + .args([ + "--runMode", + "alignReads", + "--genomeDir", + genome_dir.to_str().unwrap(), + "--soloType", + "SmartSeq", +
"--readFilesManifest", + manifest.to_str().unwrap(), + "--soloStrand", + "Forward", + "--sjdbGTFfile", + gtf.to_str().unwrap(), + "--outFileNamePrefix", + &prefix, + ]) + .assert() + .success(); + + let raw = output_dir.join("Solo.out").join("Gene").join("raw"); + let barcodes = fs::read_to_string(raw.join("barcodes.tsv")).unwrap(); + assert_eq!(barcodes, "CellA\nCellB\n"); + let matrix = fs::read_to_string(raw.join("matrix.mtx")).unwrap(); + let dims = matrix.lines().find(|l| !l.starts_with('%')).unwrap(); + assert_eq!(dims, "1 2 2", "SmartSeq matrix dims:\n{matrix}"); + let entries: Vec<&str> = matrix + .lines() + .filter(|l| !l.starts_with('%')) + .skip(1) + .collect(); + assert!(entries.contains(&"1 1 5"), "expected CellA G1=5:\n{matrix}"); + assert!(entries.contains(&"1 2 3"), "expected CellB G1=3:\n{matrix}"); +} + +// --------------------------------------------------------------------------- +// Test 9d-PE — STARsolo SmartSeq paired-end (fragment counts) +// +// One cell, 4 read pairs: mate1 in Exon1, mate2 in (reverse-complement) Exon2 → +// a proper FR pair on gene G1. Each fragment is counted once (no UMI) → G1 = 4. 
+// --------------------------------------------------------------------------- +#[test] +fn test_starsolo_smartseq_paired() { + let tmpdir = TempDir::new().unwrap(); + let genome = build_genome(); + let fasta = write_fasta(&tmpdir, &genome); + let gtf = write_gtf(&tmpdir); + let genome_dir = tmpdir.path().join("genome"); + build_index(&fasta, &genome_dir, "7", Some(&gtf)); + + let r1_path = tmpdir.path().join("r1.fq"); + let r2_path = tmpdir.path().join("r2.fq"); + let mate1 = &genome[10000..10050]; // Exon1, forward + let mate2 = rc(&genome[10250..10300]); // Exon2, reverse-complement (FR mate) + { + let mut f1 = fs::File::create(&r1_path).unwrap(); + let mut f2 = fs::File::create(&r2_path).unwrap(); + for i in 0..4 { + writeln!(f1, "@p{i}").unwrap(); + f1.write_all(mate1).unwrap(); + writeln!(f1, "\n+\n{}", "I".repeat(50)).unwrap(); + writeln!(f2, "@p{i}").unwrap(); + f2.write_all(&mate2).unwrap(); + writeln!(f2, "\n+\n{}", "I".repeat(50)).unwrap(); + } + } + let manifest = tmpdir.path().join("manifest.tsv"); + fs::write( + &manifest, + format!("{}\t{}\tCellPE\n", r1_path.display(), r2_path.display()), + ) + .unwrap(); + + let output_dir = tmpdir.path().join("out_sspe"); + fs::create_dir_all(&output_dir).unwrap(); + let prefix = format!("{}/", output_dir.display()); + cargo_bin_cmd!("rustar-aligner") + .args([ + "--runMode", + "alignReads", + "--genomeDir", + genome_dir.to_str().unwrap(), + "--soloType", + "SmartSeq", + "--readFilesManifest", + manifest.to_str().unwrap(), + "--soloStrand", + "Unstranded", + "--sjdbGTFfile", + gtf.to_str().unwrap(), + "--outFileNamePrefix", + &prefix, + ]) + .assert() + .success(); + + let raw = output_dir.join("Solo.out").join("Gene").join("raw"); + let matrix = fs::read_to_string(raw.join("matrix.mtx")).unwrap(); + let dims = matrix.lines().find(|l| !l.starts_with('%')).unwrap(); + // One gene (G1) × one cell; 4 fragments counted.
+ assert_eq!(dims, "1 1 1", "PE SmartSeq matrix dims:\n{matrix}"); + assert_eq!( + matrix.lines().last().unwrap(), + "1 1 4", + "expected G1=4 fragments:\n{matrix}" + ); +} + +// --------------------------------------------------------------------------- +// Test 9f — STARsolo Velocyto (spliced / unspliced / ambiguous) +// +// Three reads on gene G1, one per category: a junction-spanning read (spliced), +// a purely intronic read (unspliced), and a wholly-exonic read with no junction +// (ambiguous, per Sullivan 2025). Distinct UMIs → one molecule in each matrix. +// --------------------------------------------------------------------------- +#[test] +fn test_starsolo_velocyto() { + let tmpdir = TempDir::new().unwrap(); + let genome = build_genome(); + let fasta = write_fasta(&tmpdir, &genome); + let gtf = write_gtf(&tmpdir); + let genome_dir = tmpdir.path().join("genome"); + build_index(&fasta, &genome_dir, "7", Some(&gtf)); + + let cdna_path = tmpdir.path().join("cdna.fq"); + let bc_path = tmpdir.path().join("bc.fq"); + let wl_path = tmpdir.path().join("whitelist.txt"); + let cb = "AAAACCCCGGGGTTTT"; + // category → cDNA read + a distinct (non-homopolymer) 12 bp UMI. + let mut spliced = genome[10025..10050].to_vec(); // Exon1 end ... + spliced.extend_from_slice(&genome[10250..10275]); // ...
+ Exon2 start → junction + let reads: [(Vec, &str); 3] = [ + (spliced, "ACGTACGTACGT"), // spliced + (genome[10100..10150].to_vec(), "TGCATGCATGCA"), // intronic → unspliced + (genome[10000..10050].to_vec(), "GATCGATCGATC"), // exonic, no junction → ambiguous + ]; + { + let mut cf = fs::File::create(&cdna_path).unwrap(); + let mut bf = fs::File::create(&bc_path).unwrap(); + for (i, (seq, umi)) in reads.iter().enumerate() { + writeln!(cf, "@r{i}").unwrap(); + cf.write_all(seq).unwrap(); + writeln!(cf, "\n+\n{}", "I".repeat(seq.len())).unwrap(); + writeln!(bf, "@r{i}\n{cb}{umi}\n+\n{}", "I".repeat(28)).unwrap(); + } + fs::write(&wl_path, format!("{cb}\nCCCCGGGGTTTTAAAA\n")).unwrap(); + } + + let output_dir = tmpdir.path().join("out_velo"); + fs::create_dir_all(&output_dir).unwrap(); + let prefix = format!("{}/", output_dir.display()); + cargo_bin_cmd!("rustar-aligner") + .args([ + "--runMode", + "alignReads", + "--genomeDir", + genome_dir.to_str().unwrap(), + "--readFilesIn", + cdna_path.to_str().unwrap(), + bc_path.to_str().unwrap(), + "--soloType", + "CB_UMI_Simple", + "--soloCBwhitelist", + wl_path.to_str().unwrap(), + "--soloCBstart", + "1", + "--soloCBlen", + "16", + "--soloUMIstart", + "17", + "--soloUMIlen", + "12", + "--soloFeatures", + "Velocyto", + "--soloStrand", + "Forward", + "--sjdbGTFfile", + gtf.to_str().unwrap(), + "--outFileNamePrefix", + &prefix, + ]) + .assert() + .success(); + + let raw = output_dir.join("Solo.out").join("Velocyto").join("raw"); + // Each category matrix holds exactly its one molecule for G1 (row 1, col 1). 
+ for name in ["spliced", "unspliced", "ambiguous"] { + let m = fs::read_to_string(raw.join(format!("{name}.mtx"))).unwrap(); + assert_eq!( + m.lines().last().unwrap(), + "1 1 1", + "{name}.mtx should have G1=1:\n{m}" + ); + } +} + +// --------------------------------------------------------------------------- +// Test 9e — STARsolo CB_UMI_Complex (multi-segment barcode) +// +// Barcode read layout: seg1(2bp) + linker(2bp) + seg2(2bp) + UMI(2bp). The cell +// barcode is seg1++seg2 matched against the cartesian product of two segment +// whitelists. All reads share CB=AAGG / UMI=AT → one molecule for gene G1. +// --------------------------------------------------------------------------- +#[test] +fn test_starsolo_cb_umi_complex() { + let tmpdir = TempDir::new().unwrap(); + let genome = build_genome(); + let fasta = write_fasta(&tmpdir, &genome); + let gtf = write_gtf(&tmpdir); + let genome_dir = tmpdir.path().join("genome"); + build_index(&fasta, &genome_dir, "7", Some(>f)); + + let cdna_path = tmpdir.path().join("cdna.fq"); + let bc_path = tmpdir.path().join("bc.fq"); + let wl1 = tmpdir.path().join("wl1.txt"); + let wl2 = tmpdir.path().join("wl2.txt"); + fs::write(&wl1, "AA\nCC\n").unwrap(); // seg1 whitelist + fs::write(&wl2, "GG\nTT\n").unwrap(); // seg2 whitelist + { + let mut cf = fs::File::create(&cdna_path).unwrap(); + let mut bf = fs::File::create(&bc_path).unwrap(); + let exon1 = &genome[10000..10050]; + for i in 0..4 { + writeln!(cf, "@r{i}").unwrap(); + cf.write_all(exon1).unwrap(); + writeln!(cf, "\n+\n{}", "I".repeat(50)).unwrap(); + // seg1=AA, linker=CC, seg2=GG, UMI=AT → CB "AAGG", UMI "AT". 
+ writeln!(bf, "@r{i}\nAACCGGAT\n+\nIIIIIIII").unwrap(); + } + } + + let output_dir = tmpdir.path().join("out_cx"); + fs::create_dir_all(&output_dir).unwrap(); + let prefix = format!("{}/", output_dir.display()); + cargo_bin_cmd!("rustar-aligner") + .args([ + "--runMode", + "alignReads", + "--genomeDir", + genome_dir.to_str().unwrap(), + "--readFilesIn", + cdna_path.to_str().unwrap(), + bc_path.to_str().unwrap(), + "--soloType", + "CB_UMI_Complex", + "--soloCBwhitelist", + wl1.to_str().unwrap(), + wl2.to_str().unwrap(), + "--soloCBposition", + "0_0_0_1", + "0_4_0_5", + "--soloUMIposition", + "0_6_0_7", + "--soloUMIlen", + "2", + "--soloCBmatchWLtype", + "Exact", + "--soloFeatures", + "Gene", + "--soloStrand", + "Forward", + "--sjdbGTFfile", + gtf.to_str().unwrap(), + "--outFileNamePrefix", + &prefix, + ]) + .assert() + .success(); + + let raw = output_dir.join("Solo.out").join("Gene").join("raw"); + // Combined whitelist = {AA,CC}×{GG,TT} = 4 barcodes. The matched cell is AAGG; + // all 4 reads share UMI AT → one molecule for G1. + let matrix = fs::read_to_string(raw.join("matrix.mtx")).unwrap(); + let dims = matrix.lines().find(|l| !l.starts_with('%')).unwrap(); + let parts: Vec<&str> = dims.split_whitespace().collect(); + assert_eq!( + parts[1], "4", + "expected 4 combined-whitelist cells, dims={dims}" + ); + assert_eq!(matrix.lines().last().unwrap(), "1 1 1", "matrix:\n{matrix}"); +} + +// --------------------------------------------------------------------------- +// Test 10 — CellRanger-style STARsolo run (Phase 14.5) +// +// Exercises the full CellRanger 4.x/5.x flag set from STARsolo.md: +// --clipAdapterType CellRanger4 --outFilterScoreMin 30 +// --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts +// --soloUMIfiltering MultiGeneUMI_CR --soloUMIdedup 1MM_CR +// and asserts the raw Gene matrix. The 1MM_CR UMI collapse is the key +// CellRanger-specific behavior verified here. 
// A live differential comparison
// against the real STAR binary is in test/solo_cellranger_diff.py.
// ---------------------------------------------------------------------------

#[test]
fn test_starsolo_cellranger_style_matrix() {
    // Runs the aligner with the CellRanger-matching solo flags on 8 reads from
    // one cell and checks features.tsv, barcodes.tsv and matrix.mtx under
    // Solo.out/Gene/raw.
    let tmpdir = TempDir::new().unwrap();
    let genome = build_genome();
    let fasta = write_fasta(&tmpdir, &genome);
    let gtf = write_gtf(&tmpdir);

    let genome_dir = tmpdir.path().join("genome");
    build_index(&fasta, &genome_dir, "7", Some(&gtf));

    let cdna_path = tmpdir.path().join("cdna.fq");
    let barcode_path = tmpdir.path().join("barcode.fq");
    let wl_path = tmpdir.path().join("whitelist.txt");

    // One cell (CB sorts first), 8 reads in Exon1 of G1. UMIs: M x5 + a 1MM
    // neighbor of M x1 (1MM_CR collapses these to ONE molecule) + N x2 (a second
    // molecule) => 2 deduped molecules for (CB, G1).
    let cb = "AAAACCCCGGGGTTTT";
    let umi_m = "ACGTACGTAC"; // 10 bp (default soloUMIlen)
    let umi_m_1mm = "ACGTACGTAG"; // 1 mismatch from umi_m (last base)
    let umi_n = "TGCATGCATG";
    // (UMI, number of reads carrying it)
    let plan = [(umi_m, 5usize), (umi_m_1mm, 1), (umi_n, 2)];
    {
        let mut cf = fs::File::create(&cdna_path).unwrap();
        let mut bf = fs::File::create(&barcode_path).unwrap();
        let exon1 = &genome[10000..10050];
        let cdna_qual = "I".repeat(exon1.len());
        // Running read index so cDNA and barcode records share the same name.
        let mut i = 0;
        for (umi, n) in plan {
            for _ in 0..n {
                writeln!(cf, "@read{i}").unwrap();
                cf.write_all(exon1).unwrap();
                writeln!(cf, "\n+\n{cdna_qual}").unwrap();
                writeln!(
                    bf,
                    "@read{i}\n{cb}{umi}\n+\n{}",
                    "I".repeat(cb.len() + umi.len())
                )
                .unwrap();
                i += 1;
            }
        }
    }
    {
        let mut wf = fs::File::create(&wl_path).unwrap();
        writeln!(wf, "{cb}").unwrap();
        writeln!(wf, "TTTTGGGGCCCCAAAA").unwrap(); // decoy (sorts after cb)
    }

    let output_dir = tmpdir.path().join("out_cr");
    fs::create_dir_all(&output_dir).unwrap();
    let prefix = format!("{}/", output_dir.display());

    cargo_bin_cmd!("rustar-aligner")
        .args([
            "--runMode",
            "alignReads",
            "--genomeDir",
            genome_dir.to_str().unwrap(),
            "--readFilesIn",
            cdna_path.to_str().unwrap(),
            barcode_path.to_str().unwrap(),
            "--soloType",
            "CB_UMI_Simple",
            "--soloCBwhitelist",
            wl_path.to_str().unwrap(),
            "--soloCBstart",
            "1",
            "--soloCBlen",
            "16",
            "--soloUMIstart",
            "17",
            "--soloUMIlen",
            "10",
            "--soloFeatures",
            "Gene",
            "--sjdbGTFfile",
            gtf.to_str().unwrap(),
            // CellRanger 4.x/5.x matching flags:
            "--clipAdapterType",
            "CellRanger4",
            "--outFilterScoreMin",
            "30",
            "--soloCBmatchWLtype",
            "1MM_multi_Nbase_pseudocounts",
            "--soloUMIfiltering",
            "MultiGeneUMI_CR",
            "--soloUMIdedup",
            "1MM_CR",
            "--outSAMtype",
            "SAM",
            "--outFileNamePrefix",
            &prefix,
        ])
        .assert()
        .success();

    let raw = output_dir.join("Solo.out").join("Gene").join("raw");
    let features = fs::read_to_string(raw.join("features.tsv")).unwrap();
    let barcodes = fs::read_to_string(raw.join("barcodes.tsv")).unwrap();
    let matrix = fs::read_to_string(raw.join("matrix.mtx")).unwrap();

    assert!(features.starts_with("G1\t"), "features.tsv: {features}");
    // Raw output lists every whitelist barcode: the real CB plus the decoy.
    assert_eq!(barcodes.lines().count(), 2);
    assert_eq!(barcodes.lines().next().unwrap(), cb); // CB sorts first

    let lines: Vec<&str> = matrix.lines().collect();
    let dims = lines.iter().find(|l| !l.starts_with('%')).unwrap();
    assert_eq!(
        *dims, "1 2 1",
        "matrix dims (1 gene x 2 barcodes x 1 entry)"
    );
    // 1MM_CR: M(5)+M_1mm(1) collapse to 1 molecule, N(2) is another => 2.
    assert_eq!(
        *lines.last().unwrap(),
        "1 1 2",
        "expected 2 deduped molecules"
    );
}