diff --git a/Cargo.toml b/Cargo.toml index c297033..ebf02f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,10 @@ categories = ["command-line-utilities", "science"] homepage = "https://seqeralabs.github.io/RustQC/" exclude = ["benchmark/", "docs/", "paper/", "tests/", ".github/", "Dockerfile", ".dockerignore", ".pre-commit-config.yaml", "netlify.toml", "CONTRIBUTING.md", "AGENTS.md"] +[lib] +name = "rustqc" +path = "src/lib.rs" + [[bin]] name = "rustqc" path = "src/main.rs" diff --git a/README.md b/README.md index d0f48d3..f576fc7 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,17 @@ cargo install rustqc See the [documentation](https://seqeralabs.github.io/RustQC/) for full usage details, configuration options, output file descriptions, and benchmark results. +## Use as a Rust library + +The crate is also published as a library, so the QC analysis modules (GTF parsing, dupRadar, featureCounts, RSeQC, Qualimap, preseq, samtools-style outputs) can be embedded into other Rust programs: + +```toml +[dependencies] +rustqc = "0.2" +``` + +See the [library guide](https://seqeralabs.github.io/RustQC/usage/library/) and the full API reference on [docs.rs/rustqc](https://docs.rs/rustqc). + ## AI & Provenance RustQC was developed with substantial assistance from AI coding agents (primarily [Claude](https://claude.ai/)), using the upstream tool source code as reference. Correctness is validated by comparing output against the original tools on real sequencing data, not by manual code review alone. See the [AI & Provenance](https://seqeralabs.github.io/RustQC/about/ai-statement/) documentation for full details, including known validation gaps. 
diff --git a/docs/astro.config.mjs b/docs/astro.config.mjs index f58fb4e..4e1c2f4 100644 --- a/docs/astro.config.mjs +++ b/docs/astro.config.mjs @@ -57,6 +57,7 @@ export default defineConfig({ slug: "usage/configuration", }, { label: "Performance & Tuning", slug: "usage/performance" }, + { label: "Rust Library", slug: "usage/library" }, ], }, { diff --git a/docs/src/content/docs/usage/library.mdx b/docs/src/content/docs/usage/library.mdx new file mode 100644 index 0000000..dd74d60 --- /dev/null +++ b/docs/src/content/docs/usage/library.mdx @@ -0,0 +1,102 @@ +--- +title: Rust Library +description: Use RustQC as a Rust library crate, embedding its QC analysis modules in your own programs. +--- + +import { Aside } from "@astrojs/starlight/components"; + +RustQC is published on [crates.io](https://crates.io/crates/rustqc) as both a +binary and a library. The CLI (`rustqc rna ...`) is the primary interface, but +the same analysis modules are also exposed as a library so they can be embedded +into other Rust programs. + +Full API reference: **[docs.rs/rustqc](https://docs.rs/rustqc)**. + +## Adding RustQC as a dependency + +```toml +[dependencies] +rustqc = "0.2" +``` + +`rust-htslib` is linked statically and a small C++ component (used by the preseq +tool) is built from source, so a working C/C++ toolchain (`cc`, `c++`) is +required when building. No runtime dependencies are added beyond what the binary +already needs. + +## What's in the library + +The crate exposes these modules: + +| Module | Contents | +| ----------------------------- | --------------------------------------------------------------------------------------------------------- | +| [`gtf`][docs-gtf] | GTF gene-annotation parsing. `Gene`, `Transcript`, `Exon`, `parse_gtf`. | +| [`io`][docs-io] | Transparent gzip-aware reader, FNV-1a hashing, number formatters. | +| [`config`][docs-config] | Configuration types mirroring the CLI's YAML config file. 
| +| [`summary`][docs-summary] | Serializable types for the JSON run summary. | +| [`cpu`][docs-cpu] | CPU feature detection and binary-target identification. | +| [`rna`][docs-rna] | RNA-Seq analyses: `dupradar`, `featurecounts`, `qualimap`, `preseq`, `rseqc`. | + +[`Strandedness`][docs-strandedness] lives at the crate root because it is used +across most analysis modules. + +[docs-gtf]: https://docs.rs/rustqc/latest/rustqc/gtf/ +[docs-io]: https://docs.rs/rustqc/latest/rustqc/io/ +[docs-config]: https://docs.rs/rustqc/latest/rustqc/config/ +[docs-summary]: https://docs.rs/rustqc/latest/rustqc/summary/ +[docs-cpu]: https://docs.rs/rustqc/latest/rustqc/cpu/ +[docs-rna]: https://docs.rs/rustqc/latest/rustqc/rna/ +[docs-strandedness]: https://docs.rs/rustqc/latest/rustqc/enum.Strandedness.html + +## Quick examples + +Parse a GTF file: + +```rust +use rustqc::gtf; + +let genes = gtf::parse_gtf("genes.gtf", &[])?; +println!("{} genes parsed", genes.len()); +for (gene_id, gene) in genes.iter().take(3) { + println!("{gene_id}: {} transcripts", gene.transcripts.len()); +} +Ok::<(), anyhow::Error>(()) +``` + +Open a possibly-gzipped annotation or output file with one call: + +```rust +use std::io::BufRead; +use rustqc::io::open_reader; + +let reader = open_reader("counts.tsv.gz")?; +for line in reader.lines() { + println!("{}", line?); +} +Ok::<(), anyhow::Error>(()) +``` + +Use the `Strandedness` enum (it derives `serde::Deserialize` and clap's +`ValueEnum`, so it integrates with both YAML configs and CLI parsers): + +```rust +use rustqc::Strandedness; + +let s = Strandedness::Reverse; +assert_eq!(s.to_string(), "reverse"); +``` + +## Stability + +The library is at `0.2.x` and the public surface is intentionally small. Expect +breaking changes in minor releases until `1.0`. Module visibility may be +narrowed in future versions if internal types are inadvertently exposed. 
+ +<Aside type="note"> + The full single-pass RNA-Seq pipeline (the `run_rna` orchestrator used by the + binary) is not yet exposed as a library entry point, so library consumers + drive individual analyses themselves. Pipeline-level orchestration may be + exposed in a future release; see [issue #72][issue-72]. +</Aside> + +[issue-72]: https://github.com/seqeralabs/RustQC/issues/72 diff --git a/src/cli.rs b/src/cli.rs index 07e50a1..6e6459e 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -10,34 +10,9 @@ //! //! A GTF gene annotation file is required for all analyses. -use clap::{CommandFactory, Parser, Subcommand, ValueEnum}; -use serde::Deserialize; +use clap::{CommandFactory, Parser, Subcommand}; -/// Library strandedness protocol. -/// -/// Determines how read strand is interpreted relative to the gene annotation -/// strand during counting. Accepted CLI values: `unstranded`, `forward`, `reverse`. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, ValueEnum, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum Strandedness { - /// Count reads on either strand (library is not strand-specific). - #[default] - Unstranded, - /// Forward stranded: read 1 maps to the transcript strand. - Forward, - /// Reverse stranded: read 2 maps to the transcript strand (e.g. dUTP). - Reverse, -} - -impl std::fmt::Display for Strandedness { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Strandedness::Unstranded => write!(f, "unstranded"), - Strandedness::Forward => write!(f, "forward"), - Strandedness::Reverse => write!(f, "reverse"), - } - } -} +use rustqc::Strandedness; /// Fast quality control tools for sequencing data, written in Rust. #[derive(Parser, Debug)] @@ -407,7 +382,7 @@ pub fn parse_args() -> Cli { env!("CARGO_PKG_VERSION"), env!("GIT_SHORT_HASH"), env!("BUILD_TIMESTAMP"), - crate::cpu::cpu_info_line(), + rustqc::cpu::cpu_info_line(), ) .into_boxed_str(), ); diff --git a/src/config.rs b/src/config.rs index 7dd93bc..4952a1f 100644 --- a/src/config.rs +++ b/src/config.rs @@ -4,7 +4,7 @@ //! like chromosome name mappings between alignment file and GTF references, //! per-tool output configuration, and tool enable/disable toggles. 
-use crate::cli::Strandedness; +use crate::Strandedness; use anyhow::{Context, Result}; use serde::Deserialize; use serde_yaml_ng::Value; @@ -1213,7 +1213,7 @@ preseq: deep_merge(&mut base, overlay); let m = base.as_mapping().unwrap(); let items = m - .get(&Value::String("items".into())) + .get(Value::String("items".into())) .unwrap() .as_sequence() .unwrap(); @@ -1268,7 +1268,7 @@ preseq: let paths = collect_config_paths(Some("/tmp/nonexistent.yml")); // The -c flag should always be last - assert!(paths.last().unwrap().0 == PathBuf::from("/tmp/nonexistent.yml")); + assert!(paths.last().unwrap().0 == Path::new("/tmp/nonexistent.yml")); assert_eq!(paths.last().unwrap().1, "-c flag"); // Restore diff --git a/src/io.rs b/src/io.rs index 9177ae2..9030aff 100644 --- a/src/io.rs +++ b/src/io.rs @@ -9,6 +9,7 @@ use flate2::read::GzDecoder; use std::fs::File; use std::io::{BufRead, BufReader, Read, Seek}; use std::path::Path; +use std::time::Duration; /// Gzip magic bytes: the first two bytes of any gzip-compressed file. const GZIP_MAGIC: [u8; 2] = [0x1f, 0x8b]; @@ -101,6 +102,56 @@ pub fn format_with_commas(n: u64) -> String { result } +/// Format a count with SI suffixes (e.g. "1.5K", "48.2M", "2.3G"). +/// +/// Used for compact human-readable counts in progress messages and summaries. +pub fn format_count(n: u64) -> String { + use number_prefix::NumberPrefix; + match NumberPrefix::decimal(n as f64) { + NumberPrefix::Standalone(n) => format!("{n}"), + NumberPrefix::Prefixed(prefix, n) => { + // Map SI prefixes to short single-char suffixes + let suffix = match prefix { + number_prefix::Prefix::Kilo => "K", + number_prefix::Prefix::Mega => "M", + number_prefix::Prefix::Giga => "G", + number_prefix::Prefix::Tera => "T", + _ => return format!("{:.1}{prefix:?}", n), + }; + format!("{n:.1}{suffix}") + } + } +} + +/// Format a percentage string (e.g. "(83.3%)"). 
+pub fn format_pct(n: u64, total: u64) -> String { + if total == 0 { + return "(0.0%)".to_string(); + } + format!("({:.1}%)", n as f64 / total as f64 * 100.0) +} + +/// Format a duration as human-friendly mm:ss or h:mm:ss. +/// +/// - Under 60s: `"45.2s"` +/// - Under 1h: `"1:23"` +/// - Over 1h: `"1:02:34"` +pub fn format_duration(d: Duration) -> String { + let total_secs = d.as_secs_f64(); + if total_secs < 60.0 { + return format!("{total_secs:.1}s"); + } + let total_secs = d.as_secs(); + let hours = total_secs / 3600; + let minutes = (total_secs % 3600) / 60; + let seconds = total_secs % 60; + if hours > 0 { + format!("{hours}:{minutes:02}:{seconds:02}") + } else { + format!("{minutes}:{seconds:02}") + } +} + // ============================================================ // Numeric helpers // ============================================================ @@ -181,6 +232,60 @@ mod tests { assert_eq!(format_with_commas(1234567), "1,234,567"); } + #[test] + fn test_format_count_small() { + assert_eq!(format_count(0), "0"); + assert_eq!(format_count(42), "42"); + assert_eq!(format_count(999), "999"); + } + + #[test] + fn test_format_count_thousands() { + assert_eq!(format_count(1000), "1.0K"); + assert_eq!(format_count(1500), "1.5K"); + assert_eq!(format_count(50000), "50.0K"); + } + + #[test] + fn test_format_count_millions() { + assert_eq!(format_count(1_000_000), "1.0M"); + assert_eq!(format_count(48_200_000), "48.2M"); + assert_eq!(format_count(50_000_000), "50.0M"); + } + + #[test] + fn test_format_count_billions() { + assert_eq!(format_count(1_000_000_000), "1.0G"); + assert_eq!(format_count(5_000_000_000), "5.0G"); + } + + #[test] + fn test_format_pct() { + assert_eq!(format_pct(833, 1000), "(83.3%)"); + assert_eq!(format_pct(0, 0), "(0.0%)"); + assert_eq!(format_pct(1000, 1000), "(100.0%)"); + } + + #[test] + fn test_format_duration_seconds() { + assert_eq!(format_duration(Duration::from_secs_f64(0.5)), "0.5s"); + 
assert_eq!(format_duration(Duration::from_secs_f64(45.2)), "45.2s"); + assert_eq!(format_duration(Duration::from_secs_f64(59.9)), "59.9s"); + } + + #[test] + fn test_format_duration_minutes() { + assert_eq!(format_duration(Duration::from_secs(60)), "1:00"); + assert_eq!(format_duration(Duration::from_secs(83)), "1:23"); + assert_eq!(format_duration(Duration::from_secs(3599)), "59:59"); + } + + #[test] + fn test_format_duration_hours() { + assert_eq!(format_duration(Duration::from_secs(3600)), "1:00:00"); + assert_eq!(format_duration(Duration::from_secs(3754)), "1:02:34"); + } + #[test] fn test_open_reader_plain() { let content = "line1\nline2\nline3\n"; diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..9a228ca --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,99 @@ +//! RustQC — fast quality control tools for sequencing data. +//! +//! RustQC is primarily a CLI (`rustqc rna ...`) that runs a single-pass +//! RNA-Seq QC pipeline (dupRadar, featureCounts, 8 RSeQC tools, Qualimap, +//! preseq, samtools-style outputs). The same analysis modules are also +//! exposed as a library so they can be embedded into other Rust programs. +//! +//! # Adding RustQC as a dependency +//! +//! ```toml +//! [dependencies] +//! rustqc = "0.2" +//! ``` +//! +//! The library pulls in `rust-htslib` (linked statically), `plotters`, and +//! a small C++ component used by the preseq tool (built via `build.rs`), +//! so a working C/C++ toolchain is required at build time. +//! +//! # Modules +//! +//! - [`gtf`] — GTF gene-annotation parsing into [`gtf::Gene`] / [`gtf::Transcript`] / [`gtf::Exon`]. +//! - [`io`] — shared I/O helpers (transparent gzip decompression, FNV-1a, number formatting). +//! - [`config`] — configuration types that mirror the CLI's YAML config file. +//! - [`summary`] — serializable types for the JSON run summary. +//! - [`cpu`] — CPU feature detection and binary-target identification. +//! - [`rna`] — the RNA-Seq analysis modules: +//! 
- [`rna::dupradar`], [`rna::featurecounts`], [`rna::qualimap`], +//! [`rna::preseq`], [`rna::rseqc`]. +//! +//! [`Strandedness`] lives at the crate root because it is used across most +//! analysis modules. +//! +//! # Stability +//! +//! The library is at `0.2.x` and the public surface is intentionally small +//! at this stage. Expect breaking changes in minor releases until `1.0`. +//! The full single-pass RNA-Seq pipeline (the `run_rna` orchestrator that +//! the binary uses) is not yet exposed as a library entry point — for now +//! library consumers drive individual analyses themselves. Pipeline-level +//! orchestration may be exposed in a future release; see issue +//! [#72](https://github.com/seqeralabs/RustQC/issues/72). +//! +//! # Examples +//! +//! Parse a GTF file and inspect the first gene: +//! +//! ```no_run +//! use rustqc::gtf; +//! +//! let genes = gtf::parse_gtf("genes.gtf", &[]).unwrap(); +//! if let Some((gene_id, gene)) = genes.iter().next() { +//! println!("{gene_id}: {} transcripts", gene.transcripts.len()); +//! } +//! ``` +//! +//! Use the [`Strandedness`] enum (also accepted by `serde` for YAML configs): +//! +//! ``` +//! use rustqc::Strandedness; +//! +//! let s = Strandedness::Reverse; +//! assert_eq!(s.to_string(), "reverse"); +//! ``` + +use clap::ValueEnum; +use serde::Deserialize; + +pub mod config; +pub mod cpu; +pub mod gtf; +pub mod io; +pub mod rna; +pub mod summary; + +/// Library strandedness protocol. +/// +/// Determines how read strand is interpreted relative to the gene annotation +/// strand during counting. Accepted CLI values: `unstranded`, `forward`, `reverse`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, ValueEnum, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum Strandedness { + /// Count reads on either strand (library is not strand-specific). + #[default] + Unstranded, + /// Forward stranded: read 1 maps to the transcript strand. 
+ Forward, + /// Reverse stranded: read 2 maps to the transcript strand (e.g. dUTP). + Reverse, +} + +impl std::fmt::Display for Strandedness { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Strandedness::Unstranded => write!(f, "unstranded"), + Strandedness::Forward => write!(f, "forward"), + Strandedness::Reverse => write!(f, "reverse"), + } + } +} diff --git a/src/main.rs b/src/main.rs index d78152b..4c66c17 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,12 +11,6 @@ mod citations; mod cli; -mod config; -mod cpu; -mod gtf; -mod io; -mod rna; -mod summary; mod ui; use anyhow::{ensure, Context, Result}; @@ -27,7 +21,10 @@ use std::collections::{HashMap, HashSet}; use std::path::Path; use std::time::{Instant, SystemTime, UNIX_EPOCH}; -use ui::{format_count, format_duration, format_pct, Ui, Verbosity}; +use rustqc::io::{format_count, format_duration, format_pct}; +use rustqc::{config, cpu, gtf, rna, summary}; + +use ui::{Ui, Verbosity}; use rust_htslib::bam::Read as BamRead; @@ -310,7 +307,7 @@ fn run_rna(args: cli::RnaArgs, ui: &Ui) -> Result<()> { let effective_stranded = args .stranded .or(config.stranded) - .unwrap_or(cli::Strandedness::Unstranded); + .unwrap_or(rustqc::Strandedness::Unstranded); let effective_paired = args.paired || config.paired.unwrap_or(false); if n_bams == 1 { @@ -749,7 +746,7 @@ struct SharedParams<'a> { /// Terminal UI handle. ui: &'a Ui, /// Library strandedness. - stranded: cli::Strandedness, + stranded: rustqc::Strandedness, /// Whether the library is paired-end. paired: bool, /// Alignment-to-GTF chromosome name mapping. diff --git a/src/rna/dupradar/counting.rs b/src/rna/dupradar/counting.rs index ff165dd..785115c 100644 --- a/src/rna/dupradar/counting.rs +++ b/src/rna/dupradar/counting.rs @@ -9,11 +9,11 @@ //! //! This implements a simplified featureCounts-compatible counting strategy. 
-use crate::cli::Strandedness; use crate::gtf::Gene; +use crate::io::format_count; use crate::rna::qualimap::QualimapAccum; use crate::rna::rseqc::accumulators::{RseqcAccumulators, RseqcAnnotations, RseqcConfig}; -use crate::ui::format_count; +use crate::Strandedness; use anyhow::{Context, Result}; use coitrees::{COITree, Interval, IntervalTree}; use indexmap::IndexMap; @@ -1273,11 +1273,11 @@ pub fn count_reads( // with BGZF I/O. When total threads exceed num_workers the extra // threads are distributed evenly; when threads == num_workers every // worker still gets 1 dedicated decompression thread. - let htslib_threads = if num_workers > 0 { - ((threads.saturating_sub(num_workers)) / num_workers).max(1) - } else { - 0 - }; + let htslib_threads = threads + .saturating_sub(num_workers) + .checked_div(num_workers) + .map(|n| n.max(1)) + .unwrap_or(0); // Process chromosome batches in parallel let results: Vec)>> = pool.install(|| { diff --git a/src/rna/qualimap/accumulator.rs b/src/rna/qualimap/accumulator.rs index bbecbbf..f8e82fd 100644 --- a/src/rna/qualimap/accumulator.rs +++ b/src/rna/qualimap/accumulator.rs @@ -13,7 +13,7 @@ use coitrees::IntervalTree; use rust_htslib::bam; use rust_htslib::bam::record::Cigar; -use crate::cli::Strandedness; +use crate::Strandedness; use super::coverage::TranscriptCoverage; use super::index::QualimapIndex; diff --git a/src/rna/qualimap/index.rs b/src/rna/qualimap/index.rs index 36ae11c..3e164f8 100644 --- a/src/rna/qualimap/index.rs +++ b/src/rna/qualimap/index.rs @@ -322,7 +322,7 @@ impl QualimapIndex { // work (interval tree, intron gaps) is done. 
let coverage_exons = if tx.strand == '-' { let mut desc = exons_0based.clone(); - desc.sort_unstable_by(|a, b| b.0.cmp(&a.0)); + desc.sort_unstable_by_key(|e| std::cmp::Reverse(e.0)); desc } else { exons_0based diff --git a/src/rna/qualimap/output.rs b/src/rna/qualimap/output.rs index 9081eb4..5d53082 100644 --- a/src/rna/qualimap/output.rs +++ b/src/rna/qualimap/output.rs @@ -13,7 +13,7 @@ use super::coverage::TranscriptCoverage; use super::index::QualimapIndex; use super::plots; use super::QualimapResult; -use crate::cli::Strandedness; +use crate::Strandedness; // ============================= Constants ======================================= diff --git a/src/rna/qualimap/report.rs b/src/rna/qualimap/report.rs index 284aa7e..bf0b488 100644 --- a/src/rna/qualimap/report.rs +++ b/src/rna/qualimap/report.rs @@ -9,7 +9,7 @@ use std::path::Path; use anyhow::{Context, Result}; -use crate::cli::Strandedness; +use crate::Strandedness; use log::debug; // =================================================================== diff --git a/src/rna/rseqc/accumulators.rs b/src/rna/rseqc/accumulators.rs index 0692ba9..b91a409 100644 --- a/src/rna/rseqc/accumulators.rs +++ b/src/rna/rseqc/accumulators.rs @@ -781,13 +781,12 @@ impl BamStatAccum { // GC content: cumulative step function with ngc=200 bins. // Matches samtools stats.c:925-941. For a read with gc_count G/C // bases out of read_len total, increment bins gc_idx_min..gc_idx_max. 
- if read_len > 0 { - let ngc: usize = 200; - let gc_idx_min = gc_count as usize * (ngc - 1) / read_len; - let mut gc_idx_max = (gc_count as usize + 1) * (ngc - 1) / read_len; - if gc_idx_max >= ngc { - gc_idx_max = ngc - 1; - } + let ngc: usize = 200; + if let (Some(gc_idx_min), Some(gc_idx_max)) = ( + (gc_count as usize * (ngc - 1)).checked_div(read_len), + ((gc_count as usize + 1) * (ngc - 1)).checked_div(read_len), + ) { + let gc_idx_max = gc_idx_max.min(ngc - 1); for item in gc_arr.iter_mut().take(gc_idx_max).skip(gc_idx_min) { *item += 1; } diff --git a/src/rna/rseqc/infer_experiment.rs b/src/rna/rseqc/infer_experiment.rs index 4153b8a..bb25788 100644 --- a/src/rna/rseqc/infer_experiment.rs +++ b/src/rna/rseqc/infer_experiment.rs @@ -4,8 +4,8 @@ //! gene models (from GTF annotation) and determines the fraction consistent with //! each strand protocol. -use crate::cli::Strandedness; use crate::gtf::Gene; +use crate::Strandedness; use anyhow::{Context, Result}; use indexmap::IndexMap; use log::debug; diff --git a/src/rna/rseqc/read_distribution.rs b/src/rna/rseqc/read_distribution.rs index f6e2283..0973f63 100644 --- a/src/rna/rseqc/read_distribution.rs +++ b/src/rna/rseqc/read_distribution.rs @@ -204,17 +204,13 @@ pub fn build_regions_from_genes(genes: &IndexMap) -> RegionSets { // 5' UTR: exon portions before CDS (strand-aware) for (&es, &ee) in exon_starts.iter().zip(exon_ends.iter()) { match strand { - '+' => { - if es < cds_start { - let e = ee.min(cds_start); - regions.utr_5.entry(chrom.clone()).or_default().add(es, e); - } + '+' if es < cds_start => { + let e = ee.min(cds_start); + regions.utr_5.entry(chrom.clone()).or_default().add(es, e); } - '-' => { - if ee > cds_end { - let s = es.max(cds_end); - regions.utr_5.entry(chrom.clone()).or_default().add(s, ee); - } + '-' if ee > cds_end => { + let s = es.max(cds_end); + regions.utr_5.entry(chrom.clone()).or_default().add(s, ee); } _ => {} } @@ -223,17 +219,13 @@ pub fn build_regions_from_genes(genes: 
&IndexMap) -> RegionSets { // 3' UTR: exon portions after CDS (strand-aware) for (&es, &ee) in exon_starts.iter().zip(exon_ends.iter()) { match strand { - '+' => { - if ee > cds_end { - let s = es.max(cds_end); - regions.utr_3.entry(chrom.clone()).or_default().add(s, ee); - } + '+' if ee > cds_end => { + let s = es.max(cds_end); + regions.utr_3.entry(chrom.clone()).or_default().add(s, ee); } - '-' => { - if es < cds_start { - let e = ee.min(cds_start); - regions.utr_3.entry(chrom.clone()).or_default().add(es, e); - } + '-' if es < cds_start => { + let e = ee.min(cds_start); + regions.utr_3.entry(chrom.clone()).or_default().add(es, e); } _ => {} } diff --git a/src/rna/rseqc/tin.rs b/src/rna/rseqc/tin.rs index 341c648..4ce2dda 100644 --- a/src/rna/rseqc/tin.rs +++ b/src/rna/rseqc/tin.rs @@ -127,6 +127,11 @@ impl TinResults { pub fn len(&self) -> usize { self.transcripts.len() } + + /// Whether there are no transcripts with computed TIN scores. + pub fn is_empty(&self) -> bool { + self.transcripts.is_empty() + } } // =================================================================== @@ -331,7 +336,7 @@ pub struct TinAccum { /// Per-transcript unique read start positions, capped at `min_cov + 1`. /// Once exceeded, the set is drained and `exceeded_threshold[tx_idx]` /// is set instead. - pub unique_starts: Vec>, + pub(crate) unique_starts: Vec>, /// Per-transcript flag: true once unique start count exceeded `min_cov`. /// Avoids further HashSet inserts for high-coverage transcripts. 
pub exceeded_threshold: Vec, diff --git a/src/ui.rs b/src/ui.rs index c07be01..fb92edd 100644 --- a/src/ui.rs +++ b/src/ui.rs @@ -8,6 +8,8 @@ use console::Style; use indicatif::{ProgressBar, ProgressStyle}; use std::time::Duration; +use rustqc::io::{format_count, format_duration}; + // ============================================================================ // Verbosity // ============================================================================ @@ -444,113 +446,3 @@ fn format_summary_row( content } } - -/// Format a count with SI prefix (e.g. 48200000 → "48.2M"). -/// -/// Values below 1000 are shown as-is. Values above use K/M/G/T suffixes -/// with one decimal place. -pub fn format_count(n: u64) -> String { - use number_prefix::NumberPrefix; - match NumberPrefix::decimal(n as f64) { - NumberPrefix::Standalone(n) => format!("{n}"), - NumberPrefix::Prefixed(prefix, n) => { - // Map SI prefixes to short single-char suffixes - let suffix = match prefix { - number_prefix::Prefix::Kilo => "K", - number_prefix::Prefix::Mega => "M", - number_prefix::Prefix::Giga => "G", - number_prefix::Prefix::Tera => "T", - _ => return format!("{:.1}{prefix:?}", n), - }; - format!("{n:.1}{suffix}") - } - } -} - -/// Format a percentage string (e.g. "83.3%"). -pub fn format_pct(n: u64, total: u64) -> String { - if total == 0 { - return "(0.0%)".to_string(); - } - format!("({:.1}%)", n as f64 / total as f64 * 100.0) -} - -/// Format a duration as human-friendly mm:ss or h:mm:ss. 
-/// -/// - Under 60s: "45.2s" -/// - Under 1h: "1:23" -/// - Over 1h: "1:02:34" -pub fn format_duration(d: Duration) -> String { - let total_secs = d.as_secs_f64(); - if total_secs < 60.0 { - return format!("{total_secs:.1}s"); - } - let total_secs = d.as_secs(); - let hours = total_secs / 3600; - let minutes = (total_secs % 3600) / 60; - let seconds = total_secs % 60; - if hours > 0 { - format!("{hours}:{minutes:02}:{seconds:02}") - } else { - format!("{minutes}:{seconds:02}") - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_format_count_small() { - assert_eq!(format_count(0), "0"); - assert_eq!(format_count(42), "42"); - assert_eq!(format_count(999), "999"); - } - - #[test] - fn test_format_count_thousands() { - assert_eq!(format_count(1000), "1.0K"); - assert_eq!(format_count(1500), "1.5K"); - assert_eq!(format_count(50000), "50.0K"); - } - - #[test] - fn test_format_count_millions() { - assert_eq!(format_count(1_000_000), "1.0M"); - assert_eq!(format_count(48_200_000), "48.2M"); - assert_eq!(format_count(50_000_000), "50.0M"); - } - - #[test] - fn test_format_count_billions() { - assert_eq!(format_count(1_000_000_000), "1.0G"); - assert_eq!(format_count(5_000_000_000), "5.0G"); - } - - #[test] - fn test_format_pct() { - assert_eq!(format_pct(833, 1000), "(83.3%)"); - assert_eq!(format_pct(0, 0), "(0.0%)"); - assert_eq!(format_pct(1000, 1000), "(100.0%)"); - } - - #[test] - fn test_format_duration_seconds() { - assert_eq!(format_duration(Duration::from_secs_f64(0.5)), "0.5s"); - assert_eq!(format_duration(Duration::from_secs_f64(45.2)), "45.2s"); - assert_eq!(format_duration(Duration::from_secs_f64(59.9)), "59.9s"); - } - - #[test] - fn test_format_duration_minutes() { - assert_eq!(format_duration(Duration::from_secs(60)), "1:00"); - assert_eq!(format_duration(Duration::from_secs(83)), "1:23"); - assert_eq!(format_duration(Duration::from_secs(3599)), "59:59"); - } - - #[test] - fn test_format_duration_hours() { - 
assert_eq!(format_duration(Duration::from_secs(3600)), "1:00:00"); - assert_eq!(format_duration(Duration::from_secs(3754)), "1:02:34"); - } -}