From b1aac9392ae01e173c32abbe04086961d6830931 Mon Sep 17 00:00:00 2001 From: Jeff Jaureguy <67065808+Jaureguy760@users.noreply.github.com> Date: Sun, 15 Mar 2026 00:38:36 -0700 Subject: [PATCH] fix: revert GTF from count-variants-sc, harden Dockerfile, fix docs - Revert GTF/GFF3 support from count-variants-sc (sc commands are scATAC-only; gene annotation is a downstream ArchR/Signac step) - Bulk count-variants retains full GTF support - Dockerfile: tini PID 1, g++ purge assertion, wasp2-ipscore check - Smoke test: add wasp2-ipscore, fix sample name case - Docs: rewrite counting/mapping/analysis/installation pages - Remove stale --min-count footnote from analysis.rst Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/ci.yml | 4 +- .gitignore | 29 +- Dockerfile | 10 +- Makefile | 6 +- README.md | 110 ++-- docs/CONTAINER_USAGE.md | 267 ++-------- docs/source/index.rst | 1 + docs/source/installation.rst | 85 ++- docs/source/methods/mapping_filter.rst | 7 +- docs/source/tutorials/quickstart_mapping.rst | 12 +- docs/source/tutorials/scrna_seq.rst | 4 +- docs/source/user_guide/analysis.rst | 526 ++----------------- docs/source/user_guide/counting.rst | 222 +++----- docs/source/user_guide/ipscore.rst | 53 ++ docs/source/user_guide/mapping.rst | 257 ++++----- pixi.toml | 1 + pyproject.toml | 2 +- rust/Cargo.lock | 1 + rust/Cargo.toml | 3 + rust/build.rs | 4 + scripts/container_smoke_test.sh | 11 +- src/counting/__main__.py | 9 +- src/counting/count_alleles_sc.py | 59 ++- src/counting/parse_gene_data.py | 43 +- src/counting/run_counting_sc.py | 50 +- src/mapping/intersect_variant_data.py | 11 +- src/mapping/remap_utils.py | 12 +- 27 files changed, 582 insertions(+), 1217 deletions(-) create mode 100644 docs/source/user_guide/ipscore.rst create mode 100644 rust/build.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3518670..2e37711 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -146,8 +146,8 @@ jobs: - name: Run cargo test working-directory: rust - run: cargo test + run: PYO3_PYTHON=$(command -v python3) cargo test - name: Run clippy working-directory: rust - run: cargo clippy -- -W warnings + run: PYO3_PYTHON=$(command -v python3) cargo clippy -- -W warnings diff --git a/.gitignore b/.gitignore index 058d27f..9418dda 100644 --- a/.gitignore +++ b/.gitignore @@ -154,6 +154,11 @@ benchmark_figures/ # Sanity test data (downloaded from GitHub releases) tests/sanity/data/ +# Real data test files (downloaded from 1000 Genomes, ~2-3 GB) +tests/real_data/data/ +tests/real_data/samplesheets/ +tests/real_data/configs/ + # Nextflow runtime .nextflow/ .nextflow.log* @@ -175,7 +180,29 @@ test-output/ results_stub/ pipelines/*/test-output/ pipelines/*/results_stub/ +pipelines/*/results_*/ +pipelines/*/artifacts/ + +# Artifacts directory +artifacts/ + +# Benchmark infrastructure (large data/envs/results) +test_benchmarks/ + +# Claude Code local state +.claude/ + +# Nextflow pipeline-level logs +pipelines/*/.nextflow.log* +pipelines/*/.nf-test.log + +# Nextflow reports and visualizations +trace.txt +timeline.html +report.html +dag.svg +dag.dot # Claude Code memory files (per-directory) **/CLAUDE.md -!./CLAUDE.md +!/CLAUDE.md diff --git a/Dockerfile b/Dockerfile index 0ade5c9..d3a4821 100644 --- a/Dockerfile +++ b/Dockerfile @@ -82,6 +82,8 @@ LABEL maintainer="Jeff Jaureguy " # Install runtime deps + temporary build deps for pybedtools C++ extension # Combined into one RUN to minimize layers; build tools purged at the end RUN apt-get update && apt-get install -y --no-install-recommends \ + # PID 1 init for proper signal handling (Nextflow/HPC) + tini \ # Bioinformatics tools samtools \ bcftools \ @@ -106,14 +108,16 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install /tmp/*.whl \ && rm -rf /tmp/*.whl \ && apt-get purge -y --auto-remove g++ zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + && ! command -v g++ WORKDIR /app # Verify non-Python tools are available (Python tools skipped during build # because Polars uses AVX2 instructions that fail under QEMU emulation # on ARM64 CI runners building linux/amd64 images) -RUN samtools --version && bcftools --version && bedtools --version +RUN samtools --version && bcftools --version && bedtools --version \ + && wasp2-ipscore --help > /dev/null 2>&1 # Create non-root user for security RUN groupadd -g 1000 wasp2 && \ @@ -147,5 +151,7 @@ WORKDIR /data HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ CMD wasp2-count --version || exit 1 +ENTRYPOINT ["tini", "--"] + # Default command CMD ["wasp2-count", "--help"] diff --git a/Makefile b/Makefile index d99addc..8e67b2a 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ .PHONY: all build install test test-quick test-sanity lint format clean help .PHONY: download-sanity-data sanity-data-local rust-build rust-test +.PHONY: test-mapping-parity # Configuration PYTHON ?= python @@ -48,7 +49,7 @@ rust-dev: ## Build Rust extension in debug mode (faster compile) $(MATURIN) develop -m $(RUST_DIR)/Cargo.toml rust-test: ## Run Rust unit tests - cd $(RUST_DIR) && $(CARGO) test + cd $(RUST_DIR) && PYO3_PYTHON=$$($(PYTHON) -c "import sys; print(sys.executable)") $(CARGO) test rust-bench: ## Run Rust benchmarks cd $(RUST_DIR) && $(CARGO) bench @@ -68,6 +69,9 @@ test-quick: ## Run quick validation tests only test-rust: ## Run Rust-specific tests $(PYTEST) $(TESTS_DIR) -v --tb=short -m "rust" +test-mapping-parity: ## Run mapping parity tests against legacy and unified paths + $(PYTEST) $(TESTS_DIR)/regression/test_mapping_stage_parity.py -v --tb=short + test-integration: ## Run integration tests $(PYTEST) $(TESTS_DIR) -v --tb=short -m "integration" diff --git a/README.md b/README.md index 29b7667..d89e29a 100644 --- a/README.md +++ b/README.md @@ -15,114 +15,100 @@ Documentation - - License - -

- -

- Documentation • - McVicker Lab • - Original WASP

---- - ## Installation -### Recommended: Bioconda +### Bioconda ```bash mamba install -c conda-forge -c bioconda wasp2 ``` -Installs WASP2 and all dependencies (samtools, bcftools, bedtools, htslib) automatically. Available for Linux (x86_64, aarch64) and macOS (Intel, Apple Silicon). Requires [miniforge](https://github.com/conda-forge/miniforge). - -### Via PyPI +### PyPI ```bash pip install wasp2 ``` -Pre-built wheels for Linux (x86_64, aarch64) and macOS (Intel, Apple Silicon) with Python 3.10-3.13. The Rust extension and htslib are bundled in the wheel. Requires samtools, bcftools, and bedtools installed separately. +The PyPI package does not install external tools such as `samtools`, +`bcftools`, or `bedtools`; install those separately. -### For development - -```bash -git clone https://github.com/mcvickerlab/WASP2.git -cd WASP2 -pixi install # resolves all dependencies including Rust toolchain -pixi run verify # build + test -``` - -### Via Docker +### Docker ```bash docker pull ghcr.io/mcvickerlab/wasp2:1.4.0 -docker run --rm -v $PWD:/data ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help +docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help ``` -Multi-platform image (linux/amd64 + linux/arm64) with all dependencies included. - -### Via Singularity/Apptainer (HPC) +### Singularity/Apptainer ```bash singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0 singularity exec wasp2.sif wasp2-count --help ``` -### Reproducible Environment (conda-lock) +## CLI Tools -For fully pinned, reproducible installs (HPC clusters, CI, shared lab environments): +WASP2 installs four command-line entry points: -```bash -# Recommended: mamba (fastest) -mamba create -n WASP2 --file conda-lock.yml +- `wasp2-map` +- `wasp2-count` +- `wasp2-analyze` +- `wasp2-ipscore` -# Or with conda -conda-lock install -n WASP2 conda-lock.yml -``` +## Quick Start -`conda-lock.yml` pins every package to exact versions with checksums for `linux-64` and `osx-64`. To regenerate after updating `environment.lock.yml`: +### 1. Correct mapping bias ```bash -conda-lock lock -f environment.lock.yml --lockfile conda-lock.yml -``` - -See the [documentation](https://mcvickerlab.github.io/WASP2/) for detailed install options and development setup. +wasp2-map make-reads input.bam variants.vcf.gz -s sample1 -o remap_dir -## Quick Start +# Realign remap_dir/*_swapped_alleles_r1.fq and r2.fq with the same aligner +# and settings used for the original BAM, then: -WASP2 has three steps that run in order: +wasp2-map filter-remapped remapped.bam \ + -j remap_dir/input_wasp_data_files.json \ + -o filtered.bam +``` -**Step 1: Remap reads** to correct mapping bias +### 2. Count alleles ```bash -wasp2-map make-reads input.bam variants.vcf.gz -s sample1 -o remap_dir/ -# Realign the swapped-allele reads with your aligner, then: -wasp2-map filter-remapped remapped.bam -j remap_dir/sample1_wasp_data_files.json -o filtered.bam +wasp2-count count-variants filtered.bam variants.vcf.gz -s sample1 -o counts.tsv ``` -**Step 2: Count alleles** at heterozygous SNPs +### 3. Test for imbalance ```bash -wasp2-count count-variants filtered.bam variants.vcf.gz -s sample1 +wasp2-analyze find-imbalance counts.tsv -o ai_results.tsv ``` -**Step 3: Test for allelic imbalance** +## Single-Cell Example ```bash -wasp2-analyze find-imbalance counts.tsv -o results.tsv +wasp2-count count-variants-sc \ + cellranger.bam \ + variants.vcf.gz \ + barcodes.tsv \ + --samples sample1 \ + --feature genes.gtf \ + --out_file allele_counts.h5ad + +wasp2-analyze find-imbalance-sc \ + allele_counts.h5ad \ + barcode_groups.tsv \ + --sample sample1 \ + -o ai_results.tsv ``` -See the [documentation](https://mcvickerlab.github.io/WASP2/) for detailed usage, single-cell workflows, and supported variant formats (VCF, BCF, PGEN). - -## Authors - -- **Aaron Ho** — Creator of WASP2 -- **Jeff Jaureguy** — Developer and maintainer -- **[McVicker Lab](https://mcvicker.salk.edu/)**, Salk Institute +## iPSCORE Utilities -## Citation +```bash +wasp2-ipscore inventory --output inventory.tsv +wasp2-ipscore manifest --output manifest.csv +wasp2-ipscore validate +``` -If you use WASP2 in your research, please cite our paper (coming soon). +See the [documentation](https://mcvickerlab.github.io/WASP2/) for complete +usage, tutorials, and API details. diff --git a/docs/CONTAINER_USAGE.md b/docs/CONTAINER_USAGE.md index 8dd77e7..6bda535 100644 --- a/docs/CONTAINER_USAGE.md +++ b/docs/CONTAINER_USAGE.md @@ -1,262 +1,81 @@ -# WASP2 Container Usage Guide +# WASP2 Container Usage -This guide covers how to use WASP2 containers for local development, HPC clusters, and Nextflow pipelines. +## Verified Docker Flow -## Container Registries - -WASP2 images are available from: - -| Registry | Image | Pull Command | -|----------|-------|--------------| -| **DockerHub** | `mcvickerlab/wasp2` | `docker pull mcvickerlab/wasp2:latest` | -| **GitHub Container Registry** | `ghcr.io/mcvickerlab/wasp2` | `docker pull ghcr.io/mcvickerlab/wasp2:latest` | - -### Available Tags - -- `:latest` - Most recent release -- `:1.3.0` - Specific version -- `:1.3` - Minor version (tracks patches) -- `:main` - Development builds from main branch - -## Docker Usage - -### Pull and Run - -```bash -# Pull the image -docker pull mcvickerlab/wasp2:latest - -# Run WASP2 commands -docker run --rm mcvickerlab/wasp2 wasp2-count --help -docker run --rm mcvickerlab/wasp2 wasp2-map --help -docker run --rm mcvickerlab/wasp2 wasp2-analyze --help - -# Process files (mount local directory) -docker run --rm -v $(pwd):/data mcvickerlab/wasp2 \ - wasp2-count /data/sample.bam /data/variants.vcf.gz -o /data/counts.tsv -``` - -### Interactive Shell +The Docker image validated for this update is: ```bash -docker run -it --rm -v $(pwd):/data mcvickerlab/wasp2 /bin/bash +ghcr.io/mcvickerlab/wasp2:1.4.0 ``` -## Singularity/Apptainer Usage (HPC) - -### Pull from Docker Registry +Pull and inspect the available CLI tools: ```bash -# Pull and convert to SIF -singularity pull wasp2.sif docker://mcvickerlab/wasp2:latest +docker pull ghcr.io/mcvickerlab/wasp2:1.4.0 -# Or from GHCR -singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:latest +docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help +docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-map --help +docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-analyze --help +docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-ipscore --help ``` -### Build from Definition File +Mount local data when running workflows: ```bash -# Clone the repository -git clone https://github.com/mcvickerlab/WASP2.git -cd WASP2 - -# Build the container -singularity build wasp2.sif Singularity.def +docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \ + wasp2-count count-variants /data/sample.bam /data/variants.vcf.gz -o /data/counts.tsv ``` -### Run Commands +## Intended Singularity / Apptainer Flow -```bash -# Run WASP2 commands -singularity exec wasp2.sif wasp2-count --help - -# Process files (current directory is auto-bound) -singularity exec wasp2.sif wasp2-count sample.bam variants.vcf.gz -o counts.tsv - -# With explicit bindings -singularity exec --bind /scratch:/scratch wasp2.sif \ - wasp2-map make-reads /scratch/input.bam /scratch/variants.vcf -``` - -### SLURM Job Script Example +For HPC environments using SIF images: ```bash -#!/bin/bash -#SBATCH --job-name=wasp2 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=32G -#SBATCH --time=4:00:00 - -module load singularity - -CONTAINER=/path/to/wasp2.sif - -# Count variants (wasp2-count does not have --threads option) -singularity exec ${CONTAINER} wasp2-count \ - input.bam \ - variants.vcf.gz \ - -o counts.tsv - -# WASP mapping filter (supports --threads) -singularity exec ${CONTAINER} wasp2-map make-reads \ - input.bam \ - variants.vcf.gz \ - --threads ${SLURM_CPUS_PER_TASK} \ - --out_dir ./wasp_output -``` - -## Nextflow Integration - -### Configuration - -Add to your `nextflow.config`: - -```groovy -profiles { - docker { - docker.enabled = true - docker.runOptions = '-u $(id -u):$(id -g)' - } - singularity { - singularity.enabled = true - singularity.autoMounts = true - singularity.cacheDir = "${HOME}/.singularity/cache" - } -} - -process { - withLabel: 'wasp2' { - container = 'mcvickerlab/wasp2:latest' - } -} -``` - -### Running Pipelines - -```bash -# With Docker -nextflow run main.nf -profile docker - -# With Singularity -nextflow run main.nf -profile singularity -``` - -## Building Locally - -### Docker Build - -```bash -# Clone repository -git clone https://github.com/mcvickerlab/WASP2.git -cd WASP2 - -# Build image -docker build -t wasp2:local . - -# Test the build -docker run --rm wasp2:local wasp2-count --version -docker run --rm wasp2:local python -c "import wasp2_rust; print('OK')" +singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0 +singularity exec wasp2.sif wasp2-count --help ``` -### Manual Build (for maintainers) - -Note: Currently only `linux/amd64` is supported. +or: ```bash -# Set up buildx -docker buildx create --name wasp2builder --use - -# Build with version argument -docker buildx build \ - --platform linux/amd64 \ - --build-arg VERSION=1.3.0 \ - -t mcvickerlab/wasp2:1.3.0 \ - -t mcvickerlab/wasp2:latest \ - --push . +apptainer pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0 +apptainer exec wasp2.sif wasp2-count --help ``` -## Container Contents - -The WASP2 container includes: +These commands are the intended container path, but they were not executed in +this development environment because neither `singularity` nor `apptainer` was +installed locally. -### Python Environment -- Python 3.10+ (container uses 3.11) -- wasp2 package with Rust extension -- Core: pysam, pandas (<2.0), numpy, scipy, polars -- CLI: typer, rich -- Single-cell: anndata, scanpy (optional) - -### Rust Components -- Pre-built `wasp2_rust` Python extension -- Compiled with release optimizations - -### CLI Tools - -Each tool has subcommands for different analysis modes: - -- **`wasp2-count`** - Allele counting - - `count-variants` - Bulk allele counting at heterozygous sites (default) - - `count-variants-sc` - Single-cell allele counting - -- **`wasp2-map`** - WASP mapping filter - - `make-reads` - Generate reads with swapped alleles for remapping - - `filter-remapped` - Filter remapped reads using WASP algorithm - -- **`wasp2-analyze`** - Statistical analysis - - `find-imbalance` - Calculate allelic imbalance - - `find-imbalance-sc` - Single-cell allelic imbalance analysis - - `compare-imbalance` - Compare imbalance between cell types/groups - -### Bioinformatics Tools -- samtools -- bcftools -- bedtools -- tabix - -## Troubleshooting - -### Permission Issues (Docker) +## Mapping Example ```bash -# Run as current user -docker run --rm -u $(id -u):$(id -g) -v $(pwd):/data mcvickerlab/wasp2 ... +docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \ + wasp2-map make-reads /data/sample.bam /data/variants.vcf.gz \ + --samples sample1 \ + --out_dir /data/remap_dir ``` -### Cache Issues (Singularity) +After realigning the swapped FASTQ reads with your aligner of choice: ```bash -# Clear Singularity cache -singularity cache clean - -# Use different cache directory -export SINGULARITY_CACHEDIR=/scratch/singularity_cache +docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \ + wasp2-map filter-remapped /data/remapped.bam \ + --wasp_data_json /data/remap_dir/sample_wasp_data_files.json \ + --out_bam /data/filtered.bam ``` -### Verify Installation +## Counting Example ```bash -# Docker -docker run --rm mcvickerlab/wasp2 wasp2-count --version -docker run --rm mcvickerlab/wasp2 python -c "import wasp2_rust; print('Rust extension OK')" - -# Singularity -singularity exec wasp2.sif wasp2-count --version -singularity exec wasp2.sif python -c "import wasp2_rust; print('Rust extension OK')" +docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \ + wasp2-count count-variants /data/filtered.bam /data/variants.vcf.gz \ + --samples sample1 \ + --region /data/genes.gtf \ + --out_file /data/counts.tsv ``` -## GitHub Actions Secrets Setup - -To enable automated container builds, repository maintainers must configure: - -1. **DockerHub Secrets** (Settings → Secrets and variables → Actions): - - `DOCKERHUB_USERNAME`: Your DockerHub username - - `DOCKERHUB_TOKEN`: DockerHub access token (Account Settings → Security → Access Tokens) - -2. **GitHub Container Registry**: Uses `GITHUB_TOKEN` automatically (no setup needed) - -## Related Documentation +## Notes -- [Nextflow Pipelines](../pipelines/nf-atacseq/README.md) -- [WASP2 Ecosystem](WASP2_ECOSYSTEM.md) -- [GitHub Repository](https://github.com/mcvickerlab/WASP2) +- The image contains the WASP2 package plus `samtools`, `bcftools`, and `bedtools`. +- The documented public mapping workflow is `make-reads -> realign -> filter-remapped`. +- `wasp2-ipscore` is present in the container alongside the main analysis tools. diff --git a/docs/source/index.rst b/docs/source/index.rst index 1a5fc6a..ab712bd 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -60,6 +60,7 @@ Documentation user_guide/mapping user_guide/analysis user_guide/single_cell + user_guide/ipscore .. toctree:: :maxdepth: 2 diff --git a/docs/source/installation.rst b/docs/source/installation.rst index f039a97..51e9c15 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -1,25 +1,16 @@ Installation ============ -Via Bioconda (Recommended) --------------------------- - -`Bioconda `_ installs WASP2 and **all** dependencies -(samtools, bcftools, bedtools, htslib) in one command. Requires -`miniforge `_. +Via Bioconda +------------ .. code-block:: bash mamba install -c conda-forge -c bioconda wasp2 -Or with conda: - -.. code-block:: bash - - conda install -c conda-forge -c bioconda wasp2 - -Available for Linux (x86_64, aarch64) and macOS (Intel, Apple Silicon) with -Python 3.10-3.13. +Bioconda installs WASP2 together with the external command-line dependencies +required by the workflows, including ``samtools``, ``bcftools``, and +``bedtools``. Via PyPI -------- @@ -28,73 +19,58 @@ Via PyPI pip install wasp2 -Pre-built wheels include the Rust extension and bundled htslib for Linux -(x86_64, aarch64) and macOS (Intel, Apple Silicon) with Python 3.10-3.13. - -.. note:: - - The PyPI package does not include samtools, bcftools, or bedtools. - Install them separately before running WASP2: - - * On Ubuntu/Debian: ``sudo apt-get install bcftools bedtools samtools`` - * On macOS: ``brew install bcftools bedtools samtools`` - * Via conda: ``mamba install -c bioconda samtools bcftools bedtools`` +The PyPI wheel includes the WASP2 Python package and Rust extension, but it +does not install external tools such as ``samtools``, ``bcftools``, or +``bedtools``. Install those separately before running mapping or counting. Via Docker ---------- -WASP2 is available as a multi-platform Docker image (linux/amd64 + linux/arm64) -with all dependencies pre-installed: +The Docker image is the most reproducible fully bundled option available in +this environment. .. code-block:: bash docker pull ghcr.io/mcvickerlab/wasp2:1.4.0 - # Run a command - docker run --rm -v /path/to/data:/data ghcr.io/mcvickerlab/wasp2:1.4.0 \ - wasp2-count count-variants /data/sample.bam /data/variants.vcf - - # Interactive shell - docker run -it --rm -v /path/to/data:/data ghcr.io/mcvickerlab/wasp2:1.4.0 bash - -The image includes samtools, bcftools, bedtools, and the Rust-accelerated backend. + docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help + docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-map --help + docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-analyze --help + docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-ipscore --help Via Singularity/Apptainer ------------------------- -For HPC environments that don't support Docker: +For HPC environments that require SIF images: .. code-block:: bash singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0 singularity exec wasp2.sif wasp2-count --help -Development Installation ------------------------- - -For contributing or building from source: +or with Apptainer: .. code-block:: bash - git clone https://github.com/mcvickerlab/WASP2.git - cd WASP2 - pixi install # resolves all deps including Rust toolchain - pixi run verify # build + test suite in one step + apptainer pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0 + apptainer exec wasp2.sif wasp2-count --help -`pixi `_ resolves Python, Rust toolchain, samtools, bcftools, -bedtools, and htslib automatically. No system packages required. +.. note:: -Compiling pgenlib -~~~~~~~~~~~~~~~~~ + Docker was validated in this development environment. ``apptainer`` / + ``singularity`` binaries were not available locally during this doc update, + so those examples reflect the intended pull/exec workflow but were not + executed here. -WASP2 optionally uses `pgenlib `_ -for PLINK2 file I/O. This requires a C compiler: +Development Installation +------------------------ -* On Ubuntu/Debian: ``sudo apt-get install build-essential python3-dev`` -* On macOS: ``xcode-select --install`` -* On RHEL/Fedora: ``sudo dnf install gcc gcc-c++ python3-devel`` +.. code-block:: bash -pgenlib is installed automatically via pip when you install WASP2. + git clone https://github.com/mcvickerlab/WASP2.git + cd WASP2 + pixi install + pixi run verify Verification ------------ @@ -104,3 +80,4 @@ Verification wasp2-count --help wasp2-map --help wasp2-analyze --help + wasp2-ipscore --help diff --git a/docs/source/methods/mapping_filter.rst b/docs/source/methods/mapping_filter.rst index c3257f2..37b24d7 100644 --- a/docs/source/methods/mapping_filter.rst +++ b/docs/source/methods/mapping_filter.rst @@ -132,7 +132,10 @@ The WASP filter compares original and remapped positions: .. code-block:: bash - wasp2-map filter-remapped original_to_remap.bam remapped.bam output.bam + wasp2-map filter-remapped \ + remapped.bam \ + --wasp_data_json sample_wasp_data_files.json \ + --out_bam output.bam A read passes if: @@ -272,7 +275,7 @@ The typical WASP2 workflow: # Step 2: Create swapped reads wasp2-map make-reads sample.bam variants.vcf \ - --samples SAMPLE1 --out-dir wasp_temp/ + --samples SAMPLE1 --out_dir wasp_temp # Step 3: Remap swapped reads (SAME ALIGNER!) bwa mem -M genome.fa wasp_temp/swapped_r1.fq wasp_temp/swapped_r2.fq | \ diff --git a/docs/source/tutorials/quickstart_mapping.rst b/docs/source/tutorials/quickstart_mapping.rst index 57baffb..41317a0 100644 --- a/docs/source/tutorials/quickstart_mapping.rst +++ b/docs/source/tutorials/quickstart_mapping.rst @@ -68,7 +68,7 @@ Identify reads overlapping heterozygous SNPs and generate allele-swapped version wasp2-map make-reads sample.bam variants.vcf.gz \ --samples SAMPLE1 \ - --out-dir wasp_output/ + --out_dir wasp_output This produces (where ``sample`` is your BAM file prefix): @@ -103,9 +103,9 @@ The WASP filter compares original and remapped positions: .. code-block:: bash wasp2-map filter-remapped \ - wasp_output/sample_to_remap.bam \ wasp_output/sample_remapped.bam \ - wasp_output/sample_wasp_filtered.bam + --wasp_data_json wasp_output/sample_wasp_data_files.json \ + --out_bam wasp_output/sample_wasp_filtered.bam Understanding Filter Statistics ------------------------------- @@ -172,7 +172,7 @@ Complete Workflow Script echo "Step 1: Creating swapped reads..." wasp2-map make-reads $BAM $VCF \ --samples $SAMPLE \ - --out-dir $OUTDIR/ + --out_dir $OUTDIR # Step 2: Remap with same aligner echo "Step 2: Remapping swapped reads..." @@ -185,9 +185,9 @@ Complete Workflow Script # Step 3: Filter biased reads echo "Step 3: Filtering biased reads..." wasp2-map filter-remapped \ - $OUTDIR/${PREFIX}_to_remap.bam \ $OUTDIR/${PREFIX}_remapped.bam \ - $OUTDIR/${PREFIX}_wasp_filtered.bam + --wasp_data_json $OUTDIR/${PREFIX}_wasp_data_files.json \ + --out_bam $OUTDIR/${PREFIX}_wasp_filtered.bam # Step 4: Merge with non-overlapping reads echo "Step 4: Merging final BAM..." diff --git a/docs/source/tutorials/scrna_seq.rst b/docs/source/tutorials/scrna_seq.rst index dcdb325..1ee8d54 100644 --- a/docs/source/tutorials/scrna_seq.rst +++ b/docs/source/tutorials/scrna_seq.rst @@ -162,7 +162,7 @@ Run the single-cell allele counting: cellranger_output/outs/possorted_genome_bam.bam \ phased_variants.vcf.gz \ barcodes_celltype.tsv \ - --region genes.gtf \ + --feature genes.gtf \ --samples SAMPLE_ID \ --out_file allele_counts.h5ad @@ -293,7 +293,7 @@ Low Read Counts Single-cell data is sparse. Consider: * Using pseudo-bulk aggregation by cell type -* Lowering ``--min-count`` threshold +* Lowering ``--min`` / ``--min_count`` threshold * Focusing on highly expressed genes Memory Issues diff --git a/docs/source/user_guide/analysis.rst b/docs/source/user_guide/analysis.rst index f486d11..23b015e 100644 --- a/docs/source/user_guide/analysis.rst +++ b/docs/source/user_guide/analysis.rst @@ -4,520 +4,92 @@ Analysis Module Overview -------- -The analysis module detects statistically significant allelic imbalance using beta-binomial models. +``wasp2-analyze`` runs allelic imbalance analysis on bulk or single-cell count +outputs. -Purpose -------- - -* Detect allelic imbalance at genomic regions -* Control for biological and technical variation -* Support single-cell and bulk RNA-seq -* Compare imbalance between groups/conditions - -Statistical Models ------------------- - -Beta-Binomial Model -~~~~~~~~~~~~~~~~~~~ - -WASP2 uses beta-binomial distribution to model: -* Overdispersion (variation beyond binomial) -* Biological variability between regions -* Technical noise in sequencing - -The model: -* Null hypothesis: Equal expression from both alleles (p=0.5) -* Alternative: Allelic imbalance (p ≠ 0.5) -* FDR correction for multiple testing +Commands: -Dispersion Parameter -~~~~~~~~~~~~~~~~~~~~ - -Two models: -1. **Single**: One dispersion parameter for all regions -2. **Linear**: Dispersion varies with read depth - -CLI Usage ---------- - -Basic Analysis -~~~~~~~~~~~~~~ - -.. code-block:: bash +* ``find-imbalance``: bulk allelic imbalance from TSV counts +* ``find-imbalance-sc``: per-group single-cell imbalance from ``.h5ad`` counts +* ``compare-imbalance``: differential imbalance between single-cell groups - wasp2-analyze find-imbalance counts.tsv - -Options -~~~~~~~ +Bulk Analysis +------------- .. code-block:: bash wasp2-analyze find-imbalance \ counts.tsv \ - --min-count 10 \ + --min 10 \ --pseudocount 1 \ - --model single \ - --output results.tsv - -Parameters ----------- - -``--min-count`` -~~~~~~~~~~~~~~~ - -Minimum total read count per region (default: 10): - -.. code-block:: bash - - --min-count 20 # More stringent - -``--pseudocount`` -~~~~~~~~~~~~~~~~~ - -Pseudocount added to avoid zero counts (default: 1): - -.. code-block:: bash - - --pseudocount 0 # No pseudocount - -``--model`` -~~~~~~~~~~~ - -Dispersion model (default: single): - -.. code-block:: bash - - --model linear # Depth-dependent dispersion - -``--phased`` -~~~~~~~~~~~~ - -Use phased genotype information: - -.. code-block:: bash - - --phased # Requires phased VCF - -Output Format -------------- - -Tab-separated file with columns: - -Statistical Columns -~~~~~~~~~~~~~~~~~~~ - -* ``region``: Genomic region identifier -* ``ref_count``: Total reference allele counts -* ``alt_count``: Total alternate allele counts -* ``p_value``: Likelihood ratio test p-value -* ``fdr_pval``: FDR-corrected p-value -* ``effect_size``: Log2 fold-change (ref/alt) + --output ai_results.tsv -Model Parameters -~~~~~~~~~~~~~~~~ +Useful options: -* ``dispersion``: Beta-binomial dispersion parameter -* ``log_likelihood_null``: Null model log-likelihood -* ``log_likelihood_alt``: Alternative model log-likelihood - -Interpreting Results --------------------- - -Significant Imbalance -~~~~~~~~~~~~~~~~~~~~~ - -FDR < 0.05 indicates significant imbalance: - -* **Biological**: cis-regulatory variation, ASE -* **Technical**: mapping bias (check WASP), PCR artifacts - -Effect Size -~~~~~~~~~~~ - -* log2FC > 1: Strong imbalance (2-fold difference) -* log2FC > 2: Very strong imbalance (4-fold difference) +* ``--min`` / ``--min_count``: minimum total count threshold +* ``--pseudocount``: pseudocount added before modeling +* ``--model``: dispersion model (currently ``single`` or ``linear`` input) +* ``--output`` / ``--out_file`` / ``-o``: output TSV path +* ``--region_col``: explicit region column name if auto-detection is not desired +* ``--groupby``: group on an alternate annotation column, such as a parent gene column Single-Cell Analysis -------------------- -For single-cell data, WASP2 detects allelic imbalance within specific cell populations -using aggregated counts across cells of the same type. - .. code-block:: bash wasp2-analyze find-imbalance-sc \ - adata.h5ad \ - barcode_map.tsv \ - --sample donor1 \ - --min-count 10 - -Output: Per-celltype TSV files (``ai_results_[CELLTYPE].tsv``). - -Single-Cell Comparative Imbalance ---------------------------------- - -Overview -~~~~~~~~ - -The comparative imbalance analysis detects **differential allelic imbalance** between -cell types, conditions, or biological groups. This is useful for identifying: - -* Cell-type-specific regulatory variation -* Sex differences in chromatin accessibility -* Condition-dependent allelic effects (e.g., treatment vs control) -* Developmental stage-specific imbalance - -Statistical Model -~~~~~~~~~~~~~~~~~ - -The analysis uses a **likelihood ratio test (LRT)** comparing two hypotheses: - -* **Null (H0)**: Both groups share the same allelic imbalance (μ_combined) -* **Alternative (H1)**: Groups have different imbalance (μ₁ ≠ μ₂) - -The test statistic follows a chi-squared distribution with 1 degree of freedom: - -.. code-block:: text - - LRT = -2 × (log L_null - log L_alt) - p-value = P(χ²(df=1) > LRT) - -Input Format: Count Matrix (.h5ad) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The count matrix must be an AnnData object with the following structure: - -.. code-block:: text - - AnnData object (n_obs × n_vars) - ├── .obs # SNP metadata (rows) - │ ├── index # SNP identifiers (0, 1, 2, ...) - │ └── [sample_name] # Genotypes: '0|1', '1|0', '0/1', '1/0' - │ - ├── .var # Cell metadata (columns) - │ └── group # Cell type/group assignment - │ - ├── .layers - │ ├── "ref" # Reference allele counts (sparse matrix) - │ └── "alt" # Alternate allele counts (sparse matrix) - │ - └── .uns - ├── feature # DataFrame: SNP → region mapping - └── samples # List of sample names - -**Example count matrix creation:** - -.. code-block:: bash - - # Generate counts from BAM + variants + barcodes - wasp2-count count-variants-sc \ - aligned.bam \ - variants.vcf.gz \ - barcodes.txt \ - --samples NA12878 \ - --feature peaks.bed \ - --out_file allele_counts.h5ad - -Input Format: Barcode Map (TSV) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A two-column TSV file (no header) mapping cell barcodes to groups: - -.. code-block:: text - - AAACGAACAGTCAGTT-1 excitatory_neurons - AAACGAAGTCGCTCTA-1 inhibitory_neurons - AAACGAAGTGAACCTA-1 excitatory_neurons - AAAGGATCATCGATGT-1 astrocytes - AAAGGATGTGCAACGA-1 microglia - -**Requirements:** - -* Tab-separated (``\t``) -* No header row -* Barcodes must match those in the count matrix -* Groups can be cell types, conditions, sex, or any categorical variable - -Basic Usage -~~~~~~~~~~~ - -Compare imbalance between all groups: - -.. code-block:: bash - - wasp2-analyze compare-imbalance \ - allele_counts.h5ad \ - barcode_map.tsv - -Compare specific groups only: - -.. code-block:: bash - - wasp2-analyze compare-imbalance \ allele_counts.h5ad \ - barcode_map.tsv \ - --groups "excitatory_neurons,inhibitory_neurons" - -Output Format -~~~~~~~~~~~~~ - -Results are written to ``ai_results_[GROUP1]_[GROUP2].tsv``: - -.. list-table:: - :header-rows: 1 - :widths: 20 80 - - * - Column - - Description - * - ``region`` - - Genomic region identifier (peak or gene) - * - ``num_snps`` - - Number of shared heterozygous SNPs in region - * - ``combined_mu`` - - Reference allele frequency under null hypothesis (shared) - * - ``mu1`` - - Reference allele frequency in group 1 - * - ``mu2`` - - Reference allele frequency in group 2 - * - ``null_ll`` - - Log-likelihood under null (shared μ) - * - ``alt_ll`` - - Log-likelihood under alternative (separate μ values) - * - ``pval`` - - Likelihood ratio test p-value - * - ``fdr_pval`` - - FDR-corrected p-value (Benjamini-Hochberg) - -**Interpreting results:** - -* ``fdr_pval < 0.05``: Significant differential imbalance -* ``|mu1 - mu2| > 0.1``: Meaningful effect size (~20% difference) -* ``mu < 0.5``: Alternate allele favored; ``mu > 0.5``: Reference allele favored - -Parameters -~~~~~~~~~~ - -``--groups`` - Comma-separated list of groups to compare. If omitted, compares all pairwise - combinations found in the barcode map. - -``--min`` - Minimum total allele count per region per group (default: 10). Higher values - increase statistical power but reduce the number of testable regions. - -``--pseudocount`` - Pseudocount added to avoid zero counts (default: 1). Affects dispersion estimation. - -``--sample`` - Sample name for heterozygous SNP filtering. Required if multiple samples are - present in the count matrix. - -``--phased`` - Use phased genotype information from VCF. Requires genotypes in ``0|1`` or ``1|0`` - format. Improves power when haplotype phase is known. - -``-z/--z_cutoff`` - Remove SNPs with counts exceeding this z-score threshold. Useful for removing - outliers caused by mapping artifacts or copy number variation. - -Example: Sex Differences Analysis -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Identify chromatin accessibility regions with sex-biased allelic imbalance: + barcode_groups.tsv \ + --sample SAMPLE1 \ + --min 10 \ + --out_file ai_results.tsv -**Step 1: Prepare barcode map with sex labels** +``barcode_groups.tsv`` is a two-column TSV: .. code-block:: text - # barcode_sex_map.tsv - AAACGAACAGTCAGTT-1 male - AAACGAAGTCGCTCTA-1 female - AAACGAAGTGAACCTA-1 male - AAAGGATCATCGATGT-1 female - -**Step 2: Run comparative analysis** - -.. code-block:: bash - - wasp2-analyze compare-imbalance \ - allele_counts.h5ad \ - barcode_sex_map.tsv \ - --groups "male,female" \ - --min 20 \ - --out_file ai_results_sex_comparison.tsv - -**Step 3: Filter significant results** - -.. code-block:: bash - - # Extract regions with significant sex differences - awk -F'\t' 'NR==1 || $9 < 0.05' ai_results_male_female.tsv > significant_sex_diff.tsv - - # Find regions with large effect size - awk -F'\t' 'NR==1 || ($4 - $5 > 0.15 || $5 - $4 > 0.15)' significant_sex_diff.tsv - -Example: snATAC-seq Cell Type Analysis -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Complete workflow for analyzing cell-type-specific chromatin accessibility imbalance: - -**Step 1: Count alleles from snATAC-seq BAM** - -.. code-block:: bash - - # Extract valid barcodes from Cell Ranger output - zcat filtered_barcodes.tsv.gz > barcodes.txt - - # Count alleles at heterozygous SNPs overlapping peaks - wasp2-count count-variants-sc \ - possorted_bam.bam \ - phased_variants.vcf.gz \ - barcodes.txt \ - --samples sample1 \ - --feature atac_peaks.bed \ - --out_file snATAC_counts.h5ad - -**Step 2: Create barcode-to-celltype mapping** - -Export cell type annotations from your clustering analysis (e.g., Seurat, ArchR): - -.. code-block:: r - - # R/Seurat example - write.table( - data.frame(barcode = Cells(seurat_obj), - celltype = Idents(seurat_obj)), - "barcode_celltype_map.tsv", - sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE - ) - -**Step 3: Run single-cell imbalance analysis** - -.. code-block:: bash + BARCODEGROUP - # Per-celltype analysis - wasp2-analyze find-imbalance-sc \ - snATAC_counts.h5ad \ - barcode_celltype_map.tsv \ - --sample sample1 \ - --phased \ - --min 10 \ - -z 3.0 +The command writes one output file per group using the requested output prefix. -**Step 4: Compare imbalance between cell types** +Comparative Single-Cell Analysis +-------------------------------- .. code-block:: bash - # Compare specific cell types wasp2-analyze compare-imbalance \ - snATAC_counts.h5ad \ - barcode_celltype_map.tsv \ - --sample sample1 \ - --groups "excitatory,inhibitory,astrocyte" \ - --phased \ - --min 15 - - # This produces: - # - ai_results_excitatory_inhibitory.tsv - # - ai_results_excitatory_astrocyte.tsv - # - ai_results_inhibitory_astrocyte.tsv - -**Step 5: Identify cell-type-specific regulatory regions** - -.. code-block:: bash - - # Find peaks with differential imbalance between excitatory and inhibitory neurons - awk -F'\t' '$9 < 0.01 && ($4 > 0.6 || $4 < 0.4)' \ - ai_results_excitatory_inhibitory.tsv > neuron_subtype_specific_peaks.tsv - -Best Practices for Single-Cell Analysis -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -**Data Quality:** - -* Use WASP-filtered BAM files to remove mapping bias -* Require ≥10 total counts per region per group (``--min 10``) -* Apply z-score filtering to remove outliers (``-z 3.0``) - -**Statistical Power:** - -* Merge similar cell types if individual populations have low coverage -* Use phased genotypes when available (``--phased``) -* Focus on regions with multiple SNPs for better estimates - -**Interpretation:** - -* Consider biological replication across samples -* Validate top hits with orthogonal methods (allele-specific CRISPR, etc.) -* Integrate with eQTL data to identify causal variants - -Example Workflow ----------------- - -.. code-block:: bash - - # 1. Count alleles - wasp2-count count-variants \ - wasp_filtered.bam \ - variants.vcf \ - --region genes.gtf \ - --samples NA12878 \ - --output counts.tsv - - # 2. Analyze imbalance - wasp2-analyze find-imbalance \ - counts.tsv \ - --min-count 20 \ - --model single \ - --output imbalance.tsv - - # 3. Filter significant results - awk '$5 < 0.05' imbalance.tsv > significant.tsv - -Best Practices --------------- - -Read Depth -~~~~~~~~~~ - -* Minimum 10 reads per region (use ``--min-count``) -* Higher depth = more power -* Consider downsampling very deep regions - -Quality Control -~~~~~~~~~~~~~~~ + allele_counts.h5ad \ + barcode_groups.tsv \ + --sample SAMPLE1 \ + --groups B_cell T_cell \ + --out_file compare_ai.tsv -* Use WASP-filtered reads -* Remove low-complexity regions -* Filter low-quality SNPs +This compares allelic imbalance between the requested groups and writes one TSV +per comparison. -Multiple Testing -~~~~~~~~~~~~~~~~ +Notes +----- -* FDR correction is automatic -* Consider Bonferroni for very important regions -* Validate top hits experimentally +* If your count file contains genotype columns for multiple samples, you must + provide ``--sample`` for single-cell analysis. +* For bulk analysis, region columns are auto-detected when present in the count + TSV. Use ``--region_col`` only when you need to override that behavior. -Common Issues -------------- - -No Significant Results -~~~~~~~~~~~~~~~~~~~~~~ +Outputs +------- -* Increase sample size -* Check read depth (use deeper sequencing) -* Verify heterozygous SNPs present +Typical bulk outputs include: -Many Significant Results -~~~~~~~~~~~~~~~~~~~~~~~~ +* region or feature identifier +* aggregated ``ref_count`` and ``alt_count`` +* p-values and FDR-adjusted p-values -* Check for batch effects -* Verify WASP filtering was applied -* Consider stricter FDR threshold +Typical single-cell outputs include the same statistics stratified by barcode +group. Next Steps ---------- -* Validate results with qPCR or DNA-seq -* Integrate with eQTL data -* Perform pathway enrichment analysis +* :doc:`counting` to generate bulk or single-cell counts +* :doc:`/tutorials/comparative_imbalance` for group-comparison workflows diff --git a/docs/source/user_guide/counting.rst b/docs/source/user_guide/counting.rst index 54db55f..b6a0ba6 100644 --- a/docs/source/user_guide/counting.rst +++ b/docs/source/user_guide/counting.rst @@ -4,195 +4,135 @@ Counting Module Overview -------- -The counting module quantifies allele-specific read counts at heterozygous SNP positions. It's the first step in allelic imbalance analysis. +``wasp2-count`` counts reads supporting reference and alternate alleles at +variant positions in BAM files. -Purpose -~~~~~~~ +It provides two commands: -* Count reads supporting reference vs alternate alleles -* Filter by sample genotype (heterozygous sites) -* Annotate with genomic regions (genes, peaks) -* Support single-cell RNA-seq +* ``count-variants`` for bulk data +* ``count-variants-sc`` for single-cell data with ``CB``-tagged barcodes -When to Use -~~~~~~~~~~~ - -Use counting when you have: -* Aligned reads (BAM file) -* Variant calls (VCF file) -* Want to quantify allele-specific expression - -CLI Usage ---------- +Bulk Counting +------------- -Basic Command -~~~~~~~~~~~~~ +Basic usage: .. code-block:: bash - wasp2-count count-variants BAM_FILE VCF_FILE + wasp2-count count-variants sample.bam variants.vcf.gz --out_file counts.tsv -Full Options -~~~~~~~~~~~~ +With sample filtering and region annotation: .. code-block:: bash wasp2-count count-variants \ - input.bam \ - variants.vcf \ - --samples sample1,sample2 \ + sample.bam \ + variants.vcf.gz \ + --samples SAMPLE1 \ --region genes.gtf \ --out_file counts.tsv -Input Requirements ------------------- - -BAM File -~~~~~~~~ +Supported region files: -* Aligned reads (single-end or paired-end) -* Indexed (.bai file in same directory) -* Sorted by coordinate +* BED +* MACS2 ``narrowPeak`` / ``broadPeak`` +* GTF +* GFF3 -VCF File -~~~~~~~~ +For GTF/GFF3 inputs, WASP2 derives interval annotations from feature rows and +defaults to ``gene`` features when present. -* Variant calls with genotype information -* Heterozygous SNPs (GT=0|1 or 1|0) -* Can include sample-specific genotypes +Useful options: -Optional: Region File -~~~~~~~~~~~~~~~~~~~~~ +* ``--samples`` / ``-s``: select het sites for one or more samples +* ``--region`` / ``-r``: restrict/annotate variants by overlapping regions +* ``--gene_feature``: choose the GTF/GFF3 feature type +* ``--gene_attribute``: choose the GTF/GFF3 attribute used as the feature ID +* ``--gene_parent``: choose the parent/grouping attribute for gene annotations +* ``--use_region_names``: prefer region names instead of coordinate strings +* ``--include-indels``: count indels in addition to SNPs -Annotate SNPs overlapping genes/peaks: +Output columns always include: -* GTF/GFF3 format (genes) -* BED format (peaks, regions) -* narrowPeak format (ATAC-seq, ChIP-seq) +* ``chrom`` +* ``pos`` or ``pos0`` / ``pos`` depending on input path +* ``ref`` +* ``alt`` +* ``ref_count`` +* ``alt_count`` +* ``other_count`` -Parameters ----------- +When sample filtering is active, genotype columns are included. When region +annotation is active, region or gene columns are included as well. -``--samples`` / ``-s`` -~~~~~~~~~~~~~~~~~~~~~~ +Single-Cell ATAC Counting +------------------------- -Filter SNPs heterozygous in specified samples: +Single-cell counting is designed for **scATAC-seq** data. It requires a BAM +with ``CB`` tags and a positional barcode file containing one barcode per line. .. code-block:: bash - --samples sample1,sample2,sample3 - # or - --samples samples.txt # one per line - -``--region`` / ``-r`` -~~~~~~~~~~~~~~~~~~~~~ - -Annotate SNPs with overlapping regions: - -.. code-block:: bash - - --region genes.gtf # Gene annotations - --region peaks.bed # ATAC-seq peaks - --region regions.gff3 # Custom regions - -``--out_file`` / ``-o`` -~~~~~~~~~~~~~~~~~~~~~~~ - -Output file path (default: counts.tsv): - -.. code-block:: bash - - --out_file my_counts.tsv - -Output Format -------------- - -Tab-separated file with columns: + wasp2-count count-variants-sc \ + sc_atac.bam \ + variants.vcf.gz \ + barcodes.tsv \ + --samples sample1 \ + --feature peaks.bed \ + --out_file allele_counts.h5ad -Basic Columns -~~~~~~~~~~~~~ +Important points: -* ``chr``: Chromosome -* ``pos``: SNP position (1-based) -* ``ref``: Reference allele -* ``alt``: Alternate allele -* ``ref_count``: Reads supporting reference -* ``alt_count``: Reads supporting alternate -* ``other_count``: Reads supporting other alleles +* ``barcodes.tsv`` is a positional argument, not ``--barcode_map`` +* ``--feature`` and ``--region`` are aliases on the single-cell command +* Accepts BED and MACS2 peak files (GTF/GFF3 are supported only by the bulk ``count-variants`` command) -Optional Columns (with --region) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The output is an AnnData ``.h5ad`` file with: -* ``gene_id``: Overlapping gene -* ``gene_name``: Gene symbol -* ``feature``: Feature type (exon, intron, etc.) +* sparse count layers for ``ref``, ``alt``, and ``other`` +* variant metadata in ``adata.obs`` +* barcode names in ``adata.var_names`` +* feature-to-variant mapping in ``adata.uns["feature"]`` when annotations are used -Example Workflow ----------------- +Examples +-------- -1. Basic Counting -~~~~~~~~~~~~~~~~~ +Count variants without regional annotation: .. code-block:: bash - wasp2-count count-variants sample.bam variants.vcf + wasp2-count count-variants \ + filtered.bam \ + variants.vcf.gz \ + --samples SAMPLE1 \ + --out_file counts.tsv -2. Filter by Sample -~~~~~~~~~~~~~~~~~~~ +Count variants inside peaks: .. code-block:: bash wasp2-count count-variants \ - sample.bam \ - variants.vcf \ - --samples NA12878 + filtered.bam \ + variants.vcf.gz \ + --samples SAMPLE1 \ + --region peaks.bed \ + --out_file counts_peaks.tsv -3. Annotate with Genes -~~~~~~~~~~~~~~~~~~~~~~ +Count variants inside genes: .. code-block:: bash wasp2-count count-variants \ - sample.bam \ - variants.vcf \ - --samples NA12878 \ + filtered.bam \ + variants.vcf.gz \ + --samples SAMPLE1 \ --region genes.gtf \ - --out_file counts_annotated.tsv - -Single-Cell Counting --------------------- - -For single-cell RNA-seq: - -.. code-block:: bash - - wasp2-count count-variants-sc \ - sc_rnaseq.bam \ - variants.vcf \ - --barcode_map barcodes.tsv - -Output includes cell-type-specific counts. - -Common Issues -------------- - -Low Count Numbers -~~~~~~~~~~~~~~~~~ - -* Check BAM file coverage (``samtools depth``) -* Verify VCF contains heterozygous SNPs -* Ensure BAM and VCF use same reference genome - -No Output SNPs -~~~~~~~~~~~~~~ - -* Check if --samples filter is too restrictive -* Verify VCF has genotype information (GT field) -* Ensure BAM file is indexed + --gene_feature gene \ + --gene_attribute gene_id \ + --out_file counts_genes.tsv Next Steps ---------- -After counting: -* :doc:`analysis` - Detect allelic imbalance -* :doc:`mapping` - Correct reference bias with WASP +* :doc:`analysis` for statistical testing of allelic imbalance +* :doc:`/user_guide/single_cell` for barcode grouping and single-cell workflows diff --git a/docs/source/user_guide/ipscore.rst b/docs/source/user_guide/ipscore.rst new file mode 100644 index 0000000..993fcbd --- /dev/null +++ b/docs/source/user_guide/ipscore.rst @@ -0,0 +1,53 @@ +iPSCORE Utilities +================= + +Overview +-------- + +``wasp2-ipscore`` provides helper commands for working with the iPSCORE +multi-tissue allelic imbalance resource bundled with WASP2. + +Commands +-------- + +``inventory`` +~~~~~~~~~~~~~ + +Validate the expected iPSCORE data inventory and optionally write a TSV report. + +.. code-block:: bash + + wasp2-ipscore inventory --output inventory.tsv + +``manifest`` +~~~~~~~~~~~~ + +Create a unified sample manifest across tissues and assays. + +.. code-block:: bash + + wasp2-ipscore manifest --output manifest.csv --format csv + +``qtls`` +~~~~~~~~ + +Load and summarize QTL resources, optionally filtering by tissue. + +.. code-block:: bash + + wasp2-ipscore qtls --tissue iPSC --output qtls.tsv + +``validate`` +~~~~~~~~~~~~ + +Run the combined inventory, manifest, and QTL validation workflow. + +.. code-block:: bash + + wasp2-ipscore validate + +Notes +----- + +These commands are resource-management utilities. They do not replace the main +``wasp2-map``, ``wasp2-count``, or ``wasp2-analyze`` analysis workflows. diff --git a/docs/source/user_guide/mapping.rst b/docs/source/user_guide/mapping.rst index d38be18..48c8ed8 100644 --- a/docs/source/user_guide/mapping.rst +++ b/docs/source/user_guide/mapping.rst @@ -1,221 +1,148 @@ -Mapping Module (WASP) -===================== +Mapping Module +============== Overview -------- -The WASP (Weighted Allele-Specific Mapping) algorithm corrects reference bias by remapping reads with all possible alleles. +``wasp2-map`` implements the WASP remap-and-filter workflow for removing +reference mapping bias before allele counting. -What is Reference Bias? -~~~~~~~~~~~~~~~~~~~~~~~~ +The public CLI has two commands: -Reference bias occurs when reads containing alternate alleles align worse than reads with reference alleles, leading to false allelic imbalance signals. +1. ``make-reads``: find reads overlapping sample variants and generate + allele-swapped FASTQ files for remapping +2. ``filter-remapped``: keep only remapped reads that return to the same locus -WASP Solution -~~~~~~~~~~~~~ +There is no separate ``find-intersecting-snps`` command in WASP2. That overlap +step is part of ``make-reads``. -1. Identify reads overlapping heterozygous SNPs -2. Generate alternative reads (swap alleles) -3. Remap both original and swapped reads -4. Keep only reads that map to the same location - -Purpose -------- - -* Correct reference bias in RNA-seq, ATAC-seq -* Improve accuracy of allelic imbalance detection -* Required before allele counting - -When to Use -~~~~~~~~~~~ - -Use WASP when: -* Reads will be used for allelic analysis -* Reference genome differs from sample genotype -* High-confidence bias correction needed - -Workflow --------- - -Complete WASP workflow has 3 steps: - -Step 1: Find Intersecting SNPs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Identify reads overlapping heterozygous SNPs: - -.. code-block:: bash - - wasp2-map find-intersecting-snps \ - input.bam \ - variants.vcf \ - --output intersecting.bam - -Output: BAM file with reads overlapping SNPs. - -Step 2: Generate Remapping Reads -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Typical Workflow +---------------- -Create reads with swapped alleles: +Step 1: Generate swapped reads +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: bash wasp2-map make-reads \ - intersecting.bam \ - variants.vcf \ - --samples sample1 \ - --output remap_reads.fastq + sample.bam \ + variants.vcf.gz \ + --samples SAMPLE1 \ + --out_dir wasp_output -Output: FASTQ file(s) with alternative allele sequences. +This writes: -Step 3: Remap and Filter -~~~~~~~~~~~~~~~~~~~~~~~~~ +* ``sample_to_remap.bam``: original reads that must be remapped +* ``sample_keep.bam``: reads that never overlapped eligible variants +* ``sample_swapped_alleles_r1.fq`` and ``sample_swapped_alleles_r2.fq``: + swapped FASTQ reads to realign +* ``sample_wasp_data_files.json``: metadata for ``filter-remapped`` -User remaps with their aligner (BWA, STAR, etc.): +Step 2: Realign swapped reads +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use the same aligner and alignment settings used for the original BAM. .. code-block:: bash - # Example with BWA - bwa mem -t 8 reference.fa remap_reads.fastq | \ - samtools sort -o remapped.bam - + bwa mem -M -t 8 genome.fa \ + wasp_output/sample_swapped_alleles_r1.fq \ + wasp_output/sample_swapped_alleles_r2.fq | \ + samtools sort -o wasp_output/sample_remapped.bam - + + samtools index wasp_output/sample_remapped.bam -Then filter to consistent mappings: +Step 3: Filter remapped reads +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: bash - wasp2-map filt-remapped-reads \ - intersecting.bam \ - remapped.bam \ - --output filtered.bam + wasp2-map filter-remapped \ + wasp_output/sample_remapped.bam \ + --wasp_data_json wasp_output/sample_wasp_data_files.json \ + --out_bam wasp_output/sample_wasp_filtered.bam -Output: BAM file with bias-corrected reads. +You can also provide ``to_remap_bam`` and ``keep_bam`` positionally instead of +``--wasp_data_json``. -CLI Reference -------------- +Command Reference +----------------- -find-intersecting-snps -~~~~~~~~~~~~~~~~~~~~~~ +``make-reads`` +~~~~~~~~~~~~~~ .. code-block:: bash - wasp2-map find-intersecting-snps [OPTIONS] BAM VCF - -Options: -* ``--samples``: Filter by sample genotype -* ``--output``: Output BAM file + wasp2-map make-reads [OPTIONS] BAM VARIANTS -make-reads -~~~~~~~~~~ +Important options: -.. code-block:: bash +* ``--samples`` / ``-s``: sample name(s) used to select het variants +* ``--out_dir`` / ``-o``: output directory +* ``--out_json`` / ``-j``: explicit metadata JSON path +* ``--indels``: include indels as well as SNPs +* ``--threads``: BAM I/O threads - wasp2-map make-reads [OPTIONS] BAM VCF +Notes: -Options: -* ``--samples``: Sample name(s) -* ``--output``: Output FASTQ prefix -* ``--paired``: Paired-end mode +* paired-end input is required +* phased genotypes are strongly recommended +* supported variant formats are VCF, VCF.GZ, BCF, and PGEN -filt-remapped-reads +``filter-remapped`` ~~~~~~~~~~~~~~~~~~~ .. code-block:: bash - wasp2-map filt-remapped-reads [OPTIONS] ORIGINAL REMAPPED - -Options: -* ``--output``: Filtered BAM file -* ``--keep_read_file``: Save kept read IDs + wasp2-map filter-remapped [OPTIONS] REMAPPED_BAM [TO_REMAP_BAM] [KEEP_BAM] -Input Requirements ------------------- +Important options: -* **Original BAM**: Aligned reads from initial mapping -* **VCF File**: Phased heterozygous SNPs (recommended) -* **Reference Genome**: Same as used for original alignment +* ``--wasp_data_json`` / ``-j``: load ``to_remap_bam`` and ``keep_bam`` from + ``make-reads`` metadata +* ``--out_bam`` / ``-o``: output BAM path +* ``--remap_keep_bam``: optional BAM of remapped reads that passed filtering +* ``--remap_keep_file``: optional text file of kept read names +* ``--same-locus-slop``: positional tolerance for same-locus matching +* ``--threads``: BAM I/O threads -Output Interpretation ---------------------- +Interpreting Outputs +-------------------- -WASP Filter Rate -~~~~~~~~~~~~~~~~ +Common outcomes after ``filter-remapped``: -Typical filter rates: -* **Good**: 95-99% reads kept -* **Acceptable**: 90-95% reads kept -* **Concerning**: <90% reads kept (check data quality) +* reads kept because they remap to the same locus +* reads dropped because they remap elsewhere +* reads dropped because they fail to remap cleanly -Low filter rate may indicate: -* Poor mapping quality -* High SNP density -* Problematic reference genome - -Complete Example ----------------- +The final WASP-corrected BAM is the output of ``filter-remapped`` merged with +the ``*_keep.bam`` reads that never required remapping. -Full WASP workflow: +Example +------- .. code-block:: bash - # Step 1: Find SNP-overlapping reads - wasp2-map find-intersecting-snps \ - original.bam \ - phased_variants.vcf \ - --samples NA12878 \ - --output intersecting.bam - - # Step 2: Generate remapping reads wasp2-map make-reads \ - intersecting.bam \ - phased_variants.vcf \ - --samples NA12878 \ - --paired \ - --output remap - - # Step 3: Remap (user's aligner) - bwa mem -t reference.fa \ - remap_R1.fastq remap_R2.fastq | \ - samtools sort -o remapped.bam - - samtools index remapped.bam - - # Step 4: Filter - wasp2-map filt-remapped-reads \ - intersecting.bam \ - remapped.bam \ - --output filtered_wasp.bam - - # Step 5: Count alleles (use filtered BAM) - wasp2-count count-variants \ - filtered_wasp.bam \ - phased_variants.vcf \ - --samples NA12878 - -Performance Tips ----------------- - -* Use multi-threading for remapping step -* Filter VCF to high-quality SNPs only -* Use phased genotypes when available + sample.bam \ + variants.vcf.gz \ + --samples SAMPLE1 \ + --out_dir wasp_output -Common Issues -------------- + bwa mem -M -t 8 genome.fa \ + wasp_output/sample_swapped_alleles_r1.fq \ + wasp_output/sample_swapped_alleles_r2.fq | \ + samtools sort -o wasp_output/sample_remapped.bam - -Many Reads Filtered -~~~~~~~~~~~~~~~~~~~~ - -* Check remapping quality (MAPQ scores) -* Verify same reference genome used -* Consider relaxing mapping parameters - -Slow Remapping -~~~~~~~~~~~~~~ + samtools index wasp_output/sample_remapped.bam -* Use multi-threading (``-t`` flag) -* Process chromosomes in parallel -* Consider downsampling for testing + wasp2-map filter-remapped \ + wasp_output/sample_remapped.bam \ + --wasp_data_json wasp_output/sample_wasp_data_files.json \ + --out_bam wasp_output/sample_wasp_filtered.bam Next Steps ---------- -* :doc:`counting` - Count alleles from WASP-filtered BAM -* :doc:`analysis` - Analyze allelic imbalance +* :doc:`counting` to count alleles from the WASP-filtered BAM +* :doc:`analysis` to test for allelic imbalance diff --git a/pixi.toml b/pixi.toml index baa7dd2..fcc8b3a 100644 --- a/pixi.toml +++ b/pixi.toml @@ -66,6 +66,7 @@ verify = "wasp2-count --help && wasp2-map --help && wasp2-analyze --help && pyth build = "maturin develop --release -m rust/Cargo.toml" # Run tests test = "pytest tests/ -v --tb=short --ignore=tests/benchmarks -m 'not benchmark'" +test-mapping-parity = "pytest tests/regression/test_mapping_stage_parity.py -v --tb=short" # Run benchmarks bench = "pytest tests/benchmarks/ -v --tb=short" # Lint diff --git a/pyproject.toml b/pyproject.toml index 90f76c2..ea1740c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -126,7 +126,7 @@ python-source = "src" module-name = "wasp2_rust" python-packages = ["counting", "mapping", "analysis", "wasp2", "ipscore"] bindings = "pyo3" -strip = true +strip = false include = ["LICENSE", "README.md"] [tool.pytest.ini_options] diff --git a/rust/Cargo.lock b/rust/Cargo.lock index d778994..fa9d689 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -1999,6 +1999,7 @@ dependencies = [ "noodles-core", "noodles-vcf", "pyo3", + "pyo3-build-config", "rayon", "rust-htslib", "rustc-hash 2.1.1", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index a375ef1..af1c3f4 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -33,6 +33,9 @@ flate2 = "1.1" # For gzip decompression criterion = { version = "0.8", features = ["html_reports"] } tempfile = "3.25" +[build-dependencies] +pyo3-build-config = "0.28.2" + # Benchmarks removed for clean release (benchmark files in paper branch only) # [[bench]] # name = "mapping_filter_bench" diff --git a/rust/build.rs b/rust/build.rs new file mode 100644 index 0000000..258e36c --- /dev/null +++ b/rust/build.rs @@ -0,0 +1,4 @@ +fn main() { + pyo3_build_config::add_extension_module_link_args(); + pyo3_build_config::use_pyo3_cfgs(); +} diff --git a/scripts/container_smoke_test.sh b/scripts/container_smoke_test.sh index 690ef28..575b7d5 100755 --- a/scripts/container_smoke_test.sh +++ b/scripts/container_smoke_test.sh @@ -64,6 +64,15 @@ for cmd in wasp2-count wasp2-map wasp2-analyze; do FAIL=$((FAIL + 1)) fi done + +# wasp2-ipscore doesn't have --version, check --help instead +if wasp2-ipscore --help > /dev/null 2>&1; then + echo " PASS: wasp2-ipscore --help" + PASS=$((PASS + 1)) +else + echo " FAIL: wasp2-ipscore --help" + FAIL=$((FAIL + 1)) +fi echo "" # ───────────────────────────────────────────────────────────────────────────── @@ -96,7 +105,7 @@ if [[ -f "$DATA_DIR/sample1.bam" && -f "$DATA_DIR/variants.vcf.gz" ]]; then if wasp2-count count-variants \ "$DATA_DIR/sample1.bam" \ "$DATA_DIR/variants.vcf.gz" \ - --samples SAMPLE1 \ + --samples sample1 \ --out "$TMP_DIR/counts.tsv" \ 2>/dev/null; then diff --git a/src/counting/__main__.py b/src/counting/__main__.py index 2fb6945..15deb17 100644 --- a/src/counting/__main__.py +++ b/src/counting/__main__.py @@ -80,7 +80,7 @@ def count_variants( "-r", help=( "Only use variants overlapping regions in file. " - "Accepts BED or MACS2 formatted .(narrow/broad)Peak files. " + "Accepts BED, MACS2 .(narrow/broad)Peak, GTF, or GFF3 files. " ), ), ] = None, @@ -126,7 +126,7 @@ def count_variants( "--feat", help=( "Feature type in gtf/gff3 for counting intersecting SNPs. " - "Defaults to 'exon' for snp counting" + "Defaults to 'gene' when present." ), ), ] = None, @@ -237,10 +237,9 @@ def count_variants_sc( "--regions", "-r", help=( - "Features used in single-cell experiment. " + "Features used in single-cell ATAC experiment. " "Only use variants overlapping features in file. " - "Accepts BED or MACS2 formatted .(narrow/broad)Peak files. " - "TODO: Implement genes gtf/gff format" + "Accepts BED or MACS2 .(narrow/broad)Peak files." ), ), ] = None, diff --git a/src/counting/count_alleles_sc.py b/src/counting/count_alleles_sc.py index 1244383..104f8ef 100644 --- a/src/counting/count_alleles_sc.py +++ b/src/counting/count_alleles_sc.py @@ -20,6 +20,19 @@ logger = logging.getLogger(__name__) +def _sparse_from_counts( + counts: defaultdict[tuple[int, int], int], + shape: tuple[int, int], +) -> csr_matrix: + if not counts: + return csr_matrix(shape, dtype=np.uint16) + return csr_matrix( + (list(counts.values()), list(zip(*counts.keys()))), + shape=shape, + dtype=np.uint16, + ) + + class CountStatsSC: """Container for mutable single-cell counting statistics. @@ -101,7 +114,12 @@ def make_count_matrix( # Maybe do this automatically and parse feature col instead? snp_df_cols = ["chrom", "pos", "ref", "alt"] if include_samples is not None: - snp_df_cols.extend(include_samples) + sample_cols = list(include_samples) + missing_sample_cols = [col for col in sample_cols if col not in df.columns] + if missing_sample_cols and len(sample_cols) == 1 and "GT" in df.columns: + sample_name = sample_cols[0] + df = df.with_columns(pl.col("GT").alias(sample_name)) + snp_df_cols.extend(sample_cols) # Might be more memory efficient to use pandas index instead... snp_df = df.select(snp_df_cols).unique(maintain_order=True).with_row_index() @@ -135,23 +153,10 @@ def make_count_matrix( # Create sparse matrices # sparse array is recommended...but doesnt work with adata - sparse_ref = csr_matrix( - (list(sc_counts.ref_count.values()), list(zip(*sc_counts.ref_count.keys()))), - shape=(snp_df.shape[0], len(bc_dict)), - dtype=np.uint16, - ) - - sparse_alt = csr_matrix( - (list(sc_counts.alt_count.values()), list(zip(*sc_counts.alt_count.keys()))), - shape=(snp_df.shape[0], len(bc_dict)), - dtype=np.uint16, - ) - - sparse_other = csr_matrix( - (list(sc_counts.other_count.values()), list(zip(*sc_counts.other_count.keys()))), - shape=(snp_df.shape[0], len(bc_dict)), - dtype=np.uint16, - ) + matrix_shape = (snp_df.shape[0], len(bc_dict)) + sparse_ref = _sparse_from_counts(sc_counts.ref_count, matrix_shape) + sparse_alt = _sparse_from_counts(sc_counts.alt_count, matrix_shape) + sparse_other = _sparse_from_counts(sc_counts.other_count, matrix_shape) # Create anndata With total as X adata = ad.AnnData( @@ -171,17 +176,19 @@ def make_count_matrix( if include_samples is not None: adata.uns["samples"] = include_samples - # TODO: Allow for other features besides 'region' using include_features - # Could be case of no features, or feature is gene - if "region" in df.columns: - # Get unique snps and associated regions + feature_cols = include_features or [] + if not feature_cols and "region" in df.columns: + feature_cols = ["region"] - # Create dict during analysis step instead - adata.uns["feature"] = ( + if feature_cols: + feature_df = ( df.join(snp_df, on=["chrom", "pos", "ref", "alt"], how="left") - .select(["region", "index"]) - .to_pandas() + .select([*feature_cols, "index"]) + .unique(maintain_order=True) ) + if "region" not in feature_df.columns: + feature_df = feature_df.with_columns(pl.col(feature_cols[0]).alias("region")) + adata.uns["feature"] = feature_df.to_pandas() # region_snp_dict = dict( # df.join(snp_df, on=["chrom", "pos", "ref", "alt"], how="left" diff --git a/src/counting/parse_gene_data.py b/src/counting/parse_gene_data.py index 6653ed1..682192c 100644 --- a/src/counting/parse_gene_data.py +++ b/src/counting/parse_gene_data.py @@ -106,12 +106,12 @@ def parse_gene_file( if feature is None: feature_list = df.select(pl.col("feature").unique()).to_series() - if "exon" in feature_list: - feature = "exon" + if "gene" in feature_list: + feature = "gene" elif "transcript" in feature_list: feature = "transcript" - elif "gene" in feature_list: - feature = "gene" + elif "exon" in feature_list: + feature = "exon" else: logger.warning("exon, gene or transcript not found in feature list: %s", feature_list) @@ -244,6 +244,20 @@ def parse_intersect_genes( if parent_attribute is None: parent_attribute = "Parent" + # Guard against empty intersection file (0 variants in region) + intersect_path = Path(intersect_file) + if not intersect_path.exists() or intersect_path.stat().st_size == 0: + return pl.DataFrame( + schema={ + "chrom": pl.Categorical, + "pos": pl.UInt32, + "ref": pl.Categorical, + "alt": pl.Categorical, + attribute: pl.Utf8, + parent_attribute: pl.Utf8, + } + ) + # AFTER performing gtf_to_bed and intersecting! df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0) @@ -287,8 +301,23 @@ def parse_intersect_genes_new( if parent_attribute is None: parent_attribute = "Parent" + # Guard against empty intersection file (0 variants in region) + intersect_path = Path(intersect_file) + if not intersect_path.exists() or intersect_path.stat().st_size == 0: + return pl.DataFrame( + schema={ + "chrom": pl.Categorical, + "pos": pl.UInt32, + "ref": pl.Categorical, + "alt": pl.Categorical, + attribute: pl.Utf8, + parent_attribute: pl.Utf8, + } + ) + # AFTER performing gtf_to_bed and intersecting! df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0) + schema_names = list(df.collect_schema().names()) vcf_schema = [ pl.col("chrom").cast(pl.Categorical), @@ -298,12 +327,12 @@ def parse_intersect_genes_new( ] # Expect at min 10 cols, 11 if GT included - if len(df.columns) > 10: - subset_cols = [df.columns[i] for i in [0, 2, 3, 4, 5, -2, -1]] + if len(schema_names) > 10: + subset_cols = [schema_names[i] for i in [0, 2, 3, 4, 5, -2, -1]] new_cols = ["chrom", "pos", "ref", "alt", "GT", attribute, parent_attribute] vcf_schema.append(pl.col("GT").cast(pl.Categorical)) else: - subset_cols = [df.columns[i] for i in [0, 2, 3, 4, -2, -1]] + subset_cols = [schema_names[i] for i in [0, 2, 3, 4, -2, -1]] new_cols = ["chrom", "pos", "ref", "alt", attribute, parent_attribute] # Parse dataframe columns diff --git a/src/counting/run_counting_sc.py b/src/counting/run_counting_sc.py index 08a0240..c9c4c20 100644 --- a/src/counting/run_counting_sc.py +++ b/src/counting/run_counting_sc.py @@ -2,6 +2,7 @@ from __future__ import annotations +import logging import re from pathlib import Path @@ -98,39 +99,13 @@ def __init__( self.intersect_file = str( Path(self.temp_loc) / f"{variant_prefix}_intersect_regions.bed" ) - self.is_gene_file = False - elif re.search(r"\.g[tf]f(?:\.gz)?$", f_ext, re.I): - self.feature_type = "genes" - self.intersect_file = str( - Path(self.temp_loc) / f"{variant_prefix}_intersect_genes.bed" - ) - self.is_gene_file = True - gtf_prefix = re.split(r".g[tf]f", Path(self.feature_file).name)[0] - self.gtf_bed = str(Path(self.temp_loc) / f"{gtf_prefix}.bed") - self.use_feature_names = True # Use feature attributes as region names - elif re.search(r"\.gff3(?:\.gz)?$", f_ext, re.I): - self.feature_type = "genes" - self.intersect_file = str( - Path(self.temp_loc) / f"{variant_prefix}_intersect_genes.bed" - ) - self.is_gene_file = True - gtf_prefix = re.split(r".gff3", Path(self.feature_file).name)[0] - self.gtf_bed = str(Path(self.temp_loc) / f"{gtf_prefix}.bed") - self.use_feature_names = True # Use feature attributes as feature names else: raise ValueError( - f"Invalid feature file type. Expected .bed, .gtf, or .gff3, got: {self.feature_file}" + f"Invalid feature file type. Expected BED or MACS2 peak file, got: {self.feature_file}" ) else: self.intersect_file = self.vcf_bed - self.is_gene_file = False - - # TODO UPDATE THIS WHEN I ADD AUTOPARSERS - if self.is_gene_file: - # Possible edge case of vcf and gtf prefix conflict - if self.vcf_bed == self.gtf_bed: - self.gtf_bed = str(Path(self.temp_loc) / "genes.bed") @tempdir_decorator @@ -144,7 +119,7 @@ def run_count_variants_sc( out_file: str | None = None, temp_loc: str | None = None, ) -> None: - """Run single-cell variant counting pipeline. + """Run single-cell ATAC variant counting pipeline. Parameters ---------- @@ -155,7 +130,7 @@ def run_count_variants_sc( barcode_file : str Path to cell barcode file (one barcode per line). feature_file : str | None, optional - Path to feature/region file (BED, GTF, or GFF3). + Path to feature/region file (BED or MACS2 peak file). samples : str | list[str] | None, optional Sample ID(s) to process. use_region_names : bool, optional @@ -192,13 +167,13 @@ def run_count_variants_sc( ) assert count_files.feature_file is not None + intersect_vcf_region( vcf_file=count_files.vcf_bed, region_file=count_files.feature_file, out_file=count_files.intersect_file, ) - # TODO: handle use_region_names better df = parse_intersect_region_new( intersect_file=count_files.intersect_file, samples=count_files.samples, @@ -206,13 +181,26 @@ def run_count_variants_sc( region_col=None, ) + # Guard: if no variants survived intersection, warn and write empty output + if df.is_empty(): + logging.getLogger(__name__).warning( + "No variants found after intersection — writing empty output file." + ) + import anndata as ad + + ad.AnnData().write_h5ad(count_files.out_file) + return + # TODO: handle case where barcode file contains multiple columns with open(count_files.barcode_file) as file: bc_dict = {line.rstrip(): i for i, line in enumerate(file)} # Generate Output adata = make_count_matrix( - bam_file=count_files.bam_file, df=df, bc_dict=bc_dict, include_samples=count_files.samples + bam_file=count_files.bam_file, + df=df, + bc_dict=bc_dict, + include_samples=count_files.samples, ) # Write outputs diff --git a/src/mapping/intersect_variant_data.py b/src/mapping/intersect_variant_data.py index 6d52aba..307ac64 100644 --- a/src/mapping/intersect_variant_data.py +++ b/src/mapping/intersect_variant_data.py @@ -10,9 +10,8 @@ import os import subprocess from pathlib import Path +from typing import TYPE_CHECKING -import numpy as np -import polars as pl import pysam # Multi-format variant support @@ -25,6 +24,9 @@ logger = logging.getLogger(__name__) +if TYPE_CHECKING: + import polars as pl + def vcf_to_bed( vcf_file: str | Path, @@ -148,7 +150,7 @@ def make_intersect_df( intersect_file: str, samples: list[str], is_paired: bool = True, -) -> pl.DataFrame: +) -> "pl.DataFrame": """Parse intersection file into a typed polars DataFrame. Parameters @@ -165,6 +167,9 @@ def make_intersect_df( pl.DataFrame Parsed intersection data with alleles split by sample. """ + import numpy as np + import polars as pl + # Create Dataframe df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0) diff --git a/src/mapping/remap_utils.py b/src/mapping/remap_utils.py index ebf8534..416ebde 100644 --- a/src/mapping/remap_utils.py +++ b/src/mapping/remap_utils.py @@ -1,13 +1,15 @@ import logging from collections.abc import Generator -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np -import polars as pl from pysam import AlignedSegment, AlignmentFile logger = logging.getLogger(__name__) +if TYPE_CHECKING: + import polars as pl + # Generator for iterating through bam def paired_read_gen( @@ -127,13 +129,13 @@ def _build_ref2read_maps(read: AlignedSegment) -> tuple[dict[int, int], dict[int def get_read_het_data( - read_df: pl.DataFrame, + read_df: "pl.DataFrame", read: AlignedSegment, col_list: list[str], max_seqs: int | None = None, include_indels: bool = False, insert_qual: int = 30, -) -> tuple[list[str], list[Any], list[pl.Series]] | None: +) -> tuple[list[str], list[Any], list["pl.Series"]] | None: """Extract heterozygous variant data from read with indel support. Args: @@ -150,6 +152,8 @@ def get_read_het_data( split_qual: List of quality score segments allele_series: List of polars Series with allele data """ + import polars as pl + pos_list = read_df.select(["start", "stop"]).rows() assert read.query_sequence is not None, "Read has no query sequence"