diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3518670..2e37711 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -146,8 +146,8 @@ jobs:
- name: Run cargo test
working-directory: rust
- run: cargo test
+ run: PYO3_PYTHON=$(command -v python3) cargo test
- name: Run clippy
working-directory: rust
- run: cargo clippy -- -W warnings
+ run: PYO3_PYTHON=$(command -v python3) cargo clippy -- -W warnings
diff --git a/.gitignore b/.gitignore
index 058d27f..9418dda 100644
--- a/.gitignore
+++ b/.gitignore
@@ -154,6 +154,11 @@ benchmark_figures/
# Sanity test data (downloaded from GitHub releases)
tests/sanity/data/
+# Real data test files (downloaded from 1000 Genomes, ~2-3 GB)
+tests/real_data/data/
+tests/real_data/samplesheets/
+tests/real_data/configs/
+
# Nextflow runtime
.nextflow/
.nextflow.log*
@@ -175,7 +180,29 @@ test-output/
results_stub/
pipelines/*/test-output/
pipelines/*/results_stub/
+pipelines/*/results_*/
+pipelines/*/artifacts/
+
+# Artifacts directory
+artifacts/
+
+# Benchmark infrastructure (large data/envs/results)
+test_benchmarks/
+
+# Claude Code local state
+.claude/
+
+# Nextflow pipeline-level logs
+pipelines/*/.nextflow.log*
+pipelines/*/.nf-test.log
+
+# Nextflow reports and visualizations
+trace.txt
+timeline.html
+report.html
+dag.svg
+dag.dot
# Claude Code memory files (per-directory)
**/CLAUDE.md
-!./CLAUDE.md
+!/CLAUDE.md
diff --git a/Dockerfile b/Dockerfile
index 0ade5c9..d3a4821 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -82,6 +82,8 @@ LABEL maintainer="Jeff Jaureguy "
# Install runtime deps + temporary build deps for pybedtools C++ extension
# Combined into one RUN to minimize layers; build tools purged at the end
RUN apt-get update && apt-get install -y --no-install-recommends \
+ # PID 1 init for proper signal handling (Nextflow/HPC)
+ tini \
# Bioinformatics tools
samtools \
bcftools \
@@ -106,14 +108,16 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install /tmp/*.whl \
&& rm -rf /tmp/*.whl \
&& apt-get purge -y --auto-remove g++ zlib1g-dev \
- && rm -rf /var/lib/apt/lists/*
+ && rm -rf /var/lib/apt/lists/* \
+ && ! command -v g++
WORKDIR /app
# Verify non-Python tools are available (Python tools skipped during build
# because Polars uses AVX2 instructions that fail under QEMU emulation
# on ARM64 CI runners building linux/amd64 images)
-RUN samtools --version && bcftools --version && bedtools --version
+RUN samtools --version && bcftools --version && bedtools --version \
+ && wasp2-ipscore --help > /dev/null 2>&1
# Create non-root user for security
RUN groupadd -g 1000 wasp2 && \
@@ -147,5 +151,7 @@ WORKDIR /data
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD wasp2-count --version || exit 1
+ENTRYPOINT ["tini", "--"]
+
# Default command
CMD ["wasp2-count", "--help"]
diff --git a/Makefile b/Makefile
index d99addc..8e67b2a 100644
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@
.PHONY: all build install test test-quick test-sanity lint format clean help
.PHONY: download-sanity-data sanity-data-local rust-build rust-test
+.PHONY: test-mapping-parity
# Configuration
PYTHON ?= python
@@ -48,7 +49,7 @@ rust-dev: ## Build Rust extension in debug mode (faster compile)
$(MATURIN) develop -m $(RUST_DIR)/Cargo.toml
rust-test: ## Run Rust unit tests
- cd $(RUST_DIR) && $(CARGO) test
+ cd $(RUST_DIR) && PYO3_PYTHON=$$($(PYTHON) -c "import sys; print(sys.executable)") $(CARGO) test
rust-bench: ## Run Rust benchmarks
cd $(RUST_DIR) && $(CARGO) bench
@@ -68,6 +69,9 @@ test-quick: ## Run quick validation tests only
test-rust: ## Run Rust-specific tests
$(PYTEST) $(TESTS_DIR) -v --tb=short -m "rust"
+test-mapping-parity: ## Run mapping parity tests against legacy and unified paths
+ $(PYTEST) $(TESTS_DIR)/regression/test_mapping_stage_parity.py -v --tb=short
+
test-integration: ## Run integration tests
$(PYTEST) $(TESTS_DIR) -v --tb=short -m "integration"
diff --git a/README.md b/README.md
index 29b7667..d89e29a 100644
--- a/README.md
+++ b/README.md
@@ -15,114 +15,100 @@
-
-
-
-
-
-
- Documentation •
- McVicker Lab •
- Original WASP
----
-
## Installation
-### Recommended: Bioconda
+### Bioconda
```bash
mamba install -c conda-forge -c bioconda wasp2
```
-Installs WASP2 and all dependencies (samtools, bcftools, bedtools, htslib) automatically. Available for Linux (x86_64, aarch64) and macOS (Intel, Apple Silicon). Requires [miniforge](https://github.com/conda-forge/miniforge).
-
-### Via PyPI
+### PyPI
```bash
pip install wasp2
```
-Pre-built wheels for Linux (x86_64, aarch64) and macOS (Intel, Apple Silicon) with Python 3.10-3.13. The Rust extension and htslib are bundled in the wheel. Requires samtools, bcftools, and bedtools installed separately.
+The PyPI package does not install external tools such as `samtools`,
+`bcftools`, or `bedtools`; install those separately.
-### For development
-
-```bash
-git clone https://github.com/mcvickerlab/WASP2.git
-cd WASP2
-pixi install # resolves all dependencies including Rust toolchain
-pixi run verify # build + test
-```
-
-### Via Docker
+### Docker
```bash
docker pull ghcr.io/mcvickerlab/wasp2:1.4.0
-docker run --rm -v $PWD:/data ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help
```
-Multi-platform image (linux/amd64 + linux/arm64) with all dependencies included.
-
-### Via Singularity/Apptainer (HPC)
+### Singularity/Apptainer
```bash
singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
singularity exec wasp2.sif wasp2-count --help
```
-### Reproducible Environment (conda-lock)
+## CLI Tools
-For fully pinned, reproducible installs (HPC clusters, CI, shared lab environments):
+WASP2 installs four command-line entry points:
-```bash
-# Recommended: mamba (fastest)
-mamba create -n WASP2 --file conda-lock.yml
+- `wasp2-map`
+- `wasp2-count`
+- `wasp2-analyze`
+- `wasp2-ipscore`
-# Or with conda
-conda-lock install -n WASP2 conda-lock.yml
-```
+## Quick Start
-`conda-lock.yml` pins every package to exact versions with checksums for `linux-64` and `osx-64`. To regenerate after updating `environment.lock.yml`:
+### 1. Correct mapping bias
```bash
-conda-lock lock -f environment.lock.yml --lockfile conda-lock.yml
-```
-
-See the [documentation](https://mcvickerlab.github.io/WASP2/) for detailed install options and development setup.
+wasp2-map make-reads input.bam variants.vcf.gz -s sample1 -o remap_dir
-## Quick Start
+# Realign remap_dir/*_swapped_alleles_r1.fq and r2.fq with the same aligner
+# and settings used for the original BAM, then:
-WASP2 has three steps that run in order:
+wasp2-map filter-remapped remapped.bam \
+ -j remap_dir/input_wasp_data_files.json \
+ -o filtered.bam
+```
-**Step 1: Remap reads** to correct mapping bias
+### 2. Count alleles
```bash
-wasp2-map make-reads input.bam variants.vcf.gz -s sample1 -o remap_dir/
-# Realign the swapped-allele reads with your aligner, then:
-wasp2-map filter-remapped remapped.bam -j remap_dir/sample1_wasp_data_files.json -o filtered.bam
+wasp2-count count-variants filtered.bam variants.vcf.gz -s sample1 -o counts.tsv
```
-**Step 2: Count alleles** at heterozygous SNPs
+### 3. Test for imbalance
```bash
-wasp2-count count-variants filtered.bam variants.vcf.gz -s sample1
+wasp2-analyze find-imbalance counts.tsv -o ai_results.tsv
```
-**Step 3: Test for allelic imbalance**
+## Single-Cell Example
```bash
-wasp2-analyze find-imbalance counts.tsv -o results.tsv
+wasp2-count count-variants-sc \
+ cellranger.bam \
+ variants.vcf.gz \
+ barcodes.tsv \
+ --samples sample1 \
+ --feature genes.gtf \
+ --out_file allele_counts.h5ad
+
+wasp2-analyze find-imbalance-sc \
+ allele_counts.h5ad \
+ barcode_groups.tsv \
+ --sample sample1 \
+ -o ai_results.tsv
```
-See the [documentation](https://mcvickerlab.github.io/WASP2/) for detailed usage, single-cell workflows, and supported variant formats (VCF, BCF, PGEN).
-
-## Authors
-
-- **Aaron Ho** — Creator of WASP2
-- **Jeff Jaureguy** — Developer and maintainer
-- **[McVicker Lab](https://mcvicker.salk.edu/)**, Salk Institute
+## iPSCORE Utilities
-## Citation
+```bash
+wasp2-ipscore inventory --output inventory.tsv
+wasp2-ipscore manifest --output manifest.csv
+wasp2-ipscore validate
+```
-If you use WASP2 in your research, please cite our paper (coming soon).
+See the [documentation](https://mcvickerlab.github.io/WASP2/) for complete
+usage, tutorials, and API details.
diff --git a/docs/CONTAINER_USAGE.md b/docs/CONTAINER_USAGE.md
index 8dd77e7..6bda535 100644
--- a/docs/CONTAINER_USAGE.md
+++ b/docs/CONTAINER_USAGE.md
@@ -1,262 +1,81 @@
-# WASP2 Container Usage Guide
+# WASP2 Container Usage
-This guide covers how to use WASP2 containers for local development, HPC clusters, and Nextflow pipelines.
+## Verified Docker Flow
-## Container Registries
-
-WASP2 images are available from:
-
-| Registry | Image | Pull Command |
-|----------|-------|--------------|
-| **DockerHub** | `mcvickerlab/wasp2` | `docker pull mcvickerlab/wasp2:latest` |
-| **GitHub Container Registry** | `ghcr.io/mcvickerlab/wasp2` | `docker pull ghcr.io/mcvickerlab/wasp2:latest` |
-
-### Available Tags
-
-- `:latest` - Most recent release
-- `:1.3.0` - Specific version
-- `:1.3` - Minor version (tracks patches)
-- `:main` - Development builds from main branch
-
-## Docker Usage
-
-### Pull and Run
-
-```bash
-# Pull the image
-docker pull mcvickerlab/wasp2:latest
-
-# Run WASP2 commands
-docker run --rm mcvickerlab/wasp2 wasp2-count --help
-docker run --rm mcvickerlab/wasp2 wasp2-map --help
-docker run --rm mcvickerlab/wasp2 wasp2-analyze --help
-
-# Process files (mount local directory)
-docker run --rm -v $(pwd):/data mcvickerlab/wasp2 \
- wasp2-count /data/sample.bam /data/variants.vcf.gz -o /data/counts.tsv
-```
-
-### Interactive Shell
+The Docker image validated for this update is:
```bash
-docker run -it --rm -v $(pwd):/data mcvickerlab/wasp2 /bin/bash
+ghcr.io/mcvickerlab/wasp2:1.4.0
```
-## Singularity/Apptainer Usage (HPC)
-
-### Pull from Docker Registry
+Pull and inspect the available CLI tools:
```bash
-# Pull and convert to SIF
-singularity pull wasp2.sif docker://mcvickerlab/wasp2:latest
+docker pull ghcr.io/mcvickerlab/wasp2:1.4.0
-# Or from GHCR
-singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:latest
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-map --help
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-analyze --help
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-ipscore --help
```
-### Build from Definition File
+Mount local data when running workflows:
```bash
-# Clone the repository
-git clone https://github.com/mcvickerlab/WASP2.git
-cd WASP2
-
-# Build the container
-singularity build wasp2.sif Singularity.def
+docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
+ wasp2-count count-variants /data/sample.bam /data/variants.vcf.gz -o /data/counts.tsv
```
-### Run Commands
+## Intended Singularity / Apptainer Flow
-```bash
-# Run WASP2 commands
-singularity exec wasp2.sif wasp2-count --help
-
-# Process files (current directory is auto-bound)
-singularity exec wasp2.sif wasp2-count sample.bam variants.vcf.gz -o counts.tsv
-
-# With explicit bindings
-singularity exec --bind /scratch:/scratch wasp2.sif \
- wasp2-map make-reads /scratch/input.bam /scratch/variants.vcf
-```
-
-### SLURM Job Script Example
+For HPC environments using SIF images:
```bash
-#!/bin/bash
-#SBATCH --job-name=wasp2
-#SBATCH --cpus-per-task=8
-#SBATCH --mem=32G
-#SBATCH --time=4:00:00
-
-module load singularity
-
-CONTAINER=/path/to/wasp2.sif
-
-# Count variants (wasp2-count does not have --threads option)
-singularity exec ${CONTAINER} wasp2-count \
- input.bam \
- variants.vcf.gz \
- -o counts.tsv
-
-# WASP mapping filter (supports --threads)
-singularity exec ${CONTAINER} wasp2-map make-reads \
- input.bam \
- variants.vcf.gz \
- --threads ${SLURM_CPUS_PER_TASK} \
- --out_dir ./wasp_output
-```
-
-## Nextflow Integration
-
-### Configuration
-
-Add to your `nextflow.config`:
-
-```groovy
-profiles {
- docker {
- docker.enabled = true
- docker.runOptions = '-u $(id -u):$(id -g)'
- }
- singularity {
- singularity.enabled = true
- singularity.autoMounts = true
- singularity.cacheDir = "${HOME}/.singularity/cache"
- }
-}
-
-process {
- withLabel: 'wasp2' {
- container = 'mcvickerlab/wasp2:latest'
- }
-}
-```
-
-### Running Pipelines
-
-```bash
-# With Docker
-nextflow run main.nf -profile docker
-
-# With Singularity
-nextflow run main.nf -profile singularity
-```
-
-## Building Locally
-
-### Docker Build
-
-```bash
-# Clone repository
-git clone https://github.com/mcvickerlab/WASP2.git
-cd WASP2
-
-# Build image
-docker build -t wasp2:local .
-
-# Test the build
-docker run --rm wasp2:local wasp2-count --version
-docker run --rm wasp2:local python -c "import wasp2_rust; print('OK')"
+singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
+singularity exec wasp2.sif wasp2-count --help
```
-### Manual Build (for maintainers)
-
-Note: Currently only `linux/amd64` is supported.
+or:
```bash
-# Set up buildx
-docker buildx create --name wasp2builder --use
-
-# Build with version argument
-docker buildx build \
- --platform linux/amd64 \
- --build-arg VERSION=1.3.0 \
- -t mcvickerlab/wasp2:1.3.0 \
- -t mcvickerlab/wasp2:latest \
- --push .
+apptainer pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
+apptainer exec wasp2.sif wasp2-count --help
```
-## Container Contents
-
-The WASP2 container includes:
+These commands show the intended container workflow, but they were not
+executed in this development environment because neither `singularity` nor
+`apptainer` was installed locally.
-### Python Environment
-- Python 3.10+ (container uses 3.11)
-- wasp2 package with Rust extension
-- Core: pysam, pandas (<2.0), numpy, scipy, polars
-- CLI: typer, rich
-- Single-cell: anndata, scanpy (optional)
-
-### Rust Components
-- Pre-built `wasp2_rust` Python extension
-- Compiled with release optimizations
-
-### CLI Tools
-
-Each tool has subcommands for different analysis modes:
-
-- **`wasp2-count`** - Allele counting
- - `count-variants` - Bulk allele counting at heterozygous sites (default)
- - `count-variants-sc` - Single-cell allele counting
-
-- **`wasp2-map`** - WASP mapping filter
- - `make-reads` - Generate reads with swapped alleles for remapping
- - `filter-remapped` - Filter remapped reads using WASP algorithm
-
-- **`wasp2-analyze`** - Statistical analysis
- - `find-imbalance` - Calculate allelic imbalance
- - `find-imbalance-sc` - Single-cell allelic imbalance analysis
- - `compare-imbalance` - Compare imbalance between cell types/groups
-
-### Bioinformatics Tools
-- samtools
-- bcftools
-- bedtools
-- tabix
-
-## Troubleshooting
-
-### Permission Issues (Docker)
+## Mapping Example
```bash
-# Run as current user
-docker run --rm -u $(id -u):$(id -g) -v $(pwd):/data mcvickerlab/wasp2 ...
+docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
+ wasp2-map make-reads /data/sample.bam /data/variants.vcf.gz \
+ --samples sample1 \
+ --out_dir /data/remap_dir
```
-### Cache Issues (Singularity)
+After realigning the swapped FASTQ reads with your aligner of choice:
```bash
-# Clear Singularity cache
-singularity cache clean
-
-# Use different cache directory
-export SINGULARITY_CACHEDIR=/scratch/singularity_cache
+docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
+ wasp2-map filter-remapped /data/remapped.bam \
+ --wasp_data_json /data/remap_dir/sample_wasp_data_files.json \
+ --out_bam /data/filtered.bam
```
-### Verify Installation
+## Counting Example
```bash
-# Docker
-docker run --rm mcvickerlab/wasp2 wasp2-count --version
-docker run --rm mcvickerlab/wasp2 python -c "import wasp2_rust; print('Rust extension OK')"
-
-# Singularity
-singularity exec wasp2.sif wasp2-count --version
-singularity exec wasp2.sif python -c "import wasp2_rust; print('Rust extension OK')"
+docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
+ wasp2-count count-variants /data/filtered.bam /data/variants.vcf.gz \
+ --samples sample1 \
+ --region /data/genes.gtf \
+ --out_file /data/counts.tsv
```
-## GitHub Actions Secrets Setup
-
-To enable automated container builds, repository maintainers must configure:
-
-1. **DockerHub Secrets** (Settings → Secrets and variables → Actions):
- - `DOCKERHUB_USERNAME`: Your DockerHub username
- - `DOCKERHUB_TOKEN`: DockerHub access token (Account Settings → Security → Access Tokens)
-
-2. **GitHub Container Registry**: Uses `GITHUB_TOKEN` automatically (no setup needed)
-
-## Related Documentation
+## Notes
-- [Nextflow Pipelines](../pipelines/nf-atacseq/README.md)
-- [WASP2 Ecosystem](WASP2_ECOSYSTEM.md)
-- [GitHub Repository](https://github.com/mcvickerlab/WASP2)
+- The image contains the WASP2 package plus `samtools`, `bcftools`, and `bedtools`.
+- The documented public mapping workflow is `make-reads -> realign -> filter-remapped`.
+- `wasp2-ipscore` is present in the container alongside the main analysis tools.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1a5fc6a..ab712bd 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -60,6 +60,7 @@ Documentation
user_guide/mapping
user_guide/analysis
user_guide/single_cell
+ user_guide/ipscore
.. toctree::
:maxdepth: 2
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index f039a97..51e9c15 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,25 +1,16 @@
Installation
============
-Via Bioconda (Recommended)
---------------------------
-
-`Bioconda `_ installs WASP2 and **all** dependencies
-(samtools, bcftools, bedtools, htslib) in one command. Requires
-`miniforge `_.
+Via Bioconda
+------------
.. code-block:: bash
mamba install -c conda-forge -c bioconda wasp2
-Or with conda:
-
-.. code-block:: bash
-
- conda install -c conda-forge -c bioconda wasp2
-
-Available for Linux (x86_64, aarch64) and macOS (Intel, Apple Silicon) with
-Python 3.10-3.13.
+Bioconda installs WASP2 together with the external command-line dependencies
+required by the workflows, including ``samtools``, ``bcftools``, and
+``bedtools``.
Via PyPI
--------
@@ -28,73 +19,58 @@ Via PyPI
pip install wasp2
-Pre-built wheels include the Rust extension and bundled htslib for Linux
-(x86_64, aarch64) and macOS (Intel, Apple Silicon) with Python 3.10-3.13.
-
-.. note::
-
- The PyPI package does not include samtools, bcftools, or bedtools.
- Install them separately before running WASP2:
-
- * On Ubuntu/Debian: ``sudo apt-get install bcftools bedtools samtools``
- * On macOS: ``brew install bcftools bedtools samtools``
- * Via conda: ``mamba install -c bioconda samtools bcftools bedtools``
+The PyPI wheel includes the WASP2 Python package and Rust extension, but it
+does not install external tools such as ``samtools``, ``bcftools``, or
+``bedtools``. Install those separately before running mapping or counting.
Via Docker
----------
-WASP2 is available as a multi-platform Docker image (linux/amd64 + linux/arm64)
-with all dependencies pre-installed:
+The Docker image is the most reproducible fully bundled option, shipping the
+external command-line dependencies inside one artifact.
.. code-block:: bash
docker pull ghcr.io/mcvickerlab/wasp2:1.4.0
- # Run a command
- docker run --rm -v /path/to/data:/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
- wasp2-count count-variants /data/sample.bam /data/variants.vcf
-
- # Interactive shell
- docker run -it --rm -v /path/to/data:/data ghcr.io/mcvickerlab/wasp2:1.4.0 bash
-
-The image includes samtools, bcftools, bedtools, and the Rust-accelerated backend.
+ docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help
+ docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-map --help
+ docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-analyze --help
+ docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-ipscore --help
Via Singularity/Apptainer
-------------------------
-For HPC environments that don't support Docker:
+For HPC environments that require SIF images:
.. code-block:: bash
singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
singularity exec wasp2.sif wasp2-count --help
-Development Installation
-------------------------
-
-For contributing or building from source:
+or with Apptainer:
.. code-block:: bash
- git clone https://github.com/mcvickerlab/WASP2.git
- cd WASP2
- pixi install # resolves all deps including Rust toolchain
- pixi run verify # build + test suite in one step
+ apptainer pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
+ apptainer exec wasp2.sif wasp2-count --help
-`pixi `_ resolves Python, Rust toolchain, samtools, bcftools,
-bedtools, and htslib automatically. No system packages required.
+.. note::
-Compiling pgenlib
-~~~~~~~~~~~~~~~~~
+ Docker was validated in this development environment. ``apptainer`` /
+ ``singularity`` binaries were not available locally during this doc update,
+ so those examples reflect the intended pull/exec workflow but were not
+ executed here.
-WASP2 optionally uses `pgenlib `_
-for PLINK2 file I/O. This requires a C compiler:
+Development Installation
+------------------------
-* On Ubuntu/Debian: ``sudo apt-get install build-essential python3-dev``
-* On macOS: ``xcode-select --install``
-* On RHEL/Fedora: ``sudo dnf install gcc gcc-c++ python3-devel``
+.. code-block:: bash
-pgenlib is installed automatically via pip when you install WASP2.
+ git clone https://github.com/mcvickerlab/WASP2.git
+ cd WASP2
+ pixi install
+ pixi run verify
Verification
------------
@@ -104,3 +80,4 @@ Verification
wasp2-count --help
wasp2-map --help
wasp2-analyze --help
+ wasp2-ipscore --help
diff --git a/docs/source/methods/mapping_filter.rst b/docs/source/methods/mapping_filter.rst
index c3257f2..37b24d7 100644
--- a/docs/source/methods/mapping_filter.rst
+++ b/docs/source/methods/mapping_filter.rst
@@ -132,7 +132,10 @@ The WASP filter compares original and remapped positions:
.. code-block:: bash
- wasp2-map filter-remapped original_to_remap.bam remapped.bam output.bam
+ wasp2-map filter-remapped \
+ remapped.bam \
+ --wasp_data_json sample_wasp_data_files.json \
+ --out_bam output.bam
A read passes if:
@@ -272,7 +275,7 @@ The typical WASP2 workflow:
# Step 2: Create swapped reads
wasp2-map make-reads sample.bam variants.vcf \
- --samples SAMPLE1 --out-dir wasp_temp/
+ --samples SAMPLE1 --out_dir wasp_temp
# Step 3: Remap swapped reads (SAME ALIGNER!)
bwa mem -M genome.fa wasp_temp/swapped_r1.fq wasp_temp/swapped_r2.fq | \
diff --git a/docs/source/tutorials/quickstart_mapping.rst b/docs/source/tutorials/quickstart_mapping.rst
index 57baffb..41317a0 100644
--- a/docs/source/tutorials/quickstart_mapping.rst
+++ b/docs/source/tutorials/quickstart_mapping.rst
@@ -68,7 +68,7 @@ Identify reads overlapping heterozygous SNPs and generate allele-swapped version
wasp2-map make-reads sample.bam variants.vcf.gz \
--samples SAMPLE1 \
- --out-dir wasp_output/
+ --out_dir wasp_output
This produces (where ``sample`` is your BAM file prefix):
@@ -103,9 +103,9 @@ The WASP filter compares original and remapped positions:
.. code-block:: bash
wasp2-map filter-remapped \
- wasp_output/sample_to_remap.bam \
wasp_output/sample_remapped.bam \
- wasp_output/sample_wasp_filtered.bam
+ --wasp_data_json wasp_output/sample_wasp_data_files.json \
+ --out_bam wasp_output/sample_wasp_filtered.bam
Understanding Filter Statistics
-------------------------------
@@ -172,7 +172,7 @@ Complete Workflow Script
echo "Step 1: Creating swapped reads..."
wasp2-map make-reads $BAM $VCF \
--samples $SAMPLE \
- --out-dir $OUTDIR/
+ --out_dir $OUTDIR
# Step 2: Remap with same aligner
echo "Step 2: Remapping swapped reads..."
@@ -185,9 +185,9 @@ Complete Workflow Script
# Step 3: Filter biased reads
echo "Step 3: Filtering biased reads..."
wasp2-map filter-remapped \
- $OUTDIR/${PREFIX}_to_remap.bam \
$OUTDIR/${PREFIX}_remapped.bam \
- $OUTDIR/${PREFIX}_wasp_filtered.bam
+ --wasp_data_json $OUTDIR/${PREFIX}_wasp_data_files.json \
+ --out_bam $OUTDIR/${PREFIX}_wasp_filtered.bam
# Step 4: Merge with non-overlapping reads
echo "Step 4: Merging final BAM..."
diff --git a/docs/source/tutorials/scrna_seq.rst b/docs/source/tutorials/scrna_seq.rst
index dcdb325..1ee8d54 100644
--- a/docs/source/tutorials/scrna_seq.rst
+++ b/docs/source/tutorials/scrna_seq.rst
@@ -162,7 +162,7 @@ Run the single-cell allele counting:
cellranger_output/outs/possorted_genome_bam.bam \
phased_variants.vcf.gz \
barcodes_celltype.tsv \
- --region genes.gtf \
+ --feature genes.gtf \
--samples SAMPLE_ID \
--out_file allele_counts.h5ad
@@ -293,7 +293,7 @@ Low Read Counts
Single-cell data is sparse. Consider:
* Using pseudo-bulk aggregation by cell type
-* Lowering ``--min-count`` threshold
+* Lowering the ``--min`` / ``--min_count`` threshold
* Focusing on highly expressed genes
Memory Issues
diff --git a/docs/source/user_guide/analysis.rst b/docs/source/user_guide/analysis.rst
index f486d11..23b015e 100644
--- a/docs/source/user_guide/analysis.rst
+++ b/docs/source/user_guide/analysis.rst
@@ -4,520 +4,92 @@ Analysis Module
Overview
--------
-The analysis module detects statistically significant allelic imbalance using beta-binomial models.
+``wasp2-analyze`` runs allelic imbalance analysis on bulk or single-cell count
+outputs.
-Purpose
--------
-
-* Detect allelic imbalance at genomic regions
-* Control for biological and technical variation
-* Support single-cell and bulk RNA-seq
-* Compare imbalance between groups/conditions
-
-Statistical Models
-------------------
-
-Beta-Binomial Model
-~~~~~~~~~~~~~~~~~~~
-
-WASP2 uses beta-binomial distribution to model:
-* Overdispersion (variation beyond binomial)
-* Biological variability between regions
-* Technical noise in sequencing
-
-The model:
-* Null hypothesis: Equal expression from both alleles (p=0.5)
-* Alternative: Allelic imbalance (p ≠ 0.5)
-* FDR correction for multiple testing
+Commands:
-Dispersion Parameter
-~~~~~~~~~~~~~~~~~~~~
-
-Two models:
-1. **Single**: One dispersion parameter for all regions
-2. **Linear**: Dispersion varies with read depth
-
-CLI Usage
----------
-
-Basic Analysis
-~~~~~~~~~~~~~~
-
-.. code-block:: bash
+* ``find-imbalance``: bulk allelic imbalance from TSV counts
+* ``find-imbalance-sc``: per-group single-cell imbalance from ``.h5ad`` counts
+* ``compare-imbalance``: differential imbalance between single-cell groups
- wasp2-analyze find-imbalance counts.tsv
-
-Options
-~~~~~~~
+Bulk Analysis
+-------------
.. code-block:: bash
wasp2-analyze find-imbalance \
counts.tsv \
- --min-count 10 \
+ --min 10 \
--pseudocount 1 \
- --model single \
- --output results.tsv
-
-Parameters
-----------
-
-``--min-count``
-~~~~~~~~~~~~~~~
-
-Minimum total read count per region (default: 10):
-
-.. code-block:: bash
-
- --min-count 20 # More stringent
-
-``--pseudocount``
-~~~~~~~~~~~~~~~~~
-
-Pseudocount added to avoid zero counts (default: 1):
-
-.. code-block:: bash
-
- --pseudocount 0 # No pseudocount
-
-``--model``
-~~~~~~~~~~~
-
-Dispersion model (default: single):
-
-.. code-block:: bash
-
- --model linear # Depth-dependent dispersion
-
-``--phased``
-~~~~~~~~~~~~
-
-Use phased genotype information:
-
-.. code-block:: bash
-
- --phased # Requires phased VCF
-
-Output Format
--------------
-
-Tab-separated file with columns:
-
-Statistical Columns
-~~~~~~~~~~~~~~~~~~~
-
-* ``region``: Genomic region identifier
-* ``ref_count``: Total reference allele counts
-* ``alt_count``: Total alternate allele counts
-* ``p_value``: Likelihood ratio test p-value
-* ``fdr_pval``: FDR-corrected p-value
-* ``effect_size``: Log2 fold-change (ref/alt)
+ --output ai_results.tsv
-Model Parameters
-~~~~~~~~~~~~~~~~
+Useful options:
-* ``dispersion``: Beta-binomial dispersion parameter
-* ``log_likelihood_null``: Null model log-likelihood
-* ``log_likelihood_alt``: Alternative model log-likelihood
-
-Interpreting Results
---------------------
-
-Significant Imbalance
-~~~~~~~~~~~~~~~~~~~~~
-
-FDR < 0.05 indicates significant imbalance:
-
-* **Biological**: cis-regulatory variation, ASE
-* **Technical**: mapping bias (check WASP), PCR artifacts
-
-Effect Size
-~~~~~~~~~~~
-
-* log2FC > 1: Strong imbalance (2-fold difference)
-* log2FC > 2: Very strong imbalance (4-fold difference)
+* ``--min`` / ``--min_count``: minimum total count threshold
+* ``--pseudocount``: pseudocount added before modeling
+* ``--model``: dispersion model (``single`` or ``linear``)
+* ``--output`` / ``--out_file`` / ``-o``: output TSV path
+* ``--region_col``: explicit region column name if auto-detection is not desired
+* ``--groupby``: group results by an alternate annotation column (e.g., a parent gene column)
Single-Cell Analysis
--------------------
-For single-cell data, WASP2 detects allelic imbalance within specific cell populations
-using aggregated counts across cells of the same type.
-
.. code-block:: bash
wasp2-analyze find-imbalance-sc \
- adata.h5ad \
- barcode_map.tsv \
- --sample donor1 \
- --min-count 10
-
-Output: Per-celltype TSV files (``ai_results_[CELLTYPE].tsv``).
-
-Single-Cell Comparative Imbalance
----------------------------------
-
-Overview
-~~~~~~~~
-
-The comparative imbalance analysis detects **differential allelic imbalance** between
-cell types, conditions, or biological groups. This is useful for identifying:
-
-* Cell-type-specific regulatory variation
-* Sex differences in chromatin accessibility
-* Condition-dependent allelic effects (e.g., treatment vs control)
-* Developmental stage-specific imbalance
-
-Statistical Model
-~~~~~~~~~~~~~~~~~
-
-The analysis uses a **likelihood ratio test (LRT)** comparing two hypotheses:
-
-* **Null (H0)**: Both groups share the same allelic imbalance (μ_combined)
-* **Alternative (H1)**: Groups have different imbalance (μ₁ ≠ μ₂)
-
-The test statistic follows a chi-squared distribution with 1 degree of freedom:
-
-.. code-block:: text
-
- LRT = -2 × (log L_null - log L_alt)
- p-value = P(χ²(df=1) > LRT)
-
-Input Format: Count Matrix (.h5ad)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The count matrix must be an AnnData object with the following structure:
-
-.. code-block:: text
-
- AnnData object (n_obs × n_vars)
- ├── .obs # SNP metadata (rows)
- │ ├── index # SNP identifiers (0, 1, 2, ...)
- │ └── [sample_name] # Genotypes: '0|1', '1|0', '0/1', '1/0'
- │
- ├── .var # Cell metadata (columns)
- │ └── group # Cell type/group assignment
- │
- ├── .layers
- │ ├── "ref" # Reference allele counts (sparse matrix)
- │ └── "alt" # Alternate allele counts (sparse matrix)
- │
- └── .uns
- ├── feature # DataFrame: SNP → region mapping
- └── samples # List of sample names
-
-**Example count matrix creation:**
-
-.. code-block:: bash
-
- # Generate counts from BAM + variants + barcodes
- wasp2-count count-variants-sc \
- aligned.bam \
- variants.vcf.gz \
- barcodes.txt \
- --samples NA12878 \
- --feature peaks.bed \
- --out_file allele_counts.h5ad
-
-Input Format: Barcode Map (TSV)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-A two-column TSV file (no header) mapping cell barcodes to groups:
-
-.. code-block:: text
-
- AAACGAACAGTCAGTT-1 excitatory_neurons
- AAACGAAGTCGCTCTA-1 inhibitory_neurons
- AAACGAAGTGAACCTA-1 excitatory_neurons
- AAAGGATCATCGATGT-1 astrocytes
- AAAGGATGTGCAACGA-1 microglia
-
-**Requirements:**
-
-* Tab-separated (``\t``)
-* No header row
-* Barcodes must match those in the count matrix
-* Groups can be cell types, conditions, sex, or any categorical variable
-
-Basic Usage
-~~~~~~~~~~~
-
-Compare imbalance between all groups:
-
-.. code-block:: bash
-
- wasp2-analyze compare-imbalance \
- allele_counts.h5ad \
- barcode_map.tsv
-
-Compare specific groups only:
-
-.. code-block:: bash
-
- wasp2-analyze compare-imbalance \
allele_counts.h5ad \
- barcode_map.tsv \
- --groups "excitatory_neurons,inhibitory_neurons"
-
-Output Format
-~~~~~~~~~~~~~
-
-Results are written to ``ai_results_[GROUP1]_[GROUP2].tsv``:
-
-.. list-table::
- :header-rows: 1
- :widths: 20 80
-
- * - Column
- - Description
- * - ``region``
- - Genomic region identifier (peak or gene)
- * - ``num_snps``
- - Number of shared heterozygous SNPs in region
- * - ``combined_mu``
- - Reference allele frequency under null hypothesis (shared)
- * - ``mu1``
- - Reference allele frequency in group 1
- * - ``mu2``
- - Reference allele frequency in group 2
- * - ``null_ll``
- - Log-likelihood under null (shared μ)
- * - ``alt_ll``
- - Log-likelihood under alternative (separate μ values)
- * - ``pval``
- - Likelihood ratio test p-value
- * - ``fdr_pval``
- - FDR-corrected p-value (Benjamini-Hochberg)
-
-**Interpreting results:**
-
-* ``fdr_pval < 0.05``: Significant differential imbalance
-* ``|mu1 - mu2| > 0.1``: Meaningful effect size (~20% difference)
-* ``mu < 0.5``: Alternate allele favored; ``mu > 0.5``: Reference allele favored
-
-Parameters
-~~~~~~~~~~
-
-``--groups``
- Comma-separated list of groups to compare. If omitted, compares all pairwise
- combinations found in the barcode map.
-
-``--min``
- Minimum total allele count per region per group (default: 10). Higher values
- increase statistical power but reduce the number of testable regions.
-
-``--pseudocount``
- Pseudocount added to avoid zero counts (default: 1). Affects dispersion estimation.
-
-``--sample``
- Sample name for heterozygous SNP filtering. Required if multiple samples are
- present in the count matrix.
-
-``--phased``
- Use phased genotype information from VCF. Requires genotypes in ``0|1`` or ``1|0``
- format. Improves power when haplotype phase is known.
-
-``-z/--z_cutoff``
- Remove SNPs with counts exceeding this z-score threshold. Useful for removing
- outliers caused by mapping artifacts or copy number variation.
-
-Example: Sex Differences Analysis
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Identify chromatin accessibility regions with sex-biased allelic imbalance:
+ barcode_groups.tsv \
+ --sample SAMPLE1 \
+ --min 10 \
+ --out_file ai_results.tsv
-**Step 1: Prepare barcode map with sex labels**
+``barcode_groups.tsv`` is a two-column TSV:
.. code-block:: text
- # barcode_sex_map.tsv
- AAACGAACAGTCAGTT-1 male
- AAACGAAGTCGCTCTA-1 female
- AAACGAAGTGAACCTA-1 male
- AAAGGATCATCGATGT-1 female
-
-**Step 2: Run comparative analysis**
-
-.. code-block:: bash
-
- wasp2-analyze compare-imbalance \
- allele_counts.h5ad \
- barcode_sex_map.tsv \
- --groups "male,female" \
- --min 20 \
- --out_file ai_results_sex_comparison.tsv
-
-**Step 3: Filter significant results**
-
-.. code-block:: bash
-
- # Extract regions with significant sex differences
- awk -F'\t' 'NR==1 || $9 < 0.05' ai_results_male_female.tsv > significant_sex_diff.tsv
-
- # Find regions with large effect size
- awk -F'\t' 'NR==1 || ($4 - $5 > 0.15 || $5 - $4 > 0.15)' significant_sex_diff.tsv
-
-Example: snATAC-seq Cell Type Analysis
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Complete workflow for analyzing cell-type-specific chromatin accessibility imbalance:
-
-**Step 1: Count alleles from snATAC-seq BAM**
-
-.. code-block:: bash
-
- # Extract valid barcodes from Cell Ranger output
- zcat filtered_barcodes.tsv.gz > barcodes.txt
-
- # Count alleles at heterozygous SNPs overlapping peaks
- wasp2-count count-variants-sc \
- possorted_bam.bam \
- phased_variants.vcf.gz \
- barcodes.txt \
- --samples sample1 \
- --feature atac_peaks.bed \
- --out_file snATAC_counts.h5ad
-
-**Step 2: Create barcode-to-celltype mapping**
-
-Export cell type annotations from your clustering analysis (e.g., Seurat, ArchR):
-
-.. code-block:: r
-
- # R/Seurat example
- write.table(
- data.frame(barcode = Cells(seurat_obj),
- celltype = Idents(seurat_obj)),
- "barcode_celltype_map.tsv",
- sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE
- )
-
-**Step 3: Run single-cell imbalance analysis**
-
-.. code-block:: bash
+   BARCODE	GROUP
- # Per-celltype analysis
- wasp2-analyze find-imbalance-sc \
- snATAC_counts.h5ad \
- barcode_celltype_map.tsv \
- --sample sample1 \
- --phased \
- --min 10 \
- -z 3.0
+The command writes one output file per group using the requested output prefix.
-**Step 4: Compare imbalance between cell types**
+Comparative Single-Cell Analysis
+--------------------------------
.. code-block:: bash
- # Compare specific cell types
wasp2-analyze compare-imbalance \
- snATAC_counts.h5ad \
- barcode_celltype_map.tsv \
- --sample sample1 \
- --groups "excitatory,inhibitory,astrocyte" \
- --phased \
- --min 15
-
- # This produces:
- # - ai_results_excitatory_inhibitory.tsv
- # - ai_results_excitatory_astrocyte.tsv
- # - ai_results_inhibitory_astrocyte.tsv
-
-**Step 5: Identify cell-type-specific regulatory regions**
-
-.. code-block:: bash
-
- # Find peaks with differential imbalance between excitatory and inhibitory neurons
- awk -F'\t' '$9 < 0.01 && ($4 > 0.6 || $4 < 0.4)' \
- ai_results_excitatory_inhibitory.tsv > neuron_subtype_specific_peaks.tsv
-
-Best Practices for Single-Cell Analysis
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-**Data Quality:**
-
-* Use WASP-filtered BAM files to remove mapping bias
-* Require ≥10 total counts per region per group (``--min 10``)
-* Apply z-score filtering to remove outliers (``-z 3.0``)
-
-**Statistical Power:**
-
-* Merge similar cell types if individual populations have low coverage
-* Use phased genotypes when available (``--phased``)
-* Focus on regions with multiple SNPs for better estimates
-
-**Interpretation:**
-
-* Consider biological replication across samples
-* Validate top hits with orthogonal methods (allele-specific CRISPR, etc.)
-* Integrate with eQTL data to identify causal variants
-
-Example Workflow
-----------------
-
-.. code-block:: bash
-
- # 1. Count alleles
- wasp2-count count-variants \
- wasp_filtered.bam \
- variants.vcf \
- --region genes.gtf \
- --samples NA12878 \
- --output counts.tsv
-
- # 2. Analyze imbalance
- wasp2-analyze find-imbalance \
- counts.tsv \
- --min-count 20 \
- --model single \
- --output imbalance.tsv
-
- # 3. Filter significant results
- awk '$5 < 0.05' imbalance.tsv > significant.tsv
-
-Best Practices
---------------
-
-Read Depth
-~~~~~~~~~~
-
-* Minimum 10 reads per region (use ``--min-count``)
-* Higher depth = more power
-* Consider downsampling very deep regions
-
-Quality Control
-~~~~~~~~~~~~~~~
+ allele_counts.h5ad \
+ barcode_groups.tsv \
+ --sample SAMPLE1 \
+ --groups B_cell T_cell \
+ --out_file compare_ai.tsv
-* Use WASP-filtered reads
-* Remove low-complexity regions
-* Filter low-quality SNPs
+This compares allelic imbalance between the requested groups and writes one TSV
+per comparison.
-Multiple Testing
-~~~~~~~~~~~~~~~~
+Notes
+-----
-* FDR correction is automatic
-* Consider Bonferroni for very important regions
-* Validate top hits experimentally
+* If your count file contains genotype columns for multiple samples, you must
+ provide ``--sample`` for single-cell analysis.
+* For bulk analysis, region columns are auto-detected when present in the count
+ TSV. Use ``--region_col`` only when you need to override that behavior.
-Common Issues
--------------
-
-No Significant Results
-~~~~~~~~~~~~~~~~~~~~~~
+Outputs
+-------
-* Increase sample size
-* Check read depth (use deeper sequencing)
-* Verify heterozygous SNPs present
+Typical bulk outputs include:
-Many Significant Results
-~~~~~~~~~~~~~~~~~~~~~~~~
+* region or feature identifier
+* aggregated ``ref_count`` and ``alt_count``
+* p-values and FDR-adjusted p-values
-* Check for batch effects
-* Verify WASP filtering was applied
-* Consider stricter FDR threshold
+Typical single-cell outputs include the same statistics stratified by barcode
+group.
Next Steps
----------
-* Validate results with qPCR or DNA-seq
-* Integrate with eQTL data
-* Perform pathway enrichment analysis
+* :doc:`counting` to generate bulk or single-cell counts
+* :doc:`/tutorials/comparative_imbalance` for group-comparison workflows
diff --git a/docs/source/user_guide/counting.rst b/docs/source/user_guide/counting.rst
index 54db55f..b6a0ba6 100644
--- a/docs/source/user_guide/counting.rst
+++ b/docs/source/user_guide/counting.rst
@@ -4,195 +4,135 @@ Counting Module
Overview
--------
-The counting module quantifies allele-specific read counts at heterozygous SNP positions. It's the first step in allelic imbalance analysis.
+``wasp2-count`` counts reads supporting reference and alternate alleles at
+variant positions in BAM files.
-Purpose
-~~~~~~~
+It provides two commands:
-* Count reads supporting reference vs alternate alleles
-* Filter by sample genotype (heterozygous sites)
-* Annotate with genomic regions (genes, peaks)
-* Support single-cell RNA-seq
+* ``count-variants`` for bulk data
+* ``count-variants-sc`` for single-cell data with ``CB``-tagged barcodes
-When to Use
-~~~~~~~~~~~
-
-Use counting when you have:
-* Aligned reads (BAM file)
-* Variant calls (VCF file)
-* Want to quantify allele-specific expression
-
-CLI Usage
----------
+Bulk Counting
+-------------
-Basic Command
-~~~~~~~~~~~~~
+Basic usage:
.. code-block:: bash
- wasp2-count count-variants BAM_FILE VCF_FILE
+ wasp2-count count-variants sample.bam variants.vcf.gz --out_file counts.tsv
-Full Options
-~~~~~~~~~~~~
+With sample filtering and region annotation:
.. code-block:: bash
wasp2-count count-variants \
- input.bam \
- variants.vcf \
- --samples sample1,sample2 \
+ sample.bam \
+ variants.vcf.gz \
+ --samples SAMPLE1 \
--region genes.gtf \
--out_file counts.tsv
-Input Requirements
-------------------
-
-BAM File
-~~~~~~~~
+Supported region files:
-* Aligned reads (single-end or paired-end)
-* Indexed (.bai file in same directory)
-* Sorted by coordinate
+* BED
+* MACS2 ``narrowPeak`` / ``broadPeak``
+* GTF
+* GFF3
-VCF File
-~~~~~~~~
+For GTF/GFF3 inputs, WASP2 derives interval annotations from feature rows and
+defaults to ``gene`` features when present.
-* Variant calls with genotype information
-* Heterozygous SNPs (GT=0|1 or 1|0)
-* Can include sample-specific genotypes
+Useful options:
-Optional: Region File
-~~~~~~~~~~~~~~~~~~~~~
+* ``--samples`` / ``-s``: select het sites for one or more samples
+* ``--region`` / ``-r``: restrict/annotate variants by overlapping regions
+* ``--gene_feature``: choose the GTF/GFF3 feature type
+* ``--gene_attribute``: choose the GTF/GFF3 attribute used as the feature ID
+* ``--gene_parent``: choose the parent/grouping attribute for gene annotations
+* ``--use_region_names``: prefer region names instead of coordinate strings
+* ``--include-indels``: count indels in addition to SNPs
-Annotate SNPs overlapping genes/peaks:
+Output columns always include:
-* GTF/GFF3 format (genes)
-* BED format (peaks, regions)
-* narrowPeak format (ATAC-seq, ChIP-seq)
+* ``chrom``
+* ``pos`` (or both ``pos0`` and ``pos``, depending on the input path)
+* ``ref``
+* ``alt``
+* ``ref_count``
+* ``alt_count``
+* ``other_count``
-Parameters
-----------
+When sample filtering is active, genotype columns are included. When region
+annotation is active, region or gene columns are included as well.
-``--samples`` / ``-s``
-~~~~~~~~~~~~~~~~~~~~~~
+Single-Cell ATAC Counting
+-------------------------
-Filter SNPs heterozygous in specified samples:
+Single-cell counting is designed for **scATAC-seq** data. It requires a BAM
+with ``CB`` tags and a positional barcode file containing one barcode per line.
.. code-block:: bash
- --samples sample1,sample2,sample3
- # or
- --samples samples.txt # one per line
-
-``--region`` / ``-r``
-~~~~~~~~~~~~~~~~~~~~~
-
-Annotate SNPs with overlapping regions:
-
-.. code-block:: bash
-
- --region genes.gtf # Gene annotations
- --region peaks.bed # ATAC-seq peaks
- --region regions.gff3 # Custom regions
-
-``--out_file`` / ``-o``
-~~~~~~~~~~~~~~~~~~~~~~~
-
-Output file path (default: counts.tsv):
-
-.. code-block:: bash
-
- --out_file my_counts.tsv
-
-Output Format
--------------
-
-Tab-separated file with columns:
+ wasp2-count count-variants-sc \
+ sc_atac.bam \
+ variants.vcf.gz \
+ barcodes.tsv \
+ --samples sample1 \
+ --feature peaks.bed \
+ --out_file allele_counts.h5ad
-Basic Columns
-~~~~~~~~~~~~~
+Important points:
-* ``chr``: Chromosome
-* ``pos``: SNP position (1-based)
-* ``ref``: Reference allele
-* ``alt``: Alternate allele
-* ``ref_count``: Reads supporting reference
-* ``alt_count``: Reads supporting alternate
-* ``other_count``: Reads supporting other alleles
+* ``barcodes.tsv`` is a positional argument, not ``--barcode_map``
+* ``--feature`` and ``--region`` are aliases on the single-cell command
+* Accepts BED and MACS2 peak files (GTF/GFF3 are supported only by the bulk ``count-variants`` command)
-Optional Columns (with --region)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The output is an AnnData ``.h5ad`` file with:
-* ``gene_id``: Overlapping gene
-* ``gene_name``: Gene symbol
-* ``feature``: Feature type (exon, intron, etc.)
+* sparse count layers for ``ref``, ``alt``, and ``other``
+* variant metadata in ``adata.obs``
+* barcode names in ``adata.var_names``
+* feature-to-variant mapping in ``adata.uns["feature"]`` when annotations are used
-Example Workflow
-----------------
+Examples
+--------
-1. Basic Counting
-~~~~~~~~~~~~~~~~~
+Count variants without regional annotation:
.. code-block:: bash
- wasp2-count count-variants sample.bam variants.vcf
+ wasp2-count count-variants \
+ filtered.bam \
+ variants.vcf.gz \
+ --samples SAMPLE1 \
+ --out_file counts.tsv
-2. Filter by Sample
-~~~~~~~~~~~~~~~~~~~
+Count variants inside peaks:
.. code-block:: bash
wasp2-count count-variants \
- sample.bam \
- variants.vcf \
- --samples NA12878
+ filtered.bam \
+ variants.vcf.gz \
+ --samples SAMPLE1 \
+ --region peaks.bed \
+ --out_file counts_peaks.tsv
-3. Annotate with Genes
-~~~~~~~~~~~~~~~~~~~~~~
+Count variants inside genes:
.. code-block:: bash
wasp2-count count-variants \
- sample.bam \
- variants.vcf \
- --samples NA12878 \
+ filtered.bam \
+ variants.vcf.gz \
+ --samples SAMPLE1 \
--region genes.gtf \
- --out_file counts_annotated.tsv
-
-Single-Cell Counting
---------------------
-
-For single-cell RNA-seq:
-
-.. code-block:: bash
-
- wasp2-count count-variants-sc \
- sc_rnaseq.bam \
- variants.vcf \
- --barcode_map barcodes.tsv
-
-Output includes cell-type-specific counts.
-
-Common Issues
--------------
-
-Low Count Numbers
-~~~~~~~~~~~~~~~~~
-
-* Check BAM file coverage (``samtools depth``)
-* Verify VCF contains heterozygous SNPs
-* Ensure BAM and VCF use same reference genome
-
-No Output SNPs
-~~~~~~~~~~~~~~
-
-* Check if --samples filter is too restrictive
-* Verify VCF has genotype information (GT field)
-* Ensure BAM file is indexed
+ --gene_feature gene \
+ --gene_attribute gene_id \
+ --out_file counts_genes.tsv
Next Steps
----------
-After counting:
-* :doc:`analysis` - Detect allelic imbalance
-* :doc:`mapping` - Correct reference bias with WASP
+* :doc:`analysis` for statistical testing of allelic imbalance
+* :doc:`/user_guide/single_cell` for barcode grouping and single-cell workflows
diff --git a/docs/source/user_guide/ipscore.rst b/docs/source/user_guide/ipscore.rst
new file mode 100644
index 0000000..993fcbd
--- /dev/null
+++ b/docs/source/user_guide/ipscore.rst
@@ -0,0 +1,53 @@
+iPSCORE Utilities
+=================
+
+Overview
+--------
+
+``wasp2-ipscore`` provides helper commands for working with the iPSCORE
+multi-tissue allelic imbalance resource bundled with WASP2.
+
+Commands
+--------
+
+``inventory``
+~~~~~~~~~~~~~
+
+Validate the expected iPSCORE data inventory and optionally write a TSV report.
+
+.. code-block:: bash
+
+ wasp2-ipscore inventory --output inventory.tsv
+
+``manifest``
+~~~~~~~~~~~~
+
+Create a unified sample manifest across tissues and assays.
+
+.. code-block:: bash
+
+ wasp2-ipscore manifest --output manifest.csv --format csv
+
+``qtls``
+~~~~~~~~
+
+Load and summarize QTL resources, optionally filtering by tissue.
+
+.. code-block:: bash
+
+ wasp2-ipscore qtls --tissue iPSC --output qtls.tsv
+
+``validate``
+~~~~~~~~~~~~
+
+Run the combined inventory, manifest, and QTL validation workflow.
+
+.. code-block:: bash
+
+ wasp2-ipscore validate
+
+Notes
+-----
+
+These commands are resource-management utilities. They do not replace the main
+``wasp2-map``, ``wasp2-count``, or ``wasp2-analyze`` analysis workflows.
diff --git a/docs/source/user_guide/mapping.rst b/docs/source/user_guide/mapping.rst
index d38be18..48c8ed8 100644
--- a/docs/source/user_guide/mapping.rst
+++ b/docs/source/user_guide/mapping.rst
@@ -1,221 +1,148 @@
-Mapping Module (WASP)
-=====================
+Mapping Module
+==============
Overview
--------
-The WASP (Weighted Allele-Specific Mapping) algorithm corrects reference bias by remapping reads with all possible alleles.
+``wasp2-map`` implements the WASP remap-and-filter workflow for removing
+reference mapping bias before allele counting.
-What is Reference Bias?
-~~~~~~~~~~~~~~~~~~~~~~~~
+The public CLI has two commands:
-Reference bias occurs when reads containing alternate alleles align worse than reads with reference alleles, leading to false allelic imbalance signals.
+1. ``make-reads``: find reads overlapping sample variants and generate
+ allele-swapped FASTQ files for remapping
+2. ``filter-remapped``: keep only remapped reads that return to the same locus
-WASP Solution
-~~~~~~~~~~~~~
+There is no separate ``find-intersecting-snps`` command in WASP2. That overlap
+step is part of ``make-reads``.
-1. Identify reads overlapping heterozygous SNPs
-2. Generate alternative reads (swap alleles)
-3. Remap both original and swapped reads
-4. Keep only reads that map to the same location
-
-Purpose
--------
-
-* Correct reference bias in RNA-seq, ATAC-seq
-* Improve accuracy of allelic imbalance detection
-* Required before allele counting
-
-When to Use
-~~~~~~~~~~~
-
-Use WASP when:
-* Reads will be used for allelic analysis
-* Reference genome differs from sample genotype
-* High-confidence bias correction needed
-
-Workflow
---------
-
-Complete WASP workflow has 3 steps:
-
-Step 1: Find Intersecting SNPs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Identify reads overlapping heterozygous SNPs:
-
-.. code-block:: bash
-
- wasp2-map find-intersecting-snps \
- input.bam \
- variants.vcf \
- --output intersecting.bam
-
-Output: BAM file with reads overlapping SNPs.
-
-Step 2: Generate Remapping Reads
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Typical Workflow
+----------------
-Create reads with swapped alleles:
+Step 1: Generate swapped reads
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
wasp2-map make-reads \
- intersecting.bam \
- variants.vcf \
- --samples sample1 \
- --output remap_reads.fastq
+ sample.bam \
+ variants.vcf.gz \
+ --samples SAMPLE1 \
+ --out_dir wasp_output
-Output: FASTQ file(s) with alternative allele sequences.
+This writes:
-Step 3: Remap and Filter
-~~~~~~~~~~~~~~~~~~~~~~~~~
+* ``sample_to_remap.bam``: original reads that must be remapped
+* ``sample_keep.bam``: reads that never overlapped eligible variants
+* ``sample_swapped_alleles_r1.fq`` and ``sample_swapped_alleles_r2.fq``:
+ swapped FASTQ reads to realign
+* ``sample_wasp_data_files.json``: metadata for ``filter-remapped``
-User remaps with their aligner (BWA, STAR, etc.):
+Step 2: Realign swapped reads
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the same aligner and alignment settings used for the original BAM.
.. code-block:: bash
- # Example with BWA
- bwa mem -t 8 reference.fa remap_reads.fastq | \
- samtools sort -o remapped.bam -
+ bwa mem -M -t 8 genome.fa \
+ wasp_output/sample_swapped_alleles_r1.fq \
+ wasp_output/sample_swapped_alleles_r2.fq | \
+ samtools sort -o wasp_output/sample_remapped.bam -
+
+ samtools index wasp_output/sample_remapped.bam
-Then filter to consistent mappings:
+Step 3: Filter remapped reads
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
- wasp2-map filt-remapped-reads \
- intersecting.bam \
- remapped.bam \
- --output filtered.bam
+ wasp2-map filter-remapped \
+ wasp_output/sample_remapped.bam \
+ --wasp_data_json wasp_output/sample_wasp_data_files.json \
+ --out_bam wasp_output/sample_wasp_filtered.bam
-Output: BAM file with bias-corrected reads.
+You can also provide ``to_remap_bam`` and ``keep_bam`` positionally instead of
+``--wasp_data_json``.
-CLI Reference
--------------
+Command Reference
+-----------------
-find-intersecting-snps
-~~~~~~~~~~~~~~~~~~~~~~
+``make-reads``
+~~~~~~~~~~~~~~
.. code-block:: bash
- wasp2-map find-intersecting-snps [OPTIONS] BAM VCF
-
-Options:
-* ``--samples``: Filter by sample genotype
-* ``--output``: Output BAM file
+ wasp2-map make-reads [OPTIONS] BAM VARIANTS
-make-reads
-~~~~~~~~~~
+Important options:
-.. code-block:: bash
+* ``--samples`` / ``-s``: sample name(s) used to select het variants
+* ``--out_dir`` / ``-o``: output directory
+* ``--out_json`` / ``-j``: explicit metadata JSON path
+* ``--indels``: include indels as well as SNPs
+* ``--threads``: BAM I/O threads
- wasp2-map make-reads [OPTIONS] BAM VCF
+Notes:
-Options:
-* ``--samples``: Sample name(s)
-* ``--output``: Output FASTQ prefix
-* ``--paired``: Paired-end mode
+* paired-end input is required
+* phased genotypes are strongly recommended
+* supported variant formats are VCF, VCF.GZ, BCF, and PGEN
-filt-remapped-reads
+``filter-remapped``
~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
- wasp2-map filt-remapped-reads [OPTIONS] ORIGINAL REMAPPED
-
-Options:
-* ``--output``: Filtered BAM file
-* ``--keep_read_file``: Save kept read IDs
+ wasp2-map filter-remapped [OPTIONS] REMAPPED_BAM [TO_REMAP_BAM] [KEEP_BAM]
-Input Requirements
-------------------
+Important options:
-* **Original BAM**: Aligned reads from initial mapping
-* **VCF File**: Phased heterozygous SNPs (recommended)
-* **Reference Genome**: Same as used for original alignment
+* ``--wasp_data_json`` / ``-j``: load ``to_remap_bam`` and ``keep_bam`` from
+ ``make-reads`` metadata
+* ``--out_bam`` / ``-o``: output BAM path
+* ``--remap_keep_bam``: optional BAM of remapped reads that passed filtering
+* ``--remap_keep_file``: optional text file of kept read names
+* ``--same-locus-slop``: positional tolerance for same-locus matching
+* ``--threads``: BAM I/O threads
-Output Interpretation
----------------------
+Interpreting Outputs
+--------------------
-WASP Filter Rate
-~~~~~~~~~~~~~~~~
+Common outcomes after ``filter-remapped``:
-Typical filter rates:
-* **Good**: 95-99% reads kept
-* **Acceptable**: 90-95% reads kept
-* **Concerning**: <90% reads kept (check data quality)
+* reads kept because they remap to the same locus
+* reads dropped because they remap elsewhere
+* reads dropped because they fail to remap cleanly
-Low filter rate may indicate:
-* Poor mapping quality
-* High SNP density
-* Problematic reference genome
-
-Complete Example
-----------------
+The final WASP-corrected BAM is the output of ``filter-remapped`` merged with
+the ``*_keep.bam`` reads that never required remapping.
-Full WASP workflow:
+Example
+-------
.. code-block:: bash
- # Step 1: Find SNP-overlapping reads
- wasp2-map find-intersecting-snps \
- original.bam \
- phased_variants.vcf \
- --samples NA12878 \
- --output intersecting.bam
-
- # Step 2: Generate remapping reads
wasp2-map make-reads \
- intersecting.bam \
- phased_variants.vcf \
- --samples NA12878 \
- --paired \
- --output remap
-
- # Step 3: Remap (user's aligner)
- bwa mem -t reference.fa \
- remap_R1.fastq remap_R2.fastq | \
- samtools sort -o remapped.bam -
- samtools index remapped.bam
-
- # Step 4: Filter
- wasp2-map filt-remapped-reads \
- intersecting.bam \
- remapped.bam \
- --output filtered_wasp.bam
-
- # Step 5: Count alleles (use filtered BAM)
- wasp2-count count-variants \
- filtered_wasp.bam \
- phased_variants.vcf \
- --samples NA12878
-
-Performance Tips
-----------------
-
-* Use multi-threading for remapping step
-* Filter VCF to high-quality SNPs only
-* Use phased genotypes when available
+ sample.bam \
+ variants.vcf.gz \
+ --samples SAMPLE1 \
+ --out_dir wasp_output
-Common Issues
--------------
+ bwa mem -M -t 8 genome.fa \
+ wasp_output/sample_swapped_alleles_r1.fq \
+ wasp_output/sample_swapped_alleles_r2.fq | \
+ samtools sort -o wasp_output/sample_remapped.bam -
-Many Reads Filtered
-~~~~~~~~~~~~~~~~~~~~
-
-* Check remapping quality (MAPQ scores)
-* Verify same reference genome used
-* Consider relaxing mapping parameters
-
-Slow Remapping
-~~~~~~~~~~~~~~
+ samtools index wasp_output/sample_remapped.bam
-* Use multi-threading (``-t`` flag)
-* Process chromosomes in parallel
-* Consider downsampling for testing
+ wasp2-map filter-remapped \
+ wasp_output/sample_remapped.bam \
+ --wasp_data_json wasp_output/sample_wasp_data_files.json \
+ --out_bam wasp_output/sample_wasp_filtered.bam
Next Steps
----------
-* :doc:`counting` - Count alleles from WASP-filtered BAM
-* :doc:`analysis` - Analyze allelic imbalance
+* :doc:`counting` to count alleles from the WASP-filtered BAM
+* :doc:`analysis` to test for allelic imbalance
diff --git a/pixi.toml b/pixi.toml
index baa7dd2..fcc8b3a 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -66,6 +66,7 @@ verify = "wasp2-count --help && wasp2-map --help && wasp2-analyze --help && pyth
build = "maturin develop --release -m rust/Cargo.toml"
# Run tests
test = "pytest tests/ -v --tb=short --ignore=tests/benchmarks -m 'not benchmark'"
+test-mapping-parity = "pytest tests/regression/test_mapping_stage_parity.py -v --tb=short"
# Run benchmarks
bench = "pytest tests/benchmarks/ -v --tb=short"
# Lint
diff --git a/pyproject.toml b/pyproject.toml
index 90f76c2..ea1740c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -126,7 +126,7 @@ python-source = "src"
module-name = "wasp2_rust"
python-packages = ["counting", "mapping", "analysis", "wasp2", "ipscore"]
bindings = "pyo3"
-strip = true
+strip = false
include = ["LICENSE", "README.md"]
[tool.pytest.ini_options]
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index d778994..fa9d689 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -1999,6 +1999,7 @@ dependencies = [
"noodles-core",
"noodles-vcf",
"pyo3",
+ "pyo3-build-config",
"rayon",
"rust-htslib",
"rustc-hash 2.1.1",
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index a375ef1..af1c3f4 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -33,6 +33,9 @@ flate2 = "1.1" # For gzip decompression
criterion = { version = "0.8", features = ["html_reports"] }
tempfile = "3.25"
+[build-dependencies]
+pyo3-build-config = "0.28.2"
+
# Benchmarks removed for clean release (benchmark files in paper branch only)
# [[bench]]
# name = "mapping_filter_bench"
diff --git a/rust/build.rs b/rust/build.rs
new file mode 100644
index 0000000..258e36c
--- /dev/null
+++ b/rust/build.rs
@@ -0,0 +1,4 @@
+fn main() {
+ pyo3_build_config::add_extension_module_link_args();
+ pyo3_build_config::use_pyo3_cfgs();
+}
diff --git a/scripts/container_smoke_test.sh b/scripts/container_smoke_test.sh
index 690ef28..575b7d5 100755
--- a/scripts/container_smoke_test.sh
+++ b/scripts/container_smoke_test.sh
@@ -64,6 +64,15 @@ for cmd in wasp2-count wasp2-map wasp2-analyze; do
FAIL=$((FAIL + 1))
fi
done
+
+# wasp2-ipscore doesn't have --version, check --help instead
+if wasp2-ipscore --help > /dev/null 2>&1; then
+ echo " PASS: wasp2-ipscore --help"
+ PASS=$((PASS + 1))
+else
+ echo " FAIL: wasp2-ipscore --help"
+ FAIL=$((FAIL + 1))
+fi
echo ""
# ─────────────────────────────────────────────────────────────────────────────
@@ -96,7 +105,7 @@ if [[ -f "$DATA_DIR/sample1.bam" && -f "$DATA_DIR/variants.vcf.gz" ]]; then
if wasp2-count count-variants \
"$DATA_DIR/sample1.bam" \
"$DATA_DIR/variants.vcf.gz" \
- --samples SAMPLE1 \
+ --samples sample1 \
--out "$TMP_DIR/counts.tsv" \
2>/dev/null; then
diff --git a/src/counting/__main__.py b/src/counting/__main__.py
index 2fb6945..15deb17 100644
--- a/src/counting/__main__.py
+++ b/src/counting/__main__.py
@@ -80,7 +80,7 @@ def count_variants(
"-r",
help=(
"Only use variants overlapping regions in file. "
- "Accepts BED or MACS2 formatted .(narrow/broad)Peak files. "
+ "Accepts BED, MACS2 .(narrow/broad)Peak, GTF, or GFF3 files. "
),
),
] = None,
@@ -126,7 +126,7 @@ def count_variants(
"--feat",
help=(
"Feature type in gtf/gff3 for counting intersecting SNPs. "
- "Defaults to 'exon' for snp counting"
+ "Defaults to 'gene' when present."
),
),
] = None,
@@ -237,10 +237,9 @@ def count_variants_sc(
"--regions",
"-r",
help=(
- "Features used in single-cell experiment. "
+ "Features used in single-cell ATAC experiment. "
"Only use variants overlapping features in file. "
- "Accepts BED or MACS2 formatted .(narrow/broad)Peak files. "
- "TODO: Implement genes gtf/gff format"
+ "Accepts BED or MACS2 .(narrow/broad)Peak files."
),
),
] = None,
diff --git a/src/counting/count_alleles_sc.py b/src/counting/count_alleles_sc.py
index 1244383..104f8ef 100644
--- a/src/counting/count_alleles_sc.py
+++ b/src/counting/count_alleles_sc.py
@@ -20,6 +20,19 @@
logger = logging.getLogger(__name__)
+def _sparse_from_counts(
+ counts: defaultdict[tuple[int, int], int],
+ shape: tuple[int, int],
+) -> csr_matrix:
+ if not counts:
+ return csr_matrix(shape, dtype=np.uint16)
+ return csr_matrix(
+ (list(counts.values()), list(zip(*counts.keys()))),
+ shape=shape,
+ dtype=np.uint16,
+ )
+
+
class CountStatsSC:
"""Container for mutable single-cell counting statistics.
@@ -101,7 +114,12 @@ def make_count_matrix(
# Maybe do this automatically and parse feature col instead?
snp_df_cols = ["chrom", "pos", "ref", "alt"]
if include_samples is not None:
- snp_df_cols.extend(include_samples)
+ sample_cols = list(include_samples)
+ missing_sample_cols = [col for col in sample_cols if col not in df.columns]
+ if missing_sample_cols and len(sample_cols) == 1 and "GT" in df.columns:
+ sample_name = sample_cols[0]
+ df = df.with_columns(pl.col("GT").alias(sample_name))
+ snp_df_cols.extend(sample_cols)
# Might be more memory efficient to use pandas index instead...
snp_df = df.select(snp_df_cols).unique(maintain_order=True).with_row_index()
@@ -135,23 +153,10 @@ def make_count_matrix(
# Create sparse matrices
# sparse array is recommended...but doesnt work with adata
- sparse_ref = csr_matrix(
- (list(sc_counts.ref_count.values()), list(zip(*sc_counts.ref_count.keys()))),
- shape=(snp_df.shape[0], len(bc_dict)),
- dtype=np.uint16,
- )
-
- sparse_alt = csr_matrix(
- (list(sc_counts.alt_count.values()), list(zip(*sc_counts.alt_count.keys()))),
- shape=(snp_df.shape[0], len(bc_dict)),
- dtype=np.uint16,
- )
-
- sparse_other = csr_matrix(
- (list(sc_counts.other_count.values()), list(zip(*sc_counts.other_count.keys()))),
- shape=(snp_df.shape[0], len(bc_dict)),
- dtype=np.uint16,
- )
+ matrix_shape = (snp_df.shape[0], len(bc_dict))
+ sparse_ref = _sparse_from_counts(sc_counts.ref_count, matrix_shape)
+ sparse_alt = _sparse_from_counts(sc_counts.alt_count, matrix_shape)
+ sparse_other = _sparse_from_counts(sc_counts.other_count, matrix_shape)
# Create anndata With total as X
adata = ad.AnnData(
@@ -171,17 +176,19 @@ def make_count_matrix(
if include_samples is not None:
adata.uns["samples"] = include_samples
- # TODO: Allow for other features besides 'region' using include_features
- # Could be case of no features, or feature is gene
- if "region" in df.columns:
- # Get unique snps and associated regions
+ feature_cols = include_features or []
+ if not feature_cols and "region" in df.columns:
+ feature_cols = ["region"]
- # Create dict during analysis step instead
- adata.uns["feature"] = (
+ if feature_cols:
+ feature_df = (
df.join(snp_df, on=["chrom", "pos", "ref", "alt"], how="left")
- .select(["region", "index"])
- .to_pandas()
+ .select([*feature_cols, "index"])
+ .unique(maintain_order=True)
)
+ if "region" not in feature_df.columns:
+ feature_df = feature_df.with_columns(pl.col(feature_cols[0]).alias("region"))
+ adata.uns["feature"] = feature_df.to_pandas()
# region_snp_dict = dict(
# df.join(snp_df, on=["chrom", "pos", "ref", "alt"], how="left"
diff --git a/src/counting/parse_gene_data.py b/src/counting/parse_gene_data.py
index 6653ed1..682192c 100644
--- a/src/counting/parse_gene_data.py
+++ b/src/counting/parse_gene_data.py
@@ -106,12 +106,12 @@ def parse_gene_file(
if feature is None:
feature_list = df.select(pl.col("feature").unique()).to_series()
- if "exon" in feature_list:
- feature = "exon"
+ if "gene" in feature_list:
+ feature = "gene"
elif "transcript" in feature_list:
feature = "transcript"
- elif "gene" in feature_list:
- feature = "gene"
+ elif "exon" in feature_list:
+ feature = "exon"
else:
logger.warning("exon, gene or transcript not found in feature list: %s", feature_list)
@@ -244,6 +244,20 @@ def parse_intersect_genes(
if parent_attribute is None:
parent_attribute = "Parent"
+ # Guard against empty intersection file (0 variants in region)
+ intersect_path = Path(intersect_file)
+ if not intersect_path.exists() or intersect_path.stat().st_size == 0:
+ return pl.DataFrame(
+ schema={
+ "chrom": pl.Categorical,
+ "pos": pl.UInt32,
+ "ref": pl.Categorical,
+ "alt": pl.Categorical,
+ attribute: pl.Utf8,
+ parent_attribute: pl.Utf8,
+ }
+ )
+
# AFTER performing gtf_to_bed and intersecting!
df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0)
@@ -287,8 +301,23 @@ def parse_intersect_genes_new(
if parent_attribute is None:
parent_attribute = "Parent"
+ # Guard against empty intersection file (0 variants in region)
+ intersect_path = Path(intersect_file)
+ if not intersect_path.exists() or intersect_path.stat().st_size == 0:
+ return pl.DataFrame(
+ schema={
+ "chrom": pl.Categorical,
+ "pos": pl.UInt32,
+ "ref": pl.Categorical,
+ "alt": pl.Categorical,
+ attribute: pl.Utf8,
+ parent_attribute: pl.Utf8,
+ }
+ )
+
# AFTER performing gtf_to_bed and intersecting!
df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0)
+ schema_names = list(df.collect_schema().names())
vcf_schema = [
pl.col("chrom").cast(pl.Categorical),
@@ -298,12 +327,12 @@ def parse_intersect_genes_new(
]
# Expect at min 10 cols, 11 if GT included
- if len(df.columns) > 10:
- subset_cols = [df.columns[i] for i in [0, 2, 3, 4, 5, -2, -1]]
+ if len(schema_names) > 10:
+ subset_cols = [schema_names[i] for i in [0, 2, 3, 4, 5, -2, -1]]
new_cols = ["chrom", "pos", "ref", "alt", "GT", attribute, parent_attribute]
vcf_schema.append(pl.col("GT").cast(pl.Categorical))
else:
- subset_cols = [df.columns[i] for i in [0, 2, 3, 4, -2, -1]]
+ subset_cols = [schema_names[i] for i in [0, 2, 3, 4, -2, -1]]
new_cols = ["chrom", "pos", "ref", "alt", attribute, parent_attribute]
# Parse dataframe columns
diff --git a/src/counting/run_counting_sc.py b/src/counting/run_counting_sc.py
index 08a0240..c9c4c20 100644
--- a/src/counting/run_counting_sc.py
+++ b/src/counting/run_counting_sc.py
@@ -2,6 +2,7 @@
from __future__ import annotations
+import logging
import re
from pathlib import Path
@@ -98,39 +99,13 @@ def __init__(
self.intersect_file = str(
Path(self.temp_loc) / f"{variant_prefix}_intersect_regions.bed"
)
- self.is_gene_file = False
- elif re.search(r"\.g[tf]f(?:\.gz)?$", f_ext, re.I):
- self.feature_type = "genes"
- self.intersect_file = str(
- Path(self.temp_loc) / f"{variant_prefix}_intersect_genes.bed"
- )
- self.is_gene_file = True
- gtf_prefix = re.split(r".g[tf]f", Path(self.feature_file).name)[0]
- self.gtf_bed = str(Path(self.temp_loc) / f"{gtf_prefix}.bed")
- self.use_feature_names = True # Use feature attributes as region names
- elif re.search(r"\.gff3(?:\.gz)?$", f_ext, re.I):
- self.feature_type = "genes"
- self.intersect_file = str(
- Path(self.temp_loc) / f"{variant_prefix}_intersect_genes.bed"
- )
- self.is_gene_file = True
- gtf_prefix = re.split(r".gff3", Path(self.feature_file).name)[0]
- self.gtf_bed = str(Path(self.temp_loc) / f"{gtf_prefix}.bed")
- self.use_feature_names = True # Use feature attributes as feature names
else:
raise ValueError(
- f"Invalid feature file type. Expected .bed, .gtf, or .gff3, got: {self.feature_file}"
+ f"Invalid feature file type. Expected BED or MACS2 peak file, got: {self.feature_file}"
)
else:
self.intersect_file = self.vcf_bed
- self.is_gene_file = False
-
- # TODO UPDATE THIS WHEN I ADD AUTOPARSERS
- if self.is_gene_file:
- # Possible edge case of vcf and gtf prefix conflict
- if self.vcf_bed == self.gtf_bed:
- self.gtf_bed = str(Path(self.temp_loc) / "genes.bed")
@tempdir_decorator
@@ -144,7 +119,7 @@ def run_count_variants_sc(
out_file: str | None = None,
temp_loc: str | None = None,
) -> None:
- """Run single-cell variant counting pipeline.
+ """Run single-cell ATAC variant counting pipeline.
Parameters
----------
@@ -155,7 +130,7 @@ def run_count_variants_sc(
barcode_file : str
Path to cell barcode file (one barcode per line).
feature_file : str | None, optional
- Path to feature/region file (BED, GTF, or GFF3).
+ Path to feature/region file (BED or MACS2 peak file).
samples : str | list[str] | None, optional
Sample ID(s) to process.
use_region_names : bool, optional
@@ -192,13 +167,13 @@ def run_count_variants_sc(
)
assert count_files.feature_file is not None
+
intersect_vcf_region(
vcf_file=count_files.vcf_bed,
region_file=count_files.feature_file,
out_file=count_files.intersect_file,
)
- # TODO: handle use_region_names better
df = parse_intersect_region_new(
intersect_file=count_files.intersect_file,
samples=count_files.samples,
@@ -206,13 +181,26 @@ def run_count_variants_sc(
region_col=None,
)
+ # Guard: if no variants survived intersection, warn and write empty output
+ if df.is_empty():
+ logging.getLogger(__name__).warning(
+ "No variants found after intersection — writing empty output file."
+ )
+ import anndata as ad
+
+ ad.AnnData().write_h5ad(count_files.out_file)
+ return
+
# TODO: handle case where barcode file contains multiple columns
with open(count_files.barcode_file) as file:
bc_dict = {line.rstrip(): i for i, line in enumerate(file)}
# Generate Output
adata = make_count_matrix(
- bam_file=count_files.bam_file, df=df, bc_dict=bc_dict, include_samples=count_files.samples
+ bam_file=count_files.bam_file,
+ df=df,
+ bc_dict=bc_dict,
+ include_samples=count_files.samples,
)
# Write outputs
diff --git a/src/mapping/intersect_variant_data.py b/src/mapping/intersect_variant_data.py
index 6d52aba..307ac64 100644
--- a/src/mapping/intersect_variant_data.py
+++ b/src/mapping/intersect_variant_data.py
@@ -10,9 +10,8 @@
import os
import subprocess
from pathlib import Path
+from typing import TYPE_CHECKING
-import numpy as np
-import polars as pl
import pysam
# Multi-format variant support
@@ -25,6 +24,9 @@
logger = logging.getLogger(__name__)
+if TYPE_CHECKING:
+ import polars as pl
+
def vcf_to_bed(
vcf_file: str | Path,
@@ -148,7 +150,7 @@ def make_intersect_df(
intersect_file: str,
samples: list[str],
is_paired: bool = True,
-) -> pl.DataFrame:
+) -> "pl.DataFrame":
"""Parse intersection file into a typed polars DataFrame.
Parameters
@@ -165,6 +167,9 @@ def make_intersect_df(
pl.DataFrame
Parsed intersection data with alleles split by sample.
"""
+ import numpy as np
+ import polars as pl
+
# Create Dataframe
df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0)
diff --git a/src/mapping/remap_utils.py b/src/mapping/remap_utils.py
index ebf8534..416ebde 100644
--- a/src/mapping/remap_utils.py
+++ b/src/mapping/remap_utils.py
@@ -1,13 +1,15 @@
import logging
from collections.abc import Generator
-from typing import Any
+from typing import TYPE_CHECKING, Any
import numpy as np
-import polars as pl
from pysam import AlignedSegment, AlignmentFile
logger = logging.getLogger(__name__)
+if TYPE_CHECKING:
+ import polars as pl
+
# Generator for iterating through bam
def paired_read_gen(
@@ -127,13 +129,13 @@ def _build_ref2read_maps(read: AlignedSegment) -> tuple[dict[int, int], dict[int
def get_read_het_data(
- read_df: pl.DataFrame,
+ read_df: "pl.DataFrame",
read: AlignedSegment,
col_list: list[str],
max_seqs: int | None = None,
include_indels: bool = False,
insert_qual: int = 30,
-) -> tuple[list[str], list[Any], list[pl.Series]] | None:
+) -> tuple[list[str], list[Any], list["pl.Series"]] | None:
"""Extract heterozygous variant data from read with indel support.
Args:
@@ -150,6 +152,8 @@ def get_read_het_data(
split_qual: List of quality score segments
allele_series: List of polars Series with allele data
"""
+ import polars as pl
+
pos_list = read_df.select(["start", "stop"]).rows()
assert read.query_sequence is not None, "Read has no query sequence"