From b1aac9392ae01e173c32abbe04086961d6830931 Mon Sep 17 00:00:00 2001
From: Jeff Jaureguy <67065808+Jaureguy760@users.noreply.github.com>
Date: Sun, 15 Mar 2026 00:38:36 -0700
Subject: [PATCH] fix: revert GTF from count-variants-sc, harden Dockerfile,
 fix docs

- Revert GTF/GFF3 support from count-variants-sc (sc commands are
  scATAC-only; gene annotation is a downstream ArchR/Signac step)
- Bulk count-variants retains full GTF support
- Dockerfile: tini PID 1, g++ purge assertion, wasp2-ipscore check
- Smoke test: add wasp2-ipscore, fix sample name case
- Docs: rewrite counting/mapping/analysis/installation pages
- Remove stale --min-count footnote from analysis.rst

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml                     |   4 +-
 .gitignore                                   |  29 +-
 Dockerfile                                   |  10 +-
 Makefile                                     |   6 +-
 README.md                                    | 110 ++--
 docs/CONTAINER_USAGE.md                      | 267 ++--------
 docs/source/index.rst                        |   1 +
 docs/source/installation.rst                 |  85 ++-
 docs/source/methods/mapping_filter.rst       |   7 +-
 docs/source/tutorials/quickstart_mapping.rst |  12 +-
 docs/source/tutorials/scrna_seq.rst          |   4 +-
 docs/source/user_guide/analysis.rst          | 526 ++-----------------
 docs/source/user_guide/counting.rst          | 222 +++-----
 docs/source/user_guide/ipscore.rst           |  53 ++
 docs/source/user_guide/mapping.rst           | 257 ++++-----
 pixi.toml                                    |   1 +
 pyproject.toml                               |   2 +-
 rust/Cargo.lock                              |   1 +
 rust/Cargo.toml                              |   3 +
 rust/build.rs                                |   4 +
 scripts/container_smoke_test.sh              |  11 +-
 src/counting/__main__.py                     |   9 +-
 src/counting/count_alleles_sc.py             |  59 ++-
 src/counting/parse_gene_data.py              |  43 +-
 src/counting/run_counting_sc.py              |  50 +-
 src/mapping/intersect_variant_data.py        |  11 +-
 src/mapping/remap_utils.py                   |  12 +-
 27 files changed, 582 insertions(+), 1217 deletions(-)
 create mode 100644 docs/source/user_guide/ipscore.rst
 create mode 100644 rust/build.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3518670..2e37711 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -146,8 +146,8 @@ jobs:
 
       - name: Run cargo test
         working-directory: rust
-        run: cargo test
+        run: PYO3_PYTHON=$(command -v python3) cargo test
 
       - name: Run clippy
         working-directory: rust
-        run: cargo clippy -- -W warnings
+        run: PYO3_PYTHON=$(command -v python3) cargo clippy -- -W warnings
diff --git a/.gitignore b/.gitignore
index 058d27f..9418dda 100644
--- a/.gitignore
+++ b/.gitignore
@@ -154,6 +154,11 @@ benchmark_figures/
 # Sanity test data (downloaded from GitHub releases)
 tests/sanity/data/
 
+# Real data test files (downloaded from 1000 Genomes, ~2-3 GB)
+tests/real_data/data/
+tests/real_data/samplesheets/
+tests/real_data/configs/
+
 # Nextflow runtime
 .nextflow/
 .nextflow.log*
@@ -175,7 +180,29 @@ test-output/
 results_stub/
 pipelines/*/test-output/
 pipelines/*/results_stub/
+pipelines/*/results_*/
+pipelines/*/artifacts/
+
+# Artifacts directory
+artifacts/
+
+# Benchmark infrastructure (large data/envs/results)
+test_benchmarks/
+
+# Claude Code local state
+.claude/
+
+# Nextflow pipeline-level logs
+pipelines/*/.nextflow.log*
+pipelines/*/.nf-test.log
+
+# Nextflow reports and visualizations
+trace.txt
+timeline.html
+report.html
+dag.svg
+dag.dot
 
 # Claude Code memory files (per-directory)
 **/CLAUDE.md
-!./CLAUDE.md
+!/CLAUDE.md
diff --git a/Dockerfile b/Dockerfile
index 0ade5c9..d3a4821 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -82,6 +82,8 @@ LABEL maintainer="Jeff Jaureguy <jeffpjaureguy@gmail.com>"
 # Install runtime deps + temporary build deps for pybedtools C++ extension
 # Combined into one RUN to minimize layers; build tools purged at the end
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    # PID 1 init for proper signal handling (Nextflow/HPC)
+    tini \
     # Bioinformatics tools
     samtools \
     bcftools \
@@ -106,14 +108,16 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install /tmp/*.whl \
     && rm -rf /tmp/*.whl \
     && apt-get purge -y --auto-remove g++ zlib1g-dev \
-    && rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/* \
+    && ! command -v g++
 
 WORKDIR /app
 
 # Verify non-Python tools are available (Python tools skipped during build
 # because Polars uses AVX2 instructions that fail under QEMU emulation
 # on ARM64 CI runners building linux/amd64 images)
-RUN samtools --version && bcftools --version && bedtools --version
+RUN samtools --version && bcftools --version && bedtools --version \
+    && wasp2-ipscore --help > /dev/null 2>&1
 
 # Create non-root user for security
 RUN groupadd -g 1000 wasp2 && \
@@ -147,5 +151,7 @@ WORKDIR /data
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
     CMD wasp2-count --version || exit 1
 
+ENTRYPOINT ["tini", "--"]
+
 # Default command
 CMD ["wasp2-count", "--help"]
diff --git a/Makefile b/Makefile
index d99addc..8e67b2a 100644
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@
 
 .PHONY: all build install test test-quick test-sanity lint format clean help
 .PHONY: download-sanity-data sanity-data-local rust-build rust-test
+.PHONY: test-mapping-parity
 
 # Configuration
 PYTHON ?= python
@@ -48,7 +49,7 @@ rust-dev:  ## Build Rust extension in debug mode (faster compile)
 	$(MATURIN) develop -m $(RUST_DIR)/Cargo.toml
 
 rust-test:  ## Run Rust unit tests
-	cd $(RUST_DIR) && $(CARGO) test
+	cd $(RUST_DIR) && PYO3_PYTHON=$$($(PYTHON) -c "import sys; print(sys.executable)") $(CARGO) test
 
 rust-bench:  ## Run Rust benchmarks
 	cd $(RUST_DIR) && $(CARGO) bench
@@ -68,6 +69,9 @@ test-quick:  ## Run quick validation tests only
 test-rust:  ## Run Rust-specific tests
 	$(PYTEST) $(TESTS_DIR) -v --tb=short -m "rust"
 
+test-mapping-parity:  ## Run mapping parity tests against legacy and unified paths
+	$(PYTEST) $(TESTS_DIR)/regression/test_mapping_stage_parity.py -v --tb=short
+
 test-integration:  ## Run integration tests
 	$(PYTEST) $(TESTS_DIR) -v --tb=short -m "integration"
 
diff --git a/README.md b/README.md
index 29b7667..d89e29a 100644
--- a/README.md
+++ b/README.md
@@ -15,114 +15,100 @@
   <a href="https://mcvickerlab.github.io/WASP2/">
     <img src="https://img.shields.io/badge/docs-GitHub%20Pages-blue" alt="Documentation">
   </a>
-  <a href="https://github.com/mcvickerlab/WASP2/blob/master/LICENSE">
-    <img src="https://img.shields.io/badge/license-MIT-green" alt="License">
-  </a>
-</p>
-
-<p align="center">
-  <a href="https://mcvickerlab.github.io/WASP2/">Documentation</a> •
-  <a href="https://mcvicker.salk.edu/">McVicker Lab</a> •
-  <a href="https://github.com/bmvdgeijn/WASP">Original WASP</a>
 </p>
 
----
-
 ## Installation
 
-### Recommended: Bioconda
+### Bioconda
 
 ```bash
 mamba install -c conda-forge -c bioconda wasp2
 ```
 
-Installs WASP2 and all dependencies (samtools, bcftools, bedtools, htslib) automatically. Available for Linux (x86_64, aarch64) and macOS (Intel, Apple Silicon). Requires [miniforge](https://github.com/conda-forge/miniforge).
-
-### Via PyPI
+### PyPI
 
 ```bash
 pip install wasp2
 ```
 
-Pre-built wheels for Linux (x86_64, aarch64) and macOS (Intel, Apple Silicon) with Python 3.10-3.13. The Rust extension and htslib are bundled in the wheel. Requires samtools, bcftools, and bedtools installed separately.
+The PyPI package does not install external tools such as `samtools`,
+`bcftools`, or `bedtools`; install those separately.
 
-### For development
-
-```bash
-git clone https://github.com/mcvickerlab/WASP2.git
-cd WASP2
-pixi install        # resolves all dependencies including Rust toolchain
-pixi run verify     # build + test
-```
-
-### Via Docker
+### Docker
 
 ```bash
 docker pull ghcr.io/mcvickerlab/wasp2:1.4.0
-docker run --rm -v $PWD:/data ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help
 ```
 
-Multi-platform image (linux/amd64 + linux/arm64) with all dependencies included.
-
-### Via Singularity/Apptainer (HPC)
+### Singularity/Apptainer
 
 ```bash
 singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
 singularity exec wasp2.sif wasp2-count --help
 ```
 
-### Reproducible Environment (conda-lock)
+## CLI Tools
 
-For fully pinned, reproducible installs (HPC clusters, CI, shared lab environments):
+WASP2 installs four command-line entry points:
 
-```bash
-# Recommended: mamba (fastest)
-mamba create -n WASP2 --file conda-lock.yml
+- `wasp2-map`
+- `wasp2-count`
+- `wasp2-analyze`
+- `wasp2-ipscore`
 
-# Or with conda
-conda-lock install -n WASP2 conda-lock.yml
-```
+## Quick Start
 
-`conda-lock.yml` pins every package to exact versions with checksums for `linux-64` and `osx-64`. To regenerate after updating `environment.lock.yml`:
+### 1. Correct mapping bias
 
 ```bash
-conda-lock lock -f environment.lock.yml --lockfile conda-lock.yml
-```
-
-See the [documentation](https://mcvickerlab.github.io/WASP2/) for detailed install options and development setup.
+wasp2-map make-reads input.bam variants.vcf.gz -s sample1 -o remap_dir
 
-## Quick Start
+# Realign remap_dir/*_swapped_alleles_r1.fq and r2.fq with the same aligner
+# and settings used for the original BAM, then:
 
-WASP2 has three steps that run in order:
+wasp2-map filter-remapped remapped.bam \
+  -j remap_dir/input_wasp_data_files.json \
+  -o filtered.bam
+```
 
-**Step 1: Remap reads** to correct mapping bias
+### 2. Count alleles
 
 ```bash
-wasp2-map make-reads input.bam variants.vcf.gz -s sample1 -o remap_dir/
-# Realign the swapped-allele reads with your aligner, then:
-wasp2-map filter-remapped remapped.bam -j remap_dir/sample1_wasp_data_files.json -o filtered.bam
+wasp2-count count-variants filtered.bam variants.vcf.gz -s sample1 -o counts.tsv
 ```
 
-**Step 2: Count alleles** at heterozygous SNPs
+### 3. Test for imbalance
 
 ```bash
-wasp2-count count-variants filtered.bam variants.vcf.gz -s sample1
+wasp2-analyze find-imbalance counts.tsv -o ai_results.tsv
 ```
 
-**Step 3: Test for allelic imbalance**
+## Single-Cell Example
 
 ```bash
-wasp2-analyze find-imbalance counts.tsv -o results.tsv
+wasp2-count count-variants-sc \
+  cellranger.bam \
+  variants.vcf.gz \
+  barcodes.tsv \
+  --samples sample1 \
+  --feature genes.gtf \
+  --out_file allele_counts.h5ad
+
+wasp2-analyze find-imbalance-sc \
+  allele_counts.h5ad \
+  barcode_groups.tsv \
+  --sample sample1 \
+  -o ai_results.tsv
 ```
 
-See the [documentation](https://mcvickerlab.github.io/WASP2/) for detailed usage, single-cell workflows, and supported variant formats (VCF, BCF, PGEN).
-
-## Authors
-
-- **Aaron Ho** — Creator of WASP2
-- **Jeff Jaureguy** — Developer and maintainer
-- **[McVicker Lab](https://mcvicker.salk.edu/)**, Salk Institute
+## iPSCORE Utilities
 
-## Citation
+```bash
+wasp2-ipscore inventory --output inventory.tsv
+wasp2-ipscore manifest --output manifest.csv
+wasp2-ipscore validate
+```
 
-If you use WASP2 in your research, please cite our paper (coming soon).
+See the [documentation](https://mcvickerlab.github.io/WASP2/) for complete
+usage, tutorials, and API details.
diff --git a/docs/CONTAINER_USAGE.md b/docs/CONTAINER_USAGE.md
index 8dd77e7..6bda535 100644
--- a/docs/CONTAINER_USAGE.md
+++ b/docs/CONTAINER_USAGE.md
@@ -1,262 +1,81 @@
-# WASP2 Container Usage Guide
+# WASP2 Container Usage
 
-This guide covers how to use WASP2 containers for local development, HPC clusters, and Nextflow pipelines.
+## Verified Docker Flow
 
-## Container Registries
-
-WASP2 images are available from:
-
-| Registry | Image | Pull Command |
-|----------|-------|--------------|
-| **DockerHub** | `mcvickerlab/wasp2` | `docker pull mcvickerlab/wasp2:latest` |
-| **GitHub Container Registry** | `ghcr.io/mcvickerlab/wasp2` | `docker pull ghcr.io/mcvickerlab/wasp2:latest` |
-
-### Available Tags
-
-- `:latest` - Most recent release
-- `:1.3.0` - Specific version
-- `:1.3` - Minor version (tracks patches)
-- `:main` - Development builds from main branch
-
-## Docker Usage
-
-### Pull and Run
-
-```bash
-# Pull the image
-docker pull mcvickerlab/wasp2:latest
-
-# Run WASP2 commands
-docker run --rm mcvickerlab/wasp2 wasp2-count --help
-docker run --rm mcvickerlab/wasp2 wasp2-map --help
-docker run --rm mcvickerlab/wasp2 wasp2-analyze --help
-
-# Process files (mount local directory)
-docker run --rm -v $(pwd):/data mcvickerlab/wasp2 \
-    wasp2-count /data/sample.bam /data/variants.vcf.gz -o /data/counts.tsv
-```
-
-### Interactive Shell
+The Docker image validated for this update is:
 
 ```bash
-docker run -it --rm -v $(pwd):/data mcvickerlab/wasp2 /bin/bash
+ghcr.io/mcvickerlab/wasp2:1.4.0
 ```
 
-## Singularity/Apptainer Usage (HPC)
-
-### Pull from Docker Registry
+Pull and inspect the available CLI tools:
 
 ```bash
-# Pull and convert to SIF
-singularity pull wasp2.sif docker://mcvickerlab/wasp2:latest
+docker pull ghcr.io/mcvickerlab/wasp2:1.4.0
 
-# Or from GHCR
-singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:latest
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-map --help
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-analyze --help
+docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-ipscore --help
 ```
 
-### Build from Definition File
+Mount local data when running workflows:
 
 ```bash
-# Clone the repository
-git clone https://github.com/mcvickerlab/WASP2.git
-cd WASP2
-
-# Build the container
-singularity build wasp2.sif Singularity.def
+docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
+  wasp2-count count-variants /data/sample.bam /data/variants.vcf.gz -o /data/counts.tsv
 ```
 
-### Run Commands
+## Intended Singularity / Apptainer Flow
 
-```bash
-# Run WASP2 commands
-singularity exec wasp2.sif wasp2-count --help
-
-# Process files (current directory is auto-bound)
-singularity exec wasp2.sif wasp2-count sample.bam variants.vcf.gz -o counts.tsv
-
-# With explicit bindings
-singularity exec --bind /scratch:/scratch wasp2.sif \
-    wasp2-map make-reads /scratch/input.bam /scratch/variants.vcf
-```
-
-### SLURM Job Script Example
+For HPC environments using SIF images:
 
 ```bash
-#!/bin/bash
-#SBATCH --job-name=wasp2
-#SBATCH --cpus-per-task=8
-#SBATCH --mem=32G
-#SBATCH --time=4:00:00
-
-module load singularity
-
-CONTAINER=/path/to/wasp2.sif
-
-# Count variants (wasp2-count does not have --threads option)
-singularity exec ${CONTAINER} wasp2-count \
-    input.bam \
-    variants.vcf.gz \
-    -o counts.tsv
-
-# WASP mapping filter (supports --threads)
-singularity exec ${CONTAINER} wasp2-map make-reads \
-    input.bam \
-    variants.vcf.gz \
-    --threads ${SLURM_CPUS_PER_TASK} \
-    --out_dir ./wasp_output
-```
-
-## Nextflow Integration
-
-### Configuration
-
-Add to your `nextflow.config`:
-
-```groovy
-profiles {
-    docker {
-        docker.enabled = true
-        docker.runOptions = '-u $(id -u):$(id -g)'
-    }
-    singularity {
-        singularity.enabled = true
-        singularity.autoMounts = true
-        singularity.cacheDir = "${HOME}/.singularity/cache"
-    }
-}
-
-process {
-    withLabel: 'wasp2' {
-        container = 'mcvickerlab/wasp2:latest'
-    }
-}
-```
-
-### Running Pipelines
-
-```bash
-# With Docker
-nextflow run main.nf -profile docker
-
-# With Singularity
-nextflow run main.nf -profile singularity
-```
-
-## Building Locally
-
-### Docker Build
-
-```bash
-# Clone repository
-git clone https://github.com/mcvickerlab/WASP2.git
-cd WASP2
-
-# Build image
-docker build -t wasp2:local .
-
-# Test the build
-docker run --rm wasp2:local wasp2-count --version
-docker run --rm wasp2:local python -c "import wasp2_rust; print('OK')"
+singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
+singularity exec wasp2.sif wasp2-count --help
 ```
 
-### Manual Build (for maintainers)
-
-Note: Currently only `linux/amd64` is supported.
+or:
 
 ```bash
-# Set up buildx
-docker buildx create --name wasp2builder --use
-
-# Build with version argument
-docker buildx build \
-    --platform linux/amd64 \
-    --build-arg VERSION=1.3.0 \
-    -t mcvickerlab/wasp2:1.3.0 \
-    -t mcvickerlab/wasp2:latest \
-    --push .
+apptainer pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
+apptainer exec wasp2.sif wasp2-count --help
 ```
 
-## Container Contents
-
-The WASP2 container includes:
+These commands are the intended container path, but they were not executed in
+this development environment because neither `singularity` nor `apptainer` was
+installed locally.
 
-### Python Environment
-- Python 3.10+ (container uses 3.11)
-- wasp2 package with Rust extension
-- Core: pysam, pandas (<2.0), numpy, scipy, polars
-- CLI: typer, rich
-- Single-cell: anndata, scanpy (optional)
-
-### Rust Components
-- Pre-built `wasp2_rust` Python extension
-- Compiled with release optimizations
-
-### CLI Tools
-
-Each tool has subcommands for different analysis modes:
-
-- **`wasp2-count`** - Allele counting
-  - `count-variants` - Bulk allele counting at heterozygous sites (default)
-  - `count-variants-sc` - Single-cell allele counting
-
-- **`wasp2-map`** - WASP mapping filter
-  - `make-reads` - Generate reads with swapped alleles for remapping
-  - `filter-remapped` - Filter remapped reads using WASP algorithm
-
-- **`wasp2-analyze`** - Statistical analysis
-  - `find-imbalance` - Calculate allelic imbalance
-  - `find-imbalance-sc` - Single-cell allelic imbalance analysis
-  - `compare-imbalance` - Compare imbalance between cell types/groups
-
-### Bioinformatics Tools
-- samtools
-- bcftools
-- bedtools
-- tabix
-
-## Troubleshooting
-
-### Permission Issues (Docker)
+## Mapping Example
 
 ```bash
-# Run as current user
-docker run --rm -u $(id -u):$(id -g) -v $(pwd):/data mcvickerlab/wasp2 ...
+docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
+  wasp2-map make-reads /data/sample.bam /data/variants.vcf.gz \
+  --samples sample1 \
+  --out_dir /data/remap_dir
 ```
 
-### Cache Issues (Singularity)
+After realigning the swapped FASTQ reads with your aligner of choice:
 
 ```bash
-# Clear Singularity cache
-singularity cache clean
-
-# Use different cache directory
-export SINGULARITY_CACHEDIR=/scratch/singularity_cache
+docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
+  wasp2-map filter-remapped /data/remapped.bam \
+  --wasp_data_json /data/remap_dir/sample_wasp_data_files.json \
+  --out_bam /data/filtered.bam
 ```
 
-### Verify Installation
+## Counting Example
 
 ```bash
-# Docker
-docker run --rm mcvickerlab/wasp2 wasp2-count --version
-docker run --rm mcvickerlab/wasp2 python -c "import wasp2_rust; print('Rust extension OK')"
-
-# Singularity
-singularity exec wasp2.sif wasp2-count --version
-singularity exec wasp2.sif python -c "import wasp2_rust; print('Rust extension OK')"
+docker run --rm -v "$PWD":/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
+  wasp2-count count-variants /data/filtered.bam /data/variants.vcf.gz \
+  --samples sample1 \
+  --region /data/genes.gtf \
+  --out_file /data/counts.tsv
 ```
 
-## GitHub Actions Secrets Setup
-
-To enable automated container builds, repository maintainers must configure:
-
-1. **DockerHub Secrets** (Settings → Secrets and variables → Actions):
-   - `DOCKERHUB_USERNAME`: Your DockerHub username
-   - `DOCKERHUB_TOKEN`: DockerHub access token (Account Settings → Security → Access Tokens)
-
-2. **GitHub Container Registry**: Uses `GITHUB_TOKEN` automatically (no setup needed)
-
-## Related Documentation
+## Notes
 
-- [Nextflow Pipelines](../pipelines/nf-atacseq/README.md)
-- [WASP2 Ecosystem](WASP2_ECOSYSTEM.md)
-- [GitHub Repository](https://github.com/mcvickerlab/WASP2)
+- The image contains the WASP2 package plus `samtools`, `bcftools`, and `bedtools`.
+- The documented public mapping workflow is `make-reads -> realign -> filter-remapped`.
+- `wasp2-ipscore` is present in the container alongside the main analysis tools.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1a5fc6a..ab712bd 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -60,6 +60,7 @@ Documentation
    user_guide/mapping
    user_guide/analysis
    user_guide/single_cell
+   user_guide/ipscore
 
 .. toctree::
    :maxdepth: 2
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index f039a97..51e9c15 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,25 +1,16 @@
 Installation
 ============
 
-Via Bioconda (Recommended)
---------------------------
-
-`Bioconda <https://bioconda.github.io>`_ installs WASP2 and **all** dependencies
-(samtools, bcftools, bedtools, htslib) in one command. Requires
-`miniforge <https://github.com/conda-forge/miniforge>`_.
+Via Bioconda
+------------
 
 .. code-block:: bash
 
    mamba install -c conda-forge -c bioconda wasp2
 
-Or with conda:
-
-.. code-block:: bash
-
-   conda install -c conda-forge -c bioconda wasp2
-
-Available for Linux (x86_64, aarch64) and macOS (Intel, Apple Silicon) with
-Python 3.10-3.13.
+Bioconda installs WASP2 together with the external command-line dependencies
+required by the workflows, including ``samtools``, ``bcftools``, and
+``bedtools``.
 
 Via PyPI
 --------
@@ -28,73 +19,58 @@ Via PyPI
 
    pip install wasp2
 
-Pre-built wheels include the Rust extension and bundled htslib for Linux
-(x86_64, aarch64) and macOS (Intel, Apple Silicon) with Python 3.10-3.13.
-
-.. note::
-
-   The PyPI package does not include samtools, bcftools, or bedtools.
-   Install them separately before running WASP2:
-
-   * On Ubuntu/Debian: ``sudo apt-get install bcftools bedtools samtools``
-   * On macOS: ``brew install bcftools bedtools samtools``
-   * Via conda: ``mamba install -c bioconda samtools bcftools bedtools``
+The PyPI wheel includes the WASP2 Python package and Rust extension, but it
+does not install external tools such as ``samtools``, ``bcftools``, or
+``bedtools``. Install those separately before running mapping or counting.
 
 Via Docker
 ----------
 
-WASP2 is available as a multi-platform Docker image (linux/amd64 + linux/arm64)
-with all dependencies pre-installed:
+The Docker image is the most reproducible fully bundled option available in
+this environment.
 
 .. code-block:: bash
 
    docker pull ghcr.io/mcvickerlab/wasp2:1.4.0
 
-   # Run a command
-   docker run --rm -v /path/to/data:/data ghcr.io/mcvickerlab/wasp2:1.4.0 \
-       wasp2-count count-variants /data/sample.bam /data/variants.vcf
-
-   # Interactive shell
-   docker run -it --rm -v /path/to/data:/data ghcr.io/mcvickerlab/wasp2:1.4.0 bash
-
-The image includes samtools, bcftools, bedtools, and the Rust-accelerated backend.
+   docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-count --help
+   docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-map --help
+   docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-analyze --help
+   docker run --rm ghcr.io/mcvickerlab/wasp2:1.4.0 wasp2-ipscore --help
 
 Via Singularity/Apptainer
 -------------------------
 
-For HPC environments that don't support Docker:
+For HPC environments that require SIF images:
 
 .. code-block:: bash
 
    singularity pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
    singularity exec wasp2.sif wasp2-count --help
 
-Development Installation
-------------------------
-
-For contributing or building from source:
+or with Apptainer:
 
 .. code-block:: bash
 
-   git clone https://github.com/mcvickerlab/WASP2.git
-   cd WASP2
-   pixi install        # resolves all deps including Rust toolchain
-   pixi run verify     # build + test suite in one step
+   apptainer pull wasp2.sif docker://ghcr.io/mcvickerlab/wasp2:1.4.0
+   apptainer exec wasp2.sif wasp2-count --help
 
-`pixi <https://pixi.sh>`_ resolves Python, Rust toolchain, samtools, bcftools,
-bedtools, and htslib automatically. No system packages required.
+.. note::
 
-Compiling pgenlib
-~~~~~~~~~~~~~~~~~
+   Docker was validated in this development environment. ``apptainer`` /
+   ``singularity`` binaries were not available locally during this doc update,
+   so those examples reflect the intended pull/exec workflow but were not
+   executed here.
 
-WASP2 optionally uses `pgenlib <https://github.com/chrchang/plink-ng/tree/master/2.0/Python>`_
-for PLINK2 file I/O. This requires a C compiler:
+Development Installation
+------------------------
 
-* On Ubuntu/Debian: ``sudo apt-get install build-essential python3-dev``
-* On macOS: ``xcode-select --install``
-* On RHEL/Fedora: ``sudo dnf install gcc gcc-c++ python3-devel``
+.. code-block:: bash
 
-pgenlib is installed automatically via pip when you install WASP2.
+   git clone https://github.com/mcvickerlab/WASP2.git
+   cd WASP2
+   pixi install
+   pixi run verify
 
 Verification
 ------------
@@ -104,3 +80,4 @@ Verification
    wasp2-count --help
    wasp2-map --help
    wasp2-analyze --help
+   wasp2-ipscore --help
diff --git a/docs/source/methods/mapping_filter.rst b/docs/source/methods/mapping_filter.rst
index c3257f2..37b24d7 100644
--- a/docs/source/methods/mapping_filter.rst
+++ b/docs/source/methods/mapping_filter.rst
@@ -132,7 +132,10 @@ The WASP filter compares original and remapped positions:
 
 .. code-block:: bash
 
-   wasp2-map filter-remapped original_to_remap.bam remapped.bam output.bam
+   wasp2-map filter-remapped \
+     remapped.bam \
+     --wasp_data_json sample_wasp_data_files.json \
+     --out_bam output.bam
 
 A read passes if:
 
@@ -272,7 +275,7 @@ The typical WASP2 workflow:
 
    # Step 2: Create swapped reads
    wasp2-map make-reads sample.bam variants.vcf \
-     --samples SAMPLE1 --out-dir wasp_temp/
+     --samples SAMPLE1 --out_dir wasp_temp
 
    # Step 3: Remap swapped reads (SAME ALIGNER!)
    bwa mem -M genome.fa wasp_temp/swapped_r1.fq wasp_temp/swapped_r2.fq | \
diff --git a/docs/source/tutorials/quickstart_mapping.rst b/docs/source/tutorials/quickstart_mapping.rst
index 57baffb..41317a0 100644
--- a/docs/source/tutorials/quickstart_mapping.rst
+++ b/docs/source/tutorials/quickstart_mapping.rst
@@ -68,7 +68,7 @@ Identify reads overlapping heterozygous SNPs and generate allele-swapped version
 
    wasp2-map make-reads sample.bam variants.vcf.gz \
      --samples SAMPLE1 \
-     --out-dir wasp_output/
+     --out_dir wasp_output
 
 This produces (where ``sample`` is your BAM file prefix):
 
@@ -103,9 +103,9 @@ The WASP filter compares original and remapped positions:
 .. code-block:: bash
 
    wasp2-map filter-remapped \
-     wasp_output/sample_to_remap.bam \
      wasp_output/sample_remapped.bam \
-     wasp_output/sample_wasp_filtered.bam
+     --wasp_data_json wasp_output/sample_wasp_data_files.json \
+     --out_bam wasp_output/sample_wasp_filtered.bam
 
 Understanding Filter Statistics
 -------------------------------
@@ -172,7 +172,7 @@ Complete Workflow Script
    echo "Step 1: Creating swapped reads..."
    wasp2-map make-reads $BAM $VCF \
      --samples $SAMPLE \
-     --out-dir $OUTDIR/
+     --out_dir $OUTDIR
 
    # Step 2: Remap with same aligner
    echo "Step 2: Remapping swapped reads..."
@@ -185,9 +185,9 @@ Complete Workflow Script
    # Step 3: Filter biased reads
    echo "Step 3: Filtering biased reads..."
    wasp2-map filter-remapped \
-     $OUTDIR/${PREFIX}_to_remap.bam \
      $OUTDIR/${PREFIX}_remapped.bam \
-     $OUTDIR/${PREFIX}_wasp_filtered.bam
+     --wasp_data_json $OUTDIR/${PREFIX}_wasp_data_files.json \
+     --out_bam $OUTDIR/${PREFIX}_wasp_filtered.bam
 
    # Step 4: Merge with non-overlapping reads
    echo "Step 4: Merging final BAM..."
diff --git a/docs/source/tutorials/scrna_seq.rst b/docs/source/tutorials/scrna_seq.rst
index dcdb325..1ee8d54 100644
--- a/docs/source/tutorials/scrna_seq.rst
+++ b/docs/source/tutorials/scrna_seq.rst
@@ -162,7 +162,7 @@ Run the single-cell allele counting:
      cellranger_output/outs/possorted_genome_bam.bam \
      phased_variants.vcf.gz \
      barcodes_celltype.tsv \
-     --region genes.gtf \
+     --feature genes.gtf \
      --samples SAMPLE_ID \
      --out_file allele_counts.h5ad
 
@@ -293,7 +293,7 @@ Low Read Counts
 Single-cell data is sparse. Consider:
 
 * Using pseudo-bulk aggregation by cell type
-* Lowering ``--min-count`` threshold
+* Lowering ``--min`` / ``--min_count`` threshold
 * Focusing on highly expressed genes
 
 Memory Issues
diff --git a/docs/source/user_guide/analysis.rst b/docs/source/user_guide/analysis.rst
index f486d11..23b015e 100644
--- a/docs/source/user_guide/analysis.rst
+++ b/docs/source/user_guide/analysis.rst
@@ -4,520 +4,92 @@ Analysis Module
 Overview
 --------
 
-The analysis module detects statistically significant allelic imbalance using beta-binomial models.
+``wasp2-analyze`` runs allelic imbalance analysis on bulk or single-cell count
+outputs.
 
-Purpose
--------
-
-* Detect allelic imbalance at genomic regions
-* Control for biological and technical variation
-* Support single-cell and bulk RNA-seq
-* Compare imbalance between groups/conditions
-
-Statistical Models
-------------------
-
-Beta-Binomial Model
-~~~~~~~~~~~~~~~~~~~
-
-WASP2 uses beta-binomial distribution to model:
-* Overdispersion (variation beyond binomial)
-* Biological variability between regions
-* Technical noise in sequencing
-
-The model:
-* Null hypothesis: Equal expression from both alleles (p=0.5)
-* Alternative: Allelic imbalance (p ≠ 0.5)
-* FDR correction for multiple testing
+Commands:
 
-Dispersion Parameter
-~~~~~~~~~~~~~~~~~~~~
-
-Two models:
-1. **Single**: One dispersion parameter for all regions
-2. **Linear**: Dispersion varies with read depth
-
-CLI Usage
----------
-
-Basic Analysis
-~~~~~~~~~~~~~~
-
-.. code-block:: bash
+* ``find-imbalance``: bulk allelic imbalance from TSV counts
+* ``find-imbalance-sc``: per-group single-cell imbalance from ``.h5ad`` counts
+* ``compare-imbalance``: differential imbalance between single-cell groups
 
-   wasp2-analyze find-imbalance counts.tsv
-
-Options
-~~~~~~~
+Bulk Analysis
+-------------
 
 .. code-block:: bash
 
    wasp2-analyze find-imbalance \
      counts.tsv \
-     --min-count 10 \
+     --min 10 \
      --pseudocount 1 \
-     --model single \
-     --output results.tsv
-
-Parameters
-----------
-
-``--min-count``
-~~~~~~~~~~~~~~~
-
-Minimum total read count per region (default: 10):
-
-.. code-block:: bash
-
-   --min-count 20  # More stringent
-
-``--pseudocount``
-~~~~~~~~~~~~~~~~~
-
-Pseudocount added to avoid zero counts (default: 1):
-
-.. code-block:: bash
-
-   --pseudocount 0  # No pseudocount
-
-``--model``
-~~~~~~~~~~~
-
-Dispersion model (default: single):
-
-.. code-block:: bash
-
-   --model linear  # Depth-dependent dispersion
-
-``--phased``
-~~~~~~~~~~~~
-
-Use phased genotype information:
-
-.. code-block:: bash
-
-   --phased  # Requires phased VCF
-
-Output Format
--------------
-
-Tab-separated file with columns:
-
-Statistical Columns
-~~~~~~~~~~~~~~~~~~~
-
-* ``region``: Genomic region identifier
-* ``ref_count``: Total reference allele counts
-* ``alt_count``: Total alternate allele counts
-* ``p_value``: Likelihood ratio test p-value
-* ``fdr_pval``: FDR-corrected p-value
-* ``effect_size``: Log2 fold-change (ref/alt)
+     --output ai_results.tsv
 
-Model Parameters
-~~~~~~~~~~~~~~~~
+Useful options:
 
-* ``dispersion``: Beta-binomial dispersion parameter
-* ``log_likelihood_null``: Null model log-likelihood
-* ``log_likelihood_alt``: Alternative model log-likelihood
-
-Interpreting Results
---------------------
-
-Significant Imbalance
-~~~~~~~~~~~~~~~~~~~~~
-
-FDR < 0.05 indicates significant imbalance:
-
-* **Biological**: cis-regulatory variation, ASE
-* **Technical**: mapping bias (check WASP), PCR artifacts
-
-Effect Size
-~~~~~~~~~~~
-
-* log2FC > 1: Strong imbalance (2-fold difference)
-* log2FC > 2: Very strong imbalance (4-fold difference)
+* ``--min`` / ``--min_count``: minimum total count threshold
+* ``--pseudocount``: pseudocount added before modeling
+* ``--model``: dispersion model (currently ``single`` or ``linear`` input)
+* ``--output`` / ``--out_file`` / ``-o``: output TSV path
+* ``--region_col``: explicit region column name if auto-detection is not desired
+* ``--groupby``: group on an alternate annotation column, such as a parent gene column
 
 Single-Cell Analysis
 --------------------
 
-For single-cell data, WASP2 detects allelic imbalance within specific cell populations
-using aggregated counts across cells of the same type.
-
 .. code-block:: bash
 
    wasp2-analyze find-imbalance-sc \
-     adata.h5ad \
-     barcode_map.tsv \
-     --sample donor1 \
-     --min-count 10
-
-Output: Per-celltype TSV files (``ai_results_[CELLTYPE].tsv``).
-
-Single-Cell Comparative Imbalance
----------------------------------
-
-Overview
-~~~~~~~~
-
-The comparative imbalance analysis detects **differential allelic imbalance** between
-cell types, conditions, or biological groups. This is useful for identifying:
-
-* Cell-type-specific regulatory variation
-* Sex differences in chromatin accessibility
-* Condition-dependent allelic effects (e.g., treatment vs control)
-* Developmental stage-specific imbalance
-
-Statistical Model
-~~~~~~~~~~~~~~~~~
-
-The analysis uses a **likelihood ratio test (LRT)** comparing two hypotheses:
-
-* **Null (H0)**: Both groups share the same allelic imbalance (μ_combined)
-* **Alternative (H1)**: Groups have different imbalance (μ₁ ≠ μ₂)
-
-The test statistic follows a chi-squared distribution with 1 degree of freedom:
-
-.. code-block:: text
-
-   LRT = -2 × (log L_null - log L_alt)
-   p-value = P(χ²(df=1) > LRT)
-
-Input Format: Count Matrix (.h5ad)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The count matrix must be an AnnData object with the following structure:
-
-.. code-block:: text
-
-   AnnData object (n_obs × n_vars)
-   ├── .obs                 # SNP metadata (rows)
-   │   ├── index            # SNP identifiers (0, 1, 2, ...)
-   │   └── [sample_name]    # Genotypes: '0|1', '1|0', '0/1', '1/0'
-   │
-   ├── .var                 # Cell metadata (columns)
-   │   └── group            # Cell type/group assignment
-   │
-   ├── .layers
-   │   ├── "ref"            # Reference allele counts (sparse matrix)
-   │   └── "alt"            # Alternate allele counts (sparse matrix)
-   │
-   └── .uns
-       ├── feature          # DataFrame: SNP → region mapping
-       └── samples          # List of sample names
-
-**Example count matrix creation:**
-
-.. code-block:: bash
-
-   # Generate counts from BAM + variants + barcodes
-   wasp2-count count-variants-sc \
-     aligned.bam \
-     variants.vcf.gz \
-     barcodes.txt \
-     --samples NA12878 \
-     --feature peaks.bed \
-     --out_file allele_counts.h5ad
-
-Input Format: Barcode Map (TSV)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-A two-column TSV file (no header) mapping cell barcodes to groups:
-
-.. code-block:: text
-
-   AAACGAACAGTCAGTT-1    excitatory_neurons
-   AAACGAAGTCGCTCTA-1    inhibitory_neurons
-   AAACGAAGTGAACCTA-1    excitatory_neurons
-   AAAGGATCATCGATGT-1    astrocytes
-   AAAGGATGTGCAACGA-1    microglia
-
-**Requirements:**
-
-* Tab-separated (``\t``)
-* No header row
-* Barcodes must match those in the count matrix
-* Groups can be cell types, conditions, sex, or any categorical variable
-
-Basic Usage
-~~~~~~~~~~~
-
-Compare imbalance between all groups:
-
-.. code-block:: bash
-
-   wasp2-analyze compare-imbalance \
-     allele_counts.h5ad \
-     barcode_map.tsv
-
-Compare specific groups only:
-
-.. code-block:: bash
-
-   wasp2-analyze compare-imbalance \
      allele_counts.h5ad \
-     barcode_map.tsv \
-     --groups "excitatory_neurons,inhibitory_neurons"
-
-Output Format
-~~~~~~~~~~~~~
-
-Results are written to ``ai_results_[GROUP1]_[GROUP2].tsv``:
-
-.. list-table::
-   :header-rows: 1
-   :widths: 20 80
-
-   * - Column
-     - Description
-   * - ``region``
-     - Genomic region identifier (peak or gene)
-   * - ``num_snps``
-     - Number of shared heterozygous SNPs in region
-   * - ``combined_mu``
-     - Reference allele frequency under null hypothesis (shared)
-   * - ``mu1``
-     - Reference allele frequency in group 1
-   * - ``mu2``
-     - Reference allele frequency in group 2
-   * - ``null_ll``
-     - Log-likelihood under null (shared μ)
-   * - ``alt_ll``
-     - Log-likelihood under alternative (separate μ values)
-   * - ``pval``
-     - Likelihood ratio test p-value
-   * - ``fdr_pval``
-     - FDR-corrected p-value (Benjamini-Hochberg)
-
-**Interpreting results:**
-
-* ``fdr_pval < 0.05``: Significant differential imbalance
-* ``|mu1 - mu2| > 0.1``: Meaningful effect size (~20% difference)
-* ``mu < 0.5``: Alternate allele favored; ``mu > 0.5``: Reference allele favored
-
-Parameters
-~~~~~~~~~~
-
-``--groups``
-   Comma-separated list of groups to compare. If omitted, compares all pairwise
-   combinations found in the barcode map.
-
-``--min``
-   Minimum total allele count per region per group (default: 10). Higher values
-   increase statistical power but reduce the number of testable regions.
-
-``--pseudocount``
-   Pseudocount added to avoid zero counts (default: 1). Affects dispersion estimation.
-
-``--sample``
-   Sample name for heterozygous SNP filtering. Required if multiple samples are
-   present in the count matrix.
-
-``--phased``
-   Use phased genotype information from VCF. Requires genotypes in ``0|1`` or ``1|0``
-   format. Improves power when haplotype phase is known.
-
-``-z/--z_cutoff``
-   Remove SNPs with counts exceeding this z-score threshold. Useful for removing
-   outliers caused by mapping artifacts or copy number variation.
-
-Example: Sex Differences Analysis
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Identify chromatin accessibility regions with sex-biased allelic imbalance:
+     barcode_groups.tsv \
+     --sample SAMPLE1 \
+     --min 10 \
+     --out_file ai_results.tsv
 
-**Step 1: Prepare barcode map with sex labels**
+``barcode_groups.tsv`` is a two-column TSV:
 
 .. code-block:: text
 
-   # barcode_sex_map.tsv
-   AAACGAACAGTCAGTT-1    male
-   AAACGAAGTCGCTCTA-1    female
-   AAACGAAGTGAACCTA-1    male
-   AAAGGATCATCGATGT-1    female
-
-**Step 2: Run comparative analysis**
-
-.. code-block:: bash
-
-   wasp2-analyze compare-imbalance \
-     allele_counts.h5ad \
-     barcode_sex_map.tsv \
-     --groups "male,female" \
-     --min 20 \
-     --out_file ai_results_sex_comparison.tsv
-
-**Step 3: Filter significant results**
-
-.. code-block:: bash
-
-   # Extract regions with significant sex differences
-   awk -F'\t' 'NR==1 || $9 < 0.05' ai_results_male_female.tsv > significant_sex_diff.tsv
-
-   # Find regions with large effect size
-   awk -F'\t' 'NR==1 || ($4 - $5 > 0.15 || $5 - $4 > 0.15)' significant_sex_diff.tsv
-
-Example: snATAC-seq Cell Type Analysis
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Complete workflow for analyzing cell-type-specific chromatin accessibility imbalance:
-
-**Step 1: Count alleles from snATAC-seq BAM**
-
-.. code-block:: bash
-
-   # Extract valid barcodes from Cell Ranger output
-   zcat filtered_barcodes.tsv.gz > barcodes.txt
-
-   # Count alleles at heterozygous SNPs overlapping peaks
-   wasp2-count count-variants-sc \
-     possorted_bam.bam \
-     phased_variants.vcf.gz \
-     barcodes.txt \
-     --samples sample1 \
-     --feature atac_peaks.bed \
-     --out_file snATAC_counts.h5ad
-
-**Step 2: Create barcode-to-celltype mapping**
-
-Export cell type annotations from your clustering analysis (e.g., Seurat, ArchR):
-
-.. code-block:: r
-
-   # R/Seurat example
-   write.table(
-     data.frame(barcode = Cells(seurat_obj),
-                celltype = Idents(seurat_obj)),
-     "barcode_celltype_map.tsv",
-     sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE
-   )
-
-**Step 3: Run single-cell imbalance analysis**
-
-.. code-block:: bash
+   BARCODE<TAB>GROUP
 
-   # Per-celltype analysis
-   wasp2-analyze find-imbalance-sc \
-     snATAC_counts.h5ad \
-     barcode_celltype_map.tsv \
-     --sample sample1 \
-     --phased \
-     --min 10 \
-     -z 3.0
+The command writes one output file per group using the requested output prefix.
 
-**Step 4: Compare imbalance between cell types**
+Comparative Single-Cell Analysis
+--------------------------------
 
 .. code-block:: bash
 
-   # Compare specific cell types
    wasp2-analyze compare-imbalance \
-     snATAC_counts.h5ad \
-     barcode_celltype_map.tsv \
-     --sample sample1 \
-     --groups "excitatory,inhibitory,astrocyte" \
-     --phased \
-     --min 15
-
-   # This produces:
-   # - ai_results_excitatory_inhibitory.tsv
-   # - ai_results_excitatory_astrocyte.tsv
-   # - ai_results_inhibitory_astrocyte.tsv
-
-**Step 5: Identify cell-type-specific regulatory regions**
-
-.. code-block:: bash
-
-   # Find peaks with differential imbalance between excitatory and inhibitory neurons
-   awk -F'\t' '$9 < 0.01 && ($4 > 0.6 || $4 < 0.4)' \
-     ai_results_excitatory_inhibitory.tsv > neuron_subtype_specific_peaks.tsv
-
-Best Practices for Single-Cell Analysis
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-**Data Quality:**
-
-* Use WASP-filtered BAM files to remove mapping bias
-* Require ≥10 total counts per region per group (``--min 10``)
-* Apply z-score filtering to remove outliers (``-z 3.0``)
-
-**Statistical Power:**
-
-* Merge similar cell types if individual populations have low coverage
-* Use phased genotypes when available (``--phased``)
-* Focus on regions with multiple SNPs for better estimates
-
-**Interpretation:**
-
-* Consider biological replication across samples
-* Validate top hits with orthogonal methods (allele-specific CRISPR, etc.)
-* Integrate with eQTL data to identify causal variants
-
-Example Workflow
-----------------
-
-.. code-block:: bash
-
-   # 1. Count alleles
-   wasp2-count count-variants \
-     wasp_filtered.bam \
-     variants.vcf \
-     --region genes.gtf \
-     --samples NA12878 \
-     --output counts.tsv
-
-   # 2. Analyze imbalance
-   wasp2-analyze find-imbalance \
-     counts.tsv \
-     --min-count 20 \
-     --model single \
-     --output imbalance.tsv
-
-   # 3. Filter significant results
-   awk '$5 < 0.05' imbalance.tsv > significant.tsv
-
-Best Practices
---------------
-
-Read Depth
-~~~~~~~~~~
-
-* Minimum 10 reads per region (use ``--min-count``)
-* Higher depth = more power
-* Consider downsampling very deep regions
-
-Quality Control
-~~~~~~~~~~~~~~~
+     allele_counts.h5ad \
+     barcode_groups.tsv \
+     --sample SAMPLE1 \
+     --groups B_cell T_cell \
+     --out_file compare_ai.tsv
 
-* Use WASP-filtered reads
-* Remove low-complexity regions
-* Filter low-quality SNPs
+This compares allelic imbalance between the requested groups and writes one TSV
+per comparison.
 
-Multiple Testing
-~~~~~~~~~~~~~~~~
+Notes
+-----
 
-* FDR correction is automatic
-* Consider Bonferroni for very important regions
-* Validate top hits experimentally
+* If your count file contains genotype columns for multiple samples, you must
+  provide ``--sample`` for single-cell analysis.
+* For bulk analysis, region columns are auto-detected when present in the count
+  TSV. Use ``--region_col`` only when you need to override that behavior.
 
-Common Issues
--------------
-
-No Significant Results
-~~~~~~~~~~~~~~~~~~~~~~
+Outputs
+-------
 
-* Increase sample size
-* Check read depth (use deeper sequencing)
-* Verify heterozygous SNPs present
+Typical bulk outputs include:
 
-Many Significant Results
-~~~~~~~~~~~~~~~~~~~~~~~~
+* region or feature identifier
+* aggregated ``ref_count`` and ``alt_count``
+* p-values and FDR-adjusted p-values
 
-* Check for batch effects
-* Verify WASP filtering was applied
-* Consider stricter FDR threshold
+Typical single-cell outputs include the same statistics stratified by barcode
+group.
 
 Next Steps
 ----------
 
-* Validate results with qPCR or DNA-seq
-* Integrate with eQTL data
-* Perform pathway enrichment analysis
+* :doc:`counting` to generate bulk or single-cell counts
+* :doc:`/tutorials/comparative_imbalance` for group-comparison workflows
diff --git a/docs/source/user_guide/counting.rst b/docs/source/user_guide/counting.rst
index 54db55f..b6a0ba6 100644
--- a/docs/source/user_guide/counting.rst
+++ b/docs/source/user_guide/counting.rst
@@ -4,195 +4,135 @@ Counting Module
 Overview
 --------
 
-The counting module quantifies allele-specific read counts at heterozygous SNP positions. It's the first step in allelic imbalance analysis.
+``wasp2-count`` counts reads supporting reference and alternate alleles at
+variant positions in BAM files.
 
-Purpose
-~~~~~~~
+It provides two commands:
 
-* Count reads supporting reference vs alternate alleles
-* Filter by sample genotype (heterozygous sites)
-* Annotate with genomic regions (genes, peaks)
-* Support single-cell RNA-seq
+* ``count-variants`` for bulk data
+* ``count-variants-sc`` for single-cell data with ``CB``-tagged barcodes
 
-When to Use
-~~~~~~~~~~~
-
-Use counting when you have:
-* Aligned reads (BAM file)
-* Variant calls (VCF file)
-* Want to quantify allele-specific expression
-
-CLI Usage
----------
+Bulk Counting
+-------------
 
-Basic Command
-~~~~~~~~~~~~~
+Basic usage:
 
 .. code-block:: bash
 
-   wasp2-count count-variants BAM_FILE VCF_FILE
+   wasp2-count count-variants sample.bam variants.vcf.gz --out_file counts.tsv
 
-Full Options
-~~~~~~~~~~~~
+With sample filtering and region annotation:
 
 .. code-block:: bash
 
    wasp2-count count-variants \
-     input.bam \
-     variants.vcf \
-     --samples sample1,sample2 \
+     sample.bam \
+     variants.vcf.gz \
+     --samples SAMPLE1 \
      --region genes.gtf \
      --out_file counts.tsv
 
-Input Requirements
-------------------
-
-BAM File
-~~~~~~~~
+Supported region files:
 
-* Aligned reads (single-end or paired-end)
-* Indexed (.bai file in same directory)
-* Sorted by coordinate
+* BED
+* MACS2 ``narrowPeak`` / ``broadPeak``
+* GTF
+* GFF3
 
-VCF File
-~~~~~~~~
+For GTF/GFF3 inputs, WASP2 derives interval annotations from feature rows and
+defaults to ``gene`` features when present.
 
-* Variant calls with genotype information
-* Heterozygous SNPs (GT=0|1 or 1|0)
-* Can include sample-specific genotypes
+Useful options:
 
-Optional: Region File
-~~~~~~~~~~~~~~~~~~~~~
+* ``--samples`` / ``-s``: select het sites for one or more samples
+* ``--region`` / ``-r``: restrict/annotate variants by overlapping regions
+* ``--gene_feature``: choose the GTF/GFF3 feature type
+* ``--gene_attribute``: choose the GTF/GFF3 attribute used as the feature ID
+* ``--gene_parent``: choose the parent/grouping attribute for gene annotations
+* ``--use_region_names``: prefer region names instead of coordinate strings
+* ``--include-indels``: count indels in addition to SNPs
 
-Annotate SNPs overlapping genes/peaks:
+Output columns always include:
 
-* GTF/GFF3 format (genes)
-* BED format (peaks, regions)
-* narrowPeak format (ATAC-seq, ChIP-seq)
+* ``chrom``
+* ``pos`` or ``pos0`` / ``pos`` depending on input path
+* ``ref``
+* ``alt``
+* ``ref_count``
+* ``alt_count``
+* ``other_count``
 
-Parameters
-----------
+When sample filtering is active, genotype columns are included. When region
+annotation is active, region or gene columns are included as well.
 
-``--samples`` / ``-s``
-~~~~~~~~~~~~~~~~~~~~~~
+Single-Cell ATAC Counting
+-------------------------
 
-Filter SNPs heterozygous in specified samples:
+Single-cell counting is designed for **scATAC-seq** data. It requires a BAM
+with ``CB`` tags and a positional barcode file containing one barcode per line.
 
 .. code-block:: bash
 
-   --samples sample1,sample2,sample3
-   # or
-   --samples samples.txt  # one per line
-
-``--region`` / ``-r``
-~~~~~~~~~~~~~~~~~~~~~
-
-Annotate SNPs with overlapping regions:
-
-.. code-block:: bash
-
-   --region genes.gtf      # Gene annotations
-   --region peaks.bed      # ATAC-seq peaks
-   --region regions.gff3   # Custom regions
-
-``--out_file`` / ``-o``
-~~~~~~~~~~~~~~~~~~~~~~~
-
-Output file path (default: counts.tsv):
-
-.. code-block:: bash
-
-   --out_file my_counts.tsv
-
-Output Format
--------------
-
-Tab-separated file with columns:
+   wasp2-count count-variants-sc \
+     sc_atac.bam \
+     variants.vcf.gz \
+     barcodes.tsv \
+     --samples sample1 \
+     --feature peaks.bed \
+     --out_file allele_counts.h5ad
 
-Basic Columns
-~~~~~~~~~~~~~
+Important points:
 
-* ``chr``: Chromosome
-* ``pos``: SNP position (1-based)
-* ``ref``: Reference allele
-* ``alt``: Alternate allele
-* ``ref_count``: Reads supporting reference
-* ``alt_count``: Reads supporting alternate
-* ``other_count``: Reads supporting other alleles
+* ``barcodes.tsv`` is a positional argument, not ``--barcode_map``
+* ``--feature`` and ``--region`` are aliases on the single-cell command
+* Accepts BED and MACS2 peak files (GTF/GFF3 are supported only by the bulk ``count-variants`` command)
 
-Optional Columns (with --region)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The output is an AnnData ``.h5ad`` file with:
 
-* ``gene_id``: Overlapping gene
-* ``gene_name``: Gene symbol
-* ``feature``: Feature type (exon, intron, etc.)
+* sparse count layers for ``ref``, ``alt``, and ``other``
+* variant metadata in ``adata.obs``
+* barcode names in ``adata.var_names``
+* feature-to-variant mapping in ``adata.uns["feature"]`` when annotations are used
 
-Example Workflow
-----------------
+Examples
+--------
 
-1. Basic Counting
-~~~~~~~~~~~~~~~~~
+Count variants without regional annotation:
 
 .. code-block:: bash
 
-   wasp2-count count-variants sample.bam variants.vcf
+   wasp2-count count-variants \
+     filtered.bam \
+     variants.vcf.gz \
+     --samples SAMPLE1 \
+     --out_file counts.tsv
 
-2. Filter by Sample
-~~~~~~~~~~~~~~~~~~~
+Count variants inside peaks:
 
 .. code-block:: bash
 
    wasp2-count count-variants \
-     sample.bam \
-     variants.vcf \
-     --samples NA12878
+     filtered.bam \
+     variants.vcf.gz \
+     --samples SAMPLE1 \
+     --region peaks.bed \
+     --out_file counts_peaks.tsv
 
-3. Annotate with Genes
-~~~~~~~~~~~~~~~~~~~~~~
+Count variants inside genes:
 
 .. code-block:: bash
 
    wasp2-count count-variants \
-     sample.bam \
-     variants.vcf \
-     --samples NA12878 \
+     filtered.bam \
+     variants.vcf.gz \
+     --samples SAMPLE1 \
      --region genes.gtf \
-     --out_file counts_annotated.tsv
-
-Single-Cell Counting
---------------------
-
-For single-cell RNA-seq:
-
-.. code-block:: bash
-
-   wasp2-count count-variants-sc \
-     sc_rnaseq.bam \
-     variants.vcf \
-     --barcode_map barcodes.tsv
-
-Output includes cell-type-specific counts.
-
-Common Issues
--------------
-
-Low Count Numbers
-~~~~~~~~~~~~~~~~~
-
-* Check BAM file coverage (``samtools depth``)
-* Verify VCF contains heterozygous SNPs
-* Ensure BAM and VCF use same reference genome
-
-No Output SNPs
-~~~~~~~~~~~~~~
-
-* Check if --samples filter is too restrictive
-* Verify VCF has genotype information (GT field)
-* Ensure BAM file is indexed
+     --gene_feature gene \
+     --gene_attribute gene_id \
+     --out_file counts_genes.tsv
 
 Next Steps
 ----------
 
-After counting:
-* :doc:`analysis` - Detect allelic imbalance
-* :doc:`mapping` - Correct reference bias with WASP
+* :doc:`analysis` for statistical testing of allelic imbalance
+* :doc:`/user_guide/single_cell` for barcode grouping and single-cell workflows
diff --git a/docs/source/user_guide/ipscore.rst b/docs/source/user_guide/ipscore.rst
new file mode 100644
index 0000000..993fcbd
--- /dev/null
+++ b/docs/source/user_guide/ipscore.rst
@@ -0,0 +1,53 @@
+iPSCORE Utilities
+=================
+
+Overview
+--------
+
+``wasp2-ipscore`` provides helper commands for working with the iPSCORE
+multi-tissue allelic imbalance resource bundled with WASP2.
+
+Commands
+--------
+
+``inventory``
+~~~~~~~~~~~~~
+
+Validate the expected iPSCORE data inventory and optionally write a TSV report.
+
+.. code-block:: bash
+
+   wasp2-ipscore inventory --output inventory.tsv
+
+``manifest``
+~~~~~~~~~~~~
+
+Create a unified sample manifest across tissues and assays.
+
+.. code-block:: bash
+
+   wasp2-ipscore manifest --output manifest.csv --format csv
+
+``qtls``
+~~~~~~~~
+
+Load and summarize QTL resources, optionally filtering by tissue.
+
+.. code-block:: bash
+
+   wasp2-ipscore qtls --tissue iPSC --output qtls.tsv
+
+``validate``
+~~~~~~~~~~~~
+
+Run the combined inventory, manifest, and QTL validation workflow.
+
+.. code-block:: bash
+
+   wasp2-ipscore validate
+
+Notes
+-----
+
+These commands are resource-management utilities. They do not replace the main
+``wasp2-map``, ``wasp2-count``, or ``wasp2-analyze`` analysis workflows.
diff --git a/docs/source/user_guide/mapping.rst b/docs/source/user_guide/mapping.rst
index d38be18..48c8ed8 100644
--- a/docs/source/user_guide/mapping.rst
+++ b/docs/source/user_guide/mapping.rst
@@ -1,221 +1,148 @@
-Mapping Module (WASP)
-=====================
+Mapping Module
+==============
 
 Overview
 --------
 
-The WASP (Weighted Allele-Specific Mapping) algorithm corrects reference bias by remapping reads with all possible alleles.
+``wasp2-map`` implements the WASP remap-and-filter workflow for removing
+reference mapping bias before allele counting.
 
-What is Reference Bias?
-~~~~~~~~~~~~~~~~~~~~~~~~
+The public CLI has two commands:
 
-Reference bias occurs when reads containing alternate alleles align worse than reads with reference alleles, leading to false allelic imbalance signals.
+1. ``make-reads``: find reads overlapping sample variants and generate
+   allele-swapped FASTQ files for remapping
+2. ``filter-remapped``: keep only remapped reads that return to the same locus
 
-WASP Solution
-~~~~~~~~~~~~~
+There is no separate ``find-intersecting-snps`` command in WASP2. That overlap
+step is part of ``make-reads``.
 
-1. Identify reads overlapping heterozygous SNPs
-2. Generate alternative reads (swap alleles)
-3. Remap both original and swapped reads
-4. Keep only reads that map to the same location
-
-Purpose
--------
-
-* Correct reference bias in RNA-seq, ATAC-seq
-* Improve accuracy of allelic imbalance detection
-* Required before allele counting
-
-When to Use
-~~~~~~~~~~~
-
-Use WASP when:
-* Reads will be used for allelic analysis
-* Reference genome differs from sample genotype
-* High-confidence bias correction needed
-
-Workflow
---------
-
-Complete WASP workflow has 3 steps:
-
-Step 1: Find Intersecting SNPs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Identify reads overlapping heterozygous SNPs:
-
-.. code-block:: bash
-
-   wasp2-map find-intersecting-snps \
-     input.bam \
-     variants.vcf \
-     --output intersecting.bam
-
-Output: BAM file with reads overlapping SNPs.
-
-Step 2: Generate Remapping Reads
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Typical Workflow
+----------------
 
-Create reads with swapped alleles:
+Step 1: Generate swapped reads
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: bash
 
    wasp2-map make-reads \
-     intersecting.bam \
-     variants.vcf \
-     --samples sample1 \
-     --output remap_reads.fastq
+     sample.bam \
+     variants.vcf.gz \
+     --samples SAMPLE1 \
+     --out_dir wasp_output
 
-Output: FASTQ file(s) with alternative allele sequences.
+This writes:
 
-Step 3: Remap and Filter
-~~~~~~~~~~~~~~~~~~~~~~~~~
+* ``sample_to_remap.bam``: original reads that must be remapped
+* ``sample_keep.bam``: reads that never overlapped eligible variants
+* ``sample_swapped_alleles_r1.fq`` and ``sample_swapped_alleles_r2.fq``:
+  swapped FASTQ reads to realign
+* ``sample_wasp_data_files.json``: metadata for ``filter-remapped``
 
-User remaps with their aligner (BWA, STAR, etc.):
+Step 2: Realign swapped reads
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the same aligner and alignment settings used for the original BAM.
 
 .. code-block:: bash
 
-   # Example with BWA
-   bwa mem -t 8 reference.fa remap_reads.fastq | \
-     samtools sort -o remapped.bam -
+   bwa mem -M -t 8 genome.fa \
+     wasp_output/sample_swapped_alleles_r1.fq \
+     wasp_output/sample_swapped_alleles_r2.fq | \
+     samtools sort -o wasp_output/sample_remapped.bam -
+
+   samtools index wasp_output/sample_remapped.bam
 
-Then filter to consistent mappings:
+Step 3: Filter remapped reads
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: bash
 
-   wasp2-map filt-remapped-reads \
-     intersecting.bam \
-     remapped.bam \
-     --output filtered.bam
+   wasp2-map filter-remapped \
+     wasp_output/sample_remapped.bam \
+     --wasp_data_json wasp_output/sample_wasp_data_files.json \
+     --out_bam wasp_output/sample_wasp_filtered.bam
 
-Output: BAM file with bias-corrected reads.
+You can also provide ``to_remap_bam`` and ``keep_bam`` positionally instead of
+``--wasp_data_json``.
 
-CLI Reference
--------------
+Command Reference
+-----------------
 
-find-intersecting-snps
-~~~~~~~~~~~~~~~~~~~~~~
+``make-reads``
+~~~~~~~~~~~~~~
 
 .. code-block:: bash
 
-   wasp2-map find-intersecting-snps [OPTIONS] BAM VCF
-
-Options:
-* ``--samples``: Filter by sample genotype
-* ``--output``: Output BAM file
+   wasp2-map make-reads [OPTIONS] BAM VARIANTS
 
-make-reads
-~~~~~~~~~~
+Important options:
 
-.. code-block:: bash
+* ``--samples`` / ``-s``: sample name(s) used to select het variants
+* ``--out_dir`` / ``-o``: output directory
+* ``--out_json`` / ``-j``: explicit metadata JSON path
+* ``--indels``: include indels as well as SNPs
+* ``--threads``: BAM I/O threads
 
-   wasp2-map make-reads [OPTIONS] BAM VCF
+Notes:
 
-Options:
-* ``--samples``: Sample name(s)
-* ``--output``: Output FASTQ prefix
-* ``--paired``: Paired-end mode
+* paired-end input is required
+* phased genotypes are strongly recommended
+* supported variant formats are VCF, VCF.GZ, BCF, and PGEN
 
-filt-remapped-reads
+``filter-remapped``
 ~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: bash
 
-   wasp2-map filt-remapped-reads [OPTIONS] ORIGINAL REMAPPED
-
-Options:
-* ``--output``: Filtered BAM file
-* ``--keep_read_file``: Save kept read IDs
+   wasp2-map filter-remapped [OPTIONS] REMAPPED_BAM [TO_REMAP_BAM] [KEEP_BAM]
 
-Input Requirements
-------------------
+Important options:
 
-* **Original BAM**: Aligned reads from initial mapping
-* **VCF File**: Phased heterozygous SNPs (recommended)
-* **Reference Genome**: Same as used for original alignment
+* ``--wasp_data_json`` / ``-j``: load ``to_remap_bam`` and ``keep_bam`` from
+  ``make-reads`` metadata
+* ``--out_bam`` / ``-o``: output BAM path
+* ``--remap_keep_bam``: optional BAM of remapped reads that passed filtering
+* ``--remap_keep_file``: optional text file of kept read names
+* ``--same-locus-slop``: positional tolerance for same-locus matching
+* ``--threads``: BAM I/O threads
 
-Output Interpretation
----------------------
+Interpreting Outputs
+--------------------
 
-WASP Filter Rate
-~~~~~~~~~~~~~~~~
+Common outcomes after ``filter-remapped``:
 
-Typical filter rates:
-* **Good**: 95-99% reads kept
-* **Acceptable**: 90-95% reads kept
-* **Concerning**: <90% reads kept (check data quality)
+* reads kept because they remap to the same locus
+* reads dropped because they remap elsewhere
+* reads dropped because they fail to remap cleanly
 
-Low filter rate may indicate:
-* Poor mapping quality
-* High SNP density
-* Problematic reference genome
-
-Complete Example
-----------------
+The final WASP-corrected BAM is the output of ``filter-remapped`` merged with
+the ``*_keep.bam`` reads that never required remapping.
 
-Full WASP workflow:
+Example
+-------
 
 .. code-block:: bash
 
-   # Step 1: Find SNP-overlapping reads
-   wasp2-map find-intersecting-snps \
-     original.bam \
-     phased_variants.vcf \
-     --samples NA12878 \
-     --output intersecting.bam
-
-   # Step 2: Generate remapping reads
    wasp2-map make-reads \
-     intersecting.bam \
-     phased_variants.vcf \
-     --samples NA12878 \
-     --paired \
-     --output remap
-
-   # Step 3: Remap (user's aligner)
-   bwa mem -t <threads> reference.fa \
-     remap_R1.fastq remap_R2.fastq | \
-     samtools sort -o remapped.bam -
-   samtools index remapped.bam
-
-   # Step 4: Filter
-   wasp2-map filt-remapped-reads \
-     intersecting.bam \
-     remapped.bam \
-     --output filtered_wasp.bam
-
-   # Step 5: Count alleles (use filtered BAM)
-   wasp2-count count-variants \
-     filtered_wasp.bam \
-     phased_variants.vcf \
-     --samples NA12878
-
-Performance Tips
-----------------
-
-* Use multi-threading for remapping step
-* Filter VCF to high-quality SNPs only
-* Use phased genotypes when available
+     sample.bam \
+     variants.vcf.gz \
+     --samples SAMPLE1 \
+     --out_dir wasp_output
 
-Common Issues
--------------
+   bwa mem -M -t 8 genome.fa \
+     wasp_output/sample_swapped_alleles_r1.fq \
+     wasp_output/sample_swapped_alleles_r2.fq | \
+     samtools sort -o wasp_output/sample_remapped.bam -
 
-Many Reads Filtered
-~~~~~~~~~~~~~~~~~~~~
-
-* Check remapping quality (MAPQ scores)
-* Verify same reference genome used
-* Consider relaxing mapping parameters
-
-Slow Remapping
-~~~~~~~~~~~~~~
+   samtools index wasp_output/sample_remapped.bam
 
-* Use multi-threading (``-t`` flag)
-* Process chromosomes in parallel
-* Consider downsampling for testing
+   wasp2-map filter-remapped \
+     wasp_output/sample_remapped.bam \
+     --wasp_data_json wasp_output/sample_wasp_data_files.json \
+     --out_bam wasp_output/sample_wasp_filtered.bam
 
 Next Steps
 ----------
 
-* :doc:`counting` - Count alleles from WASP-filtered BAM
-* :doc:`analysis` - Analyze allelic imbalance
+* :doc:`counting` to count alleles from the WASP-filtered BAM
+* :doc:`analysis` to test for allelic imbalance
diff --git a/pixi.toml b/pixi.toml
index baa7dd2..fcc8b3a 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -66,6 +66,7 @@ verify = "wasp2-count --help && wasp2-map --help && wasp2-analyze --help && pyth
 build = "maturin develop --release -m rust/Cargo.toml"
 # Run tests
 test = "pytest tests/ -v --tb=short --ignore=tests/benchmarks -m 'not benchmark'"
+test-mapping-parity = "pytest tests/regression/test_mapping_stage_parity.py -v --tb=short"
 # Run benchmarks
 bench = "pytest tests/benchmarks/ -v --tb=short"
 # Lint
diff --git a/pyproject.toml b/pyproject.toml
index 90f76c2..ea1740c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -126,7 +126,7 @@ python-source = "src"
 module-name = "wasp2_rust"
 python-packages = ["counting", "mapping", "analysis", "wasp2", "ipscore"]
 bindings = "pyo3"
-strip = true
+strip = false
 include = ["LICENSE", "README.md"]
 
 [tool.pytest.ini_options]
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index d778994..fa9d689 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -1999,6 +1999,7 @@ dependencies = [
  "noodles-core",
  "noodles-vcf",
  "pyo3",
+ "pyo3-build-config",
  "rayon",
  "rust-htslib",
  "rustc-hash 2.1.1",
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index a375ef1..af1c3f4 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -33,6 +33,9 @@ flate2 = "1.1"         # For gzip decompression
 criterion = { version = "0.8", features = ["html_reports"] }
 tempfile = "3.25"
 
+[build-dependencies]
+pyo3-build-config = "0.28.2"
+
 # Benchmarks removed for clean release (benchmark files in paper branch only)
 # [[bench]]
 # name = "mapping_filter_bench"
diff --git a/rust/build.rs b/rust/build.rs
new file mode 100644
index 0000000..258e36c
--- /dev/null
+++ b/rust/build.rs
@@ -0,0 +1,4 @@
+fn main() {
+    pyo3_build_config::add_extension_module_link_args();
+    pyo3_build_config::use_pyo3_cfgs();
+}
diff --git a/scripts/container_smoke_test.sh b/scripts/container_smoke_test.sh
index 690ef28..575b7d5 100755
--- a/scripts/container_smoke_test.sh
+++ b/scripts/container_smoke_test.sh
@@ -64,6 +64,15 @@ for cmd in wasp2-count wasp2-map wasp2-analyze; do
         FAIL=$((FAIL + 1))
     fi
 done
+
+# wasp2-ipscore doesn't have --version, check --help instead
+if wasp2-ipscore --help > /dev/null 2>&1; then
+    echo "  PASS: wasp2-ipscore --help"
+    PASS=$((PASS + 1))
+else
+    echo "  FAIL: wasp2-ipscore --help"
+    FAIL=$((FAIL + 1))
+fi
 echo ""
 
 # ─────────────────────────────────────────────────────────────────────────────
@@ -96,7 +105,7 @@ if [[ -f "$DATA_DIR/sample1.bam" && -f "$DATA_DIR/variants.vcf.gz" ]]; then
     if wasp2-count count-variants \
         "$DATA_DIR/sample1.bam" \
         "$DATA_DIR/variants.vcf.gz" \
-        --samples SAMPLE1 \
+        --samples sample1 \
         --out "$TMP_DIR/counts.tsv" \
         2>/dev/null; then
 
diff --git a/src/counting/__main__.py b/src/counting/__main__.py
index 2fb6945..15deb17 100644
--- a/src/counting/__main__.py
+++ b/src/counting/__main__.py
@@ -80,7 +80,7 @@ def count_variants(
             "-r",
             help=(
                 "Only use variants overlapping regions in file. "
-                "Accepts BED or MACS2 formatted .(narrow/broad)Peak files. "
+                "Accepts BED, MACS2 .(narrow/broad)Peak, GTF, or GFF3 files. "
             ),
         ),
     ] = None,
@@ -126,7 +126,7 @@ def count_variants(
             "--feat",
             help=(
                 "Feature type in gtf/gff3 for counting intersecting SNPs. "
-                "Defaults to 'exon' for snp counting"
+                "Defaults to 'gene' when present."
             ),
         ),
     ] = None,
@@ -237,10 +237,9 @@ def count_variants_sc(
             "--regions",
             "-r",
             help=(
-                "Features used in single-cell experiment. "
+                "Features used in single-cell ATAC experiment. "
                 "Only use variants overlapping features in file. "
-                "Accepts BED or MACS2 formatted .(narrow/broad)Peak files. "
-                "TODO: Implement genes gtf/gff format"
+                "Accepts BED or MACS2 .(narrow/broad)Peak files."
             ),
         ),
     ] = None,
diff --git a/src/counting/count_alleles_sc.py b/src/counting/count_alleles_sc.py
index 1244383..104f8ef 100644
--- a/src/counting/count_alleles_sc.py
+++ b/src/counting/count_alleles_sc.py
@@ -20,6 +20,19 @@
 logger = logging.getLogger(__name__)
 
 
+def _sparse_from_counts(
+    counts: defaultdict[tuple[int, int], int],
+    shape: tuple[int, int],
+) -> csr_matrix:
+    if not counts:
+        return csr_matrix(shape, dtype=np.uint16)
+    return csr_matrix(
+        (list(counts.values()), list(zip(*counts.keys()))),
+        shape=shape,
+        dtype=np.uint16,
+    )
+
+
 class CountStatsSC:
     """Container for mutable single-cell counting statistics.
 
@@ -101,7 +114,12 @@ def make_count_matrix(
     # Maybe do this automatically and parse feature col instead?
     snp_df_cols = ["chrom", "pos", "ref", "alt"]
     if include_samples is not None:
-        snp_df_cols.extend(include_samples)
+        sample_cols = list(include_samples)
+        missing_sample_cols = [col for col in sample_cols if col not in df.columns]
+        if missing_sample_cols and len(sample_cols) == 1 and "GT" in df.columns:
+            sample_name = sample_cols[0]
+            df = df.with_columns(pl.col("GT").alias(sample_name))
+        snp_df_cols.extend(sample_cols)
 
     # Might be more memory efficient to use pandas index instead...
     snp_df = df.select(snp_df_cols).unique(maintain_order=True).with_row_index()
@@ -135,23 +153,10 @@ def make_count_matrix(
 
     # Create sparse matrices
     # sparse array is recommended...but doesnt work with adata
-    sparse_ref = csr_matrix(
-        (list(sc_counts.ref_count.values()), list(zip(*sc_counts.ref_count.keys()))),
-        shape=(snp_df.shape[0], len(bc_dict)),
-        dtype=np.uint16,
-    )
-
-    sparse_alt = csr_matrix(
-        (list(sc_counts.alt_count.values()), list(zip(*sc_counts.alt_count.keys()))),
-        shape=(snp_df.shape[0], len(bc_dict)),
-        dtype=np.uint16,
-    )
-
-    sparse_other = csr_matrix(
-        (list(sc_counts.other_count.values()), list(zip(*sc_counts.other_count.keys()))),
-        shape=(snp_df.shape[0], len(bc_dict)),
-        dtype=np.uint16,
-    )
+    matrix_shape = (snp_df.shape[0], len(bc_dict))
+    sparse_ref = _sparse_from_counts(sc_counts.ref_count, matrix_shape)
+    sparse_alt = _sparse_from_counts(sc_counts.alt_count, matrix_shape)
+    sparse_other = _sparse_from_counts(sc_counts.other_count, matrix_shape)
 
     # Create anndata With total as X
     adata = ad.AnnData(
@@ -171,17 +176,19 @@ def make_count_matrix(
     if include_samples is not None:
         adata.uns["samples"] = include_samples
 
-    # TODO: Allow for other features besides 'region' using include_features
-    # Could be case of no features, or feature is gene
-    if "region" in df.columns:
-        # Get unique snps and associated regions
+    feature_cols = include_features or []
+    if not feature_cols and "region" in df.columns:
+        feature_cols = ["region"]
 
-        # Create dict during analysis step instead
-        adata.uns["feature"] = (
+    if feature_cols:
+        feature_df = (
             df.join(snp_df, on=["chrom", "pos", "ref", "alt"], how="left")
-            .select(["region", "index"])
-            .to_pandas()
+            .select([*feature_cols, "index"])
+            .unique(maintain_order=True)
         )
+        if "region" not in feature_df.columns:
+            feature_df = feature_df.with_columns(pl.col(feature_cols[0]).alias("region"))
+        adata.uns["feature"] = feature_df.to_pandas()
 
         # region_snp_dict = dict(
         #     df.join(snp_df, on=["chrom", "pos", "ref", "alt"], how="left"
diff --git a/src/counting/parse_gene_data.py b/src/counting/parse_gene_data.py
index 6653ed1..682192c 100644
--- a/src/counting/parse_gene_data.py
+++ b/src/counting/parse_gene_data.py
@@ -106,12 +106,12 @@ def parse_gene_file(
     if feature is None:
         feature_list = df.select(pl.col("feature").unique()).to_series()
 
-        if "exon" in feature_list:
-            feature = "exon"
+        if "gene" in feature_list:
+            feature = "gene"
         elif "transcript" in feature_list:
             feature = "transcript"
-        elif "gene" in feature_list:
-            feature = "gene"
+        elif "exon" in feature_list:
+            feature = "exon"
         else:
             logger.warning("exon, gene or transcript not found in feature list: %s", feature_list)
 
@@ -244,6 +244,20 @@ def parse_intersect_genes(
     if parent_attribute is None:
         parent_attribute = "Parent"
 
+    # Guard against empty intersection file (0 variants in region)
+    intersect_path = Path(intersect_file)
+    if not intersect_path.exists() or intersect_path.stat().st_size == 0:
+        return pl.DataFrame(
+            schema={
+                "chrom": pl.Categorical,
+                "pos": pl.UInt32,
+                "ref": pl.Categorical,
+                "alt": pl.Categorical,
+                attribute: pl.Utf8,
+                parent_attribute: pl.Utf8,
+            }
+        )
+
     # AFTER performing gtf_to_bed and intersecting!
     df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0)
 
@@ -287,8 +301,23 @@ def parse_intersect_genes_new(
     if parent_attribute is None:
         parent_attribute = "Parent"
 
+    # Guard against empty intersection file (0 variants in region)
+    intersect_path = Path(intersect_file)
+    if not intersect_path.exists() or intersect_path.stat().st_size == 0:
+        return pl.DataFrame(
+            schema={
+                "chrom": pl.Categorical,
+                "pos": pl.UInt32,
+                "ref": pl.Categorical,
+                "alt": pl.Categorical,
+                attribute: pl.Utf8,
+                parent_attribute: pl.Utf8,
+            }
+        )
+
     # AFTER performing gtf_to_bed and intersecting!
     df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0)
+    schema_names = list(df.collect_schema().names())
 
     vcf_schema = [
         pl.col("chrom").cast(pl.Categorical),
@@ -298,12 +327,12 @@ def parse_intersect_genes_new(
     ]
 
     # Expect at min 10 cols, 11 if GT included
-    if len(df.columns) > 10:
-        subset_cols = [df.columns[i] for i in [0, 2, 3, 4, 5, -2, -1]]
+    if len(schema_names) > 10:
+        subset_cols = [schema_names[i] for i in [0, 2, 3, 4, 5, -2, -1]]
         new_cols = ["chrom", "pos", "ref", "alt", "GT", attribute, parent_attribute]
         vcf_schema.append(pl.col("GT").cast(pl.Categorical))
     else:
-        subset_cols = [df.columns[i] for i in [0, 2, 3, 4, -2, -1]]
+        subset_cols = [schema_names[i] for i in [0, 2, 3, 4, -2, -1]]
         new_cols = ["chrom", "pos", "ref", "alt", attribute, parent_attribute]
 
     # Parse dataframe columns
diff --git a/src/counting/run_counting_sc.py b/src/counting/run_counting_sc.py
index 08a0240..c9c4c20 100644
--- a/src/counting/run_counting_sc.py
+++ b/src/counting/run_counting_sc.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import logging
 import re
 from pathlib import Path
 
@@ -98,39 +99,13 @@ def __init__(
                 self.intersect_file = str(
                     Path(self.temp_loc) / f"{variant_prefix}_intersect_regions.bed"
                 )
-                self.is_gene_file = False
-            elif re.search(r"\.g[tf]f(?:\.gz)?$", f_ext, re.I):
-                self.feature_type = "genes"
-                self.intersect_file = str(
-                    Path(self.temp_loc) / f"{variant_prefix}_intersect_genes.bed"
-                )
-                self.is_gene_file = True
-                gtf_prefix = re.split(r".g[tf]f", Path(self.feature_file).name)[0]
-                self.gtf_bed = str(Path(self.temp_loc) / f"{gtf_prefix}.bed")
-                self.use_feature_names = True  # Use feature attributes as region names
-            elif re.search(r"\.gff3(?:\.gz)?$", f_ext, re.I):
-                self.feature_type = "genes"
-                self.intersect_file = str(
-                    Path(self.temp_loc) / f"{variant_prefix}_intersect_genes.bed"
-                )
-                self.is_gene_file = True
-                gtf_prefix = re.split(r".gff3", Path(self.feature_file).name)[0]
-                self.gtf_bed = str(Path(self.temp_loc) / f"{gtf_prefix}.bed")
-                self.use_feature_names = True  # Use feature attributes as feature names
             else:
                 raise ValueError(
-                    f"Invalid feature file type. Expected .bed, .gtf, or .gff3, got: {self.feature_file}"
+                    f"Invalid feature file type. Expected BED or MACS2 peak file, got: {self.feature_file}"
                 )
 
         else:
             self.intersect_file = self.vcf_bed
-            self.is_gene_file = False
-
-        # TODO UPDATE THIS WHEN I ADD AUTOPARSERS
-        if self.is_gene_file:
-            # Possible edge case of vcf and gtf prefix conflict
-            if self.vcf_bed == self.gtf_bed:
-                self.gtf_bed = str(Path(self.temp_loc) / "genes.bed")
 
 
 @tempdir_decorator
@@ -144,7 +119,7 @@ def run_count_variants_sc(
     out_file: str | None = None,
     temp_loc: str | None = None,
 ) -> None:
-    """Run single-cell variant counting pipeline.
+    """Run single-cell ATAC variant counting pipeline.
 
     Parameters
     ----------
@@ -155,7 +130,7 @@ def run_count_variants_sc(
     barcode_file : str
         Path to cell barcode file (one barcode per line).
     feature_file : str | None, optional
-        Path to feature/region file (BED, GTF, or GFF3).
+        Path to feature/region file (BED or MACS2 peak file).
     samples : str | list[str] | None, optional
         Sample ID(s) to process.
     use_region_names : bool, optional
@@ -192,13 +167,13 @@ def run_count_variants_sc(
     )
 
     assert count_files.feature_file is not None
+
     intersect_vcf_region(
         vcf_file=count_files.vcf_bed,
         region_file=count_files.feature_file,
         out_file=count_files.intersect_file,
     )
 
-    # TODO: handle use_region_names better
     df = parse_intersect_region_new(
         intersect_file=count_files.intersect_file,
         samples=count_files.samples,
@@ -206,13 +181,26 @@ def run_count_variants_sc(
         region_col=None,
     )
 
+    # Guard: if no variants survived intersection, warn and write empty output
+    if df.is_empty():
+        logging.getLogger(__name__).warning(
+            "No variants found after intersection — writing empty output file."
+        )
+        import anndata as ad
+
+        ad.AnnData().write_h5ad(count_files.out_file)
+        return
+
     # TODO: handle case where barcode file contains multiple columns
     with open(count_files.barcode_file) as file:
         bc_dict = {line.rstrip(): i for i, line in enumerate(file)}
 
     # Generate Output
     adata = make_count_matrix(
-        bam_file=count_files.bam_file, df=df, bc_dict=bc_dict, include_samples=count_files.samples
+        bam_file=count_files.bam_file,
+        df=df,
+        bc_dict=bc_dict,
+        include_samples=count_files.samples,
     )
 
     # Write outputs
diff --git a/src/mapping/intersect_variant_data.py b/src/mapping/intersect_variant_data.py
index 6d52aba..307ac64 100644
--- a/src/mapping/intersect_variant_data.py
+++ b/src/mapping/intersect_variant_data.py
@@ -10,9 +10,8 @@
 import os
 import subprocess
 from pathlib import Path
+from typing import TYPE_CHECKING
 
-import numpy as np
-import polars as pl
 import pysam
 
 # Multi-format variant support
@@ -25,6 +24,9 @@
 
 logger = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    import polars as pl
+
 
 def vcf_to_bed(
     vcf_file: str | Path,
@@ -148,7 +150,7 @@ def make_intersect_df(
     intersect_file: str,
     samples: list[str],
     is_paired: bool = True,
-) -> pl.DataFrame:
+) -> "pl.DataFrame":
     """Parse intersection file into a typed polars DataFrame.
 
     Parameters
@@ -165,6 +167,9 @@ def make_intersect_df(
     pl.DataFrame
         Parsed intersection data with alleles split by sample.
     """
+    import numpy as np
+    import polars as pl
+
     # Create Dataframe
     df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0)
 
diff --git a/src/mapping/remap_utils.py b/src/mapping/remap_utils.py
index ebf8534..416ebde 100644
--- a/src/mapping/remap_utils.py
+++ b/src/mapping/remap_utils.py
@@ -1,13 +1,15 @@
 import logging
 from collections.abc import Generator
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
-import polars as pl
 from pysam import AlignedSegment, AlignmentFile
 
 logger = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    import polars as pl
+
 
 # Generator for iterating through bam
 def paired_read_gen(
@@ -127,13 +129,13 @@ def _build_ref2read_maps(read: AlignedSegment) -> tuple[dict[int, int], dict[int
 
 
 def get_read_het_data(
-    read_df: pl.DataFrame,
+    read_df: "pl.DataFrame",
     read: AlignedSegment,
     col_list: list[str],
     max_seqs: int | None = None,
     include_indels: bool = False,
     insert_qual: int = 30,
-) -> tuple[list[str], list[Any], list[pl.Series]] | None:
+) -> tuple[list[str], list[Any], list["pl.Series"]] | None:
     """Extract heterozygous variant data from read with indel support.
 
     Args:
@@ -150,6 +152,8 @@ def get_read_het_data(
         split_qual: List of quality score segments
         allele_series: List of polars Series with allele data
     """
+    import polars as pl
+
     pos_list = read_df.select(["start", "stop"]).rows()
 
     assert read.query_sequence is not None, "Read has no query sequence"