diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index c612152..c569dfb 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -145,6 +145,17 @@ "./scientific-problem-selection" ] }, + { + "name": "igv-reports", + "source": "./", + "description": "Build self-contained, offline HTML genomic-region reports with igv-reports (create_report). Cohort-aware driver + post-render structural and content verifiers. Includes ONT 5mC/5hmC methylation viewer presets.", + "category": "life-sciences", + "tags": ["bioinformatics", "genomics", "visualization", "variant-validation", "structural-variants", "ont", "nanopore", "methylation", "igv", "html-report"], + "strict": false, + "skills": [ + "./igv-reports" + ] + }, { "name": "tooluniverse", "source": "./tooluniverse", diff --git a/README.md b/README.md index 20015ed..7bb07d1 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ This marketplace provides MCP (Model Context Protocol) servers and skills for li /plugin install nextflow-development@life-sciences /plugin install scvi-tools@life-sciences /plugin install scientific-problem-selection@life-sciences +/plugin install igv-reports@life-sciences ``` For servers requiring authentication (all except PubMed), configure credentials after installation: @@ -146,6 +147,19 @@ Systematic framework for scientific problem selection and strategic research dec - Navigate decision trees in active projects - Strategic research planning and problem choice +#### igv-reports +**Plugin ID**: `igv-reports@life-sciences` + +Build self-contained, offline HTML genomic-region reports with [igv-reports](https://github.com/igvteam/igv-reports) (`create_report`). Cohort-aware driver + post-render structural and content verifiers on top of the upstream Python package. Includes ONT 5mC/5hmC methylation viewer presets. 
+ +**Use cases:** +- Generate per-sample HTML viewers for SV breakpoints, viral integrations, variants, fusion junctions, ChIP peaks, or ROIs +- Build cohort-wide report bundles (one HTML per sample + index) +- Per-read ONT 5mC/5hmC methylation views at promoters / gene bodies / DMRs +- Auto-verify rendered HTML structure and (opt-in) read-count anchors so cohort builds gate on correctness, not just exit code + +**Requirements**: `pip install -U 'igv-reports>=1.16.0'` (upstream engine) + ## Detailed Installation ### 1. Add the marketplace (one time) @@ -172,6 +186,7 @@ Systematic framework for scientific problem selection and strategic research dec /plugin install nextflow-development@life-sciences /plugin install scvi-tools@life-sciences /plugin install scientific-problem-selection@life-sciences +/plugin install igv-reports@life-sciences ``` ### 3. Configure credentials (if needed) diff --git a/igv-reports/LICENSE.txt b/igv-reports/LICENSE.txt new file mode 100644 index 0000000..d2a37d3 --- /dev/null +++ b/igv-reports/LICENSE.txt @@ -0,0 +1,201 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. 
+ +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. 
For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. 
If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. 
You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/igv-reports/SKILL.md b/igv-reports/SKILL.md new file mode 100644 index 0000000..7d69985 --- /dev/null +++ b/igv-reports/SKILL.md @@ -0,0 +1,751 @@ +--- +name: igv-reports +description: Use when the user wants an HTML, clickable, browseable, offline, or emailable viewer of genomic data — phrases like "HTML IGV report", "offline IGV", "self-contained HTML", "clickable viewer", "create_report", "igv-reports", "email this viewer", or any browseable HTML of reads at variants, fusion breakpoints, SV junctions, viral integrations, ChIP peaks, ROIs, or ONT 5mC/5hmC methylation views at promoters/gene bodies/DMRs. Trigger even when the user doesn't say "igv-reports" — giveaway is HTML/clickable/offline plus genomic regions. Also fire on /igv-reports. DO NOT use for static PNG/PDF/SVG IGV screenshots — use the igv-screenshots skill instead. +--- + +# igv-reports + +This skill builds **self-contained HTML genomic-region reports** with +[igv-reports](https://github.com/igvteam/igv-reports) (`create_report`). +Each report is a single browseable HTML containing the igv.js viewer plus +embedded data slices for every region. No server, no internet, no IGV +install needed at view time. + +The skill has three entry points: +- **build** — one-shot: sites BED + BAM(s) ± VCF → HTML. +- **cohort** — multi-sample driver from a samplesheet → per-sample HTMLs + index. 
+- **prep-track** — utility: convert plain-gzip GFF/GTF/BED.gz into a + bgzip + tabix-indexed track that igv-reports can load. + +## What this skill is (and is not) + +This skill is a **driver layer** on top of the upstream `igv-reports` +Python package by the IGV team +([github.com/igvteam/igv-reports](https://github.com/igvteam/igv-reports)). +The naming is unavoidable — both share the `igv-reports` name. + +| Component | Source | Role | +|---|---|---| +| `create_report` CLI | upstream PyPI package `igv-reports` | does the actual HTML rendering | +| `scripts/build_igvreports.py` | **this skill** | wraps `create_report` with default-track resolution, cohort/samplesheet mode, SIF auto-detect | +| `scripts/verify_{report,cohort,anchors}.py` | **this skill** | post-render structural + content audits (not in upstream) | +| `scripts/prep_track.sh` | **this skill** | bgzip+tabix utility for annotation tracks | + +## Install + +```bash +# 1. Install the UPSTREAM igv-reports package (provides `create_report`): +pip install -U 'igv-reports>=1.16.0' + +# 2. The skill's wrapper scripts ship inside this plugin. Once the plugin is +# installed, the scripts live alongside SKILL.md. +``` + +If you only need raw `create_report` (no cohort mode, no verifiers, no +auto-tracks), skip this skill entirely and use upstream directly — +see [igvteam/igv-reports](https://github.com/igvteam/igv-reports) docs. + +## Quickstart + +```bash +python scripts/build_igvreports.py \ + --genome hg38 \ + --sites sites.hg38.bed \ + --bam tumor.bam normal.bam \ + --fasta /path/to/hg38.fa \ + --no-default-tracks \ + --extra-track /path/to/cpg_islands.bed.gz \ + --extra-track /path/to/gencode.v47.annotation.gff3.gz \ + --output report.hg38.html +``` + +If you run many reports across the same genome build, set up a databases YAML +once (schema in `references/databases_config_paths.md`) and point +`$IGV_REPORTS_DB_CONFIG` at it — then `--fasta` and `--no-default-tracks` +become optional. 
+ +## Environment overrides + +All optional. Set per-shell or in a project `.env`: + +| Var | Effect | +|---|---| +| `IGV_REPORTS_DB_CONFIG` | Path to a databases YAML resolving `--genome` to FASTA + default tracks (see `references/databases_config_paths.md`) | +| `IGV_REPORTS_SIF` | Path to an `igv-reports` apptainer SIF (offline / HPC use). Galaxy depot: `https://depot.galaxyproject.org/singularity/igv-reports:1.16.0--pyh7cba7a3_0` | +| `SAMTOOLS_SIF_DEFAULT` | Path to a `samtools` SIF (verifier only) | +| `IGV_REPORTS_BIND` | Colon-separated bind paths for singularity. Unset = no binds. | + +Driver flags `--fasta` and `--no-default-tracks` let you skip the databases +YAML entirely without setting any env var. `--no-apptainer` forces the PATH +`create_report` path even on a SLURM node. The hermetic `tests/unit/` suite +runs anywhere with `pytest` + Python ≥ 3.10. + +## When to use which entry point + +| User request | Entry point | +|---|---| +| "Make an HTML for these 5 SV breakpoints in tumor.bam" | **build** | +| "Give me one HTML per patient for the cohort integration calls" | **cohort** | +| "create_report fails with 'not BGZF' on this gencode" | **prep-track** | + +## Defaults (locked in) + +- Tracks always loaded, top-to-bottom in the viewer: + 1. CpG islands (BED, plain or bgzipped) + 2. Gencode full annotation (GFF3.gz, **transcripts + exons + CDS + UTRs**, NOT a gene-level-only file) + 3. RepeatMasker (BED.gz, bgzipped + tabix-indexed) + Plus the user's BAM(s), VCF, and any extra tracks they pass. +- `--flanking 300` bp on either side of each site (good for SV breakpoints + and point variants alike). Override per call if needed. +- `--standalone` so the HTML is offline-viewable. +- Output filename includes the genome tag — e.g. `cohort.hg38.html` — + so downstream genome-tag enforcement hooks pass. +- Reference FASTA is resolved either explicitly via `--fasta` or via the + YAML pointed to by `$IGV_REPORTS_DB_CONFIG`. 
Supported genome IDs out of + the box: `hg38`, `mm10`, `mm39`, `t2t_CHM13v2_plusY`, `GRCh37` (extend + `GENOME_ALIASES` in `scripts/build_igvreports.py` if needed). +- Per-genome default track availability when using the YAML is recorded in + `references/databases_config_paths.md` — read it before assembling tracks + so the skill doesn't try to load a track that doesn't exist for the + selected genome. + +## Sites BED format (critical) + +igv-reports' BED parser reads fields **by position** and trips on a header +row (`ValueError: invalid literal for int() with base 10: 'start'`). Always +emit a **plain headerless 4-column BED**: + +``` +chr start end name +chr2 25227855 25342590 DNMT3A_full_gene +``` + +Tab-separated; the first line above only labels the columns — omit it from the real file. The `name` becomes the row label in the report's variant +table — make it specific enough to identify the site after deduping. + +By default `create_report` shows only the chr/start/end position columns +in the clickable table. To surface the `name` (or any extra columns from +a 5+ column BED), pass `--info-columns <col1,col2,...>` to the driver: + +```bash +python scripts/build_igvreports.py ... --info-columns name +python scripts/build_igvreports.py ... --info-columns gene_name,score +``` + +Column names are matched by header (so a `#chrom\tstart\tend\tname\tscore` +header works). For positional BED without a header, the convention is the +4th column = `name`, 5th = `score`, 6th = `strand`. + +The project's `enforce-genome-tag.sh` hook requires a genome tag in the BED +filename: use `sites.hg38.bed`, not `sites.bed`. + +### `--type` for BED-style sites + +When the sites input is a BED (not a VCF), pass `--type mutation` to +`create_report` (or the driver). This gives the right viewer behavior at +each row — one locus per row, no split-screen, table on top. Without it, +some BED layouts trigger create_report's split-screen junction view by +heuristic. Use `--type variant` for VCF sites, or omit for create_report's +auto-detection (only safe with a VCF). 
+ +```bash +python scripts/build_igvreports.py ... --type mutation --info-columns name +``` + +## Pitfalls (the skill should encode and/or detect these) + +| Symptom | Root cause | Fix | +|---|---|---| +| `ValueError: invalid literal for int()` on first row | Header row in sites BED | Strip header — plain BED | +| `UnicodeDecodeError: byte 0x8b` reading a track | igv-reports reading bgzip as text | Filename must end `.gff3.gz` / `.bed.gz` AND be true bgzip (check with `file <path>` for "extra field") | +| `tabix: not BGZF` | Track was plain-gzipped, not bgzipped | Run **prep-track** entry point | +| `tabix: out of order` while indexing | GFF/GTF/BED records not pos-sorted within chr | **prep-track** does `sort -k1,1 -k4,4n` before bgzip | +| Annotation track empty in viewer | Tabix returns no rows in displayed window — often correct biology (e.g., CGI-distal site) | Confirm with `tabix file region` | +| Genome ID lookup fails with `--genome hg38` | igv.js bundled IDs require internet at view + render time | Use `--fasta /path/to/local.fa` instead (always works offline) | + +Full pitfalls + create_report flag reference in `references/best_practices.md`. + +## How to run — quick recipe + +Ensure `create_report` is on PATH (`pip install -U 'igv-reports>=1.16.0'`). +If you use a conda env, activate it first. + +Then call the bundled driver script (paths relative to the installed plugin): + +```bash +python scripts/build_igvreports.py \ + --sites results/run/inputs/sites.hg38.bed \ + --bam tumor.bam normal.bam \ + --vcf calls.vcf \ + --genome hg38 \ + --fasta /path/to/hg38.fa \ + --no-default-tracks \ + --output results/run/reports/cohort.hg38.html +``` + +The driver: +- Resolves the genome's CpG / gencode / rmsk paths from `$IGV_REPORTS_DB_CONFIG` + if set, skipping any that aren't configured for the chosen genome. +- Validates the sites BED is headerless and that all rows have `start < end`. +- Calls `create_report` with `--flanking 300 --standalone`. 
+- Writes a logs/ entry capturing the full command, the flanking value, the + per-region embedded data sizes, and the resolved track list — useful for + reproducibility and audit-trail expectations. + +For multi-sample cohorts, use `--samplesheet samplesheet.tsv` instead of +`--bam/--vcf`. Samplesheet format: `sample, bam_tumor, bam_normal, vcf, sites_bed`. +The driver emits one HTML per sample plus a top-level `index.html` that lists +all samples with links. Pass `--jobs N` to build the per-sample HTMLs in +parallel via `ThreadPoolExecutor` (each `create_report` call is I/O-bound on +BAM slicing, so threading scales well; `--jobs 6` on a 6-patient cohort takes +roughly 1/6 of the sequential wall-clock time). Default is `--jobs 1`. Layout matches +the ATLL viral-integration reference implementation: + +``` +results/<run>/ +├── inputs/<sample>/sites.<genome>.bed +├── reports/<sample>.<genome>.html +├── reports/index.html +└── logs/run_<YYYYMMDD_HHMMSS>.log +``` + +## prep-track — fixing a non-bgzip track + +If a GFF3/GTF/BED.gz is plain-gzip rather than bgzip, igv-reports fails +silently or with an obscure error. Two modes: + +**In-place** (with `.bak.original_gzip` backup) — replaces the original: + +```bash +bash scripts/prep_track.sh /path/to/track.gff3.gz +``` + +**Sibling file** (non-destructive — original untouched) — write the +bgzipped+indexed track to a new path. Use this when other pipelines point +at the original `.gff3.gz` and you can't risk a brief window where the +file is replaced: + +```bash +bash scripts/prep_track.sh /path/to/track.gff3.gz \ + --out /path/to/track.bgz.gff3.gz +``` + +The script (both modes): +1. Backs up the original to `.bak.original_gzip` (in-place mode only). +2. `gunzip -c`s the file. +3. Sorts by `chr` then numeric `pos` (`sort -k1,1 -k4,4n`). + (Gencode delivers records interleaved by feature type at the same locus — + tabix requires pos-sorted.) +4. `bgzip`s to target. +5. `tabix -p <preset>`s. +6. Verifies a sample tabix query returns rows. + +Requires `bgzip` and `tabix` from htslib on PATH. 
+ +**Diagnostic** — `file <path>` for distinguishing the two formats: +- Plain gzip: `gzip compressed data, from Unix, original size <bytes>` +- bgzip: `Blocked GNU Zip Format (BGZF; gzipped file with extra field)` + +The `extra field` keyword is the bgzip giveaway. + +## When generating an answer.md / run.sh for the user + +The driver script (`build_igvreports.py`) deliberately abstracts the +underlying `create_report` flags — it sets `--standalone`, `--fasta`, the +`--flanking 300` default, and the YAML-resolved annotation tracks +internally so the user doesn't have to remember them. That abstraction is +good for ergonomics but bad for auditability: a reviewer reading the +`answer.md` later can't see what flags are actually being invoked without +opening the driver source. + +To keep both ergonomics and auditability: when you produce a runnable command for the user, **also +include a code block titled "Equivalent direct create_report invocation" +that shows the fully-expanded command** with all flags and resolved track +paths inline. The user should see the wrapper command they're going to +run AND the underlying command it expands to. Example: + +```` +## Run + +```bash +python scripts/build_igvreports.py --genome mm10 --sites peaks.mm10.bed \ + --bam ./data/ip.bam ./data/input.bam \ + --output reports/peaks_qc.mm10.html +``` + +### Equivalent direct create_report invocation + +```bash +create_report peaks.mm10.bed \ + --fasta /path/to/mm10.fa \ + --flanking 300 --standalone \ + --tracks ./data/ip.bam ./data/input.bam \ + /path/to/mm10_CpGIslands.bed \ + /path/to/gencode.vM25.annotation.gtf.gz \ + /path/to/rmsk_all_repeats_mm10.bed.gz \ + --title "ChIP-seq peak QC (mm10) — IP vs Input" \ + --output reports/peaks_qc.mm10.html +``` +```` + +This costs you ~10 lines and gives the reviewer a full audit trail. For +cohort runs, show the expanded form for ONE representative sample only — +the others differ only in BAM/VCF paths. 
+ +## Post-render verification + +`scripts/verify_report.py` parses a built HTML and confirms it actually +contains what its inputs declared. Six checks: `html_exists`, +`html_min_size`, `region_count` (tableJson rows == sites BED rows), +`region_coords` (each BED row finds a matching `(chrom, start+1, end[, name])` +in tableJson — BED is 0-based, the HTML stores 1-based start), `region_sessions` +(sessionDictionary has one entry per row), and `tracks_present` (every +`name` from `--track-config` or every basename from positional `--tracks` +appears in the decoded igv.js session's `tracks[].name` list). + +```bash +python scripts/verify_report.py \ + --html results/<run>/reports/sample.hg38.html \ + --sites results/<run>/inputs/sites.hg38.bed \ + --track-config results/<run>/inputs/tracks.json \ + --min-size-mb 1.0 \ + --out results/<run>/reports/sample.verify.tsv \ + --fail-on-fail +``` + +Output is a TSV with columns `check / status / observed / expected / details` +(also printed to stdout). With `--fail-on-fail`, exits nonzero if any check +is FAIL — wire this into Snakemake / CI so the pipeline gates on render +quality, not just on `create_report`'s exit code. + +NOTE: `--standalone` replaces every track URL with an inlined `data:` URL +after slicing, so URL paths are unrecoverable from the embedded session. +The check matches on track NAMES (which `--standalone` preserves) — for +`--track-config` JSON pass meaningful names; positional `--tracks` mode +uses basenames. + +### Cohort-level verification (`verify_cohort.py`) + +The per-sample verifier above confirms each HTML is internally consistent +but cannot tell whether sample-1's HTML accidentally embeds sample-2's BAM +(e.g., samplesheet typo, copy-paste, tumor/normal slot swap). 
For cohort +runs, `scripts/verify_cohort.py` adds five cross-sample checks: + +| Check | What it asserts | +|---|---| +| `cohort_html_coverage` (global) | Each samplesheet row has exactly one HTML; flags missing + extras | +| `sample_tracks_match` (per-sample) | Each HTML's session contains every BAM/VCF basename declared in THAT row | +| `no_cross_sample_contamination` (per-sample) | Each HTML contains no basename that belongs to a DIFFERENT row's track columns (default tracks from `databases_config.yaml` are allow-listed) | +| `sample_id_embedded` (per-sample) | The `sample` column value appears in the HTML's `<title>` or filename | +| `index_consistency` (global) | `index.html` links exactly the samplesheet sample set; each target exists and is non-empty | + +**Auto-invoked by default** at the end of `build_igvreports.py --samplesheet` +cohort runs. Disable with `--no-verify`; gate the pipeline with +`--fail-on-fail`. Standalone invocation: + +```bash +python scripts/verify_cohort.py \ + --samplesheet samplesheet.tsv \ + --reports-dir results/<run>/reports/ \ + --genome hg38 \ + --out results/<run>/reports/cohort_verify.tsv \ + --summary results/<run>/reports/cohort_verify.summary.md \ + --fail-on-fail +``` + +The TSV adds a `sample` column on top of the per-sample verify schema, with +`"*"` for cohort-global rows. The markdown rollup (`--summary`) groups +PASS/FAIL counts by check + lists every failure inline. + +Worked regression: `tests/integration/cohort_verify/scenarios.sh` builds a +3-sample cohort and asserts each of four corruption scenarios (missing +HTML, sample swap, index drift, truncated HTML) triggers the expected +check FAILs. + +### Content verification (`verify_anchors.py`) — opt-in, slow + +`verify_cohort.py` proves the HTML *says* the right thing. It can NOT +confirm the embedded BAM *slice* contains the data it claims to. Two +failure modes slip past structural checks: + +1. 
**Sample swap with matching basename** — the cohort loop wired the wrong + BAM into `sample_1`'s build, but the swapped BAM's `Path.stem` happens + to match what `sample_1`'s row declared (or two files in different dirs + share a basename). Track name passes; slice content is wrong. +2. **Silent empty slice** — region rendered, but the slice has 0 reads + (failed `samtools index`, source BAM corruption, coords outside coverage). + +`scripts/verify_anchors.py` closes the gap by re-running `samtools view -c` +against both the source BAM (at generate time) and the embedded slice (at +verify time), then comparing counts. Two-mode workflow: + +```bash +# 1. After the cohort renders cleanly, freeze the read counts as a regression fixture. +python scripts/verify_anchors.py generate \ + --samplesheet samplesheet.tsv \ + --sites sites.hg38.bed \ + --out anchors.hg38.tsv + +# 2. Re-verify any time after — works against a fresh build of the same inputs, +# or to audit an existing HTML for unexpected content drift. +python scripts/verify_anchors.py verify-cohort \ + --samplesheet samplesheet.tsv \ + --reports-dir results/<run>/reports/ \ + --genome hg38 \ + --anchors anchors.hg38.tsv \ + --out results/<run>/reports/cohort_verify_anchors.tsv \ + --fail-on-fail +``` + +Or chained into the build driver: + +```bash +# Freeze anchors at build time: +python scripts/build_igvreports.py --samplesheet ... --anchors-mode generate \ + --anchors anchors.hg38.tsv + +# Verify a later build against frozen anchors: +python scripts/build_igvreports.py --samplesheet ... 
--anchors-mode verify \ + --anchors anchors.hg38.tsv --fail-on-fail +``` + +Anchors TSV schema (`#`-prefixed header per lab BED convention): + +``` +#sample track_name track_type chrom start end expected tolerance min max notes +``` + +`track_type` is one of: +- `bam` — `expected` is the count from `samtools view -c -F 1536` against + the source BAM at generate time, and the same count against the + embedded BAM slice at verify time. Default when the column is absent + (backwards compat — pre-2026-05-19 anchor files keep working). +- `bedgraph` — `expected` is the number of data rows in the source + bedGraph overlapping the region (CpG count for methylation data, + peak count for ChIP coverage). Verify-time count comes from the + wig/bedGraph slice embedded by igv-reports in the HTML — gzip-decoded + in-memory, no samtools needed. + +bedGraph tracks come from the samplesheet's `extra_tracks` column. +Anchors for them are generated automatically alongside BAM anchors when +you run `verify_anchors.py generate` against a samplesheet that includes +bedGraph entries (e.g. `*.5mC.bedgraph`, `*.5hmC.bg`, plain or `.gz`). + +`tolerance` is a ratio (default 5%). `min`/`max` are absolute bounds that +override tolerance when set — useful for known-positive sites like +"this integration must have ≥20 reads" or "this promoter must have ≥10 CpGs". + +samtools is resolved in this order: `--samtools-sif PATH` → `$SAMTOOLS_SIF` +→ `$SAMTOOLS_SIF_DEFAULT` → PATH `samtools`. On HPC, prefer a SIF to avoid +the NFS conda cold-start tax. bedGraph anchors don't require samtools. + +**Why this matters for methylation viewers**: the silent-failure mode for +methylation reports is "region rendered, slice has 0 CpGs" — an empty +bedGraph slice because the source had no calls in that window, or +because the slice extraction silently dropped them. Pure structural +verification confirms the bedGraph track is in the HTML but can't tell +whether it's empty. 
The bedgraph-anchor mode closes this gap. + +**Why opt-in and not default:** the verify step shells out to samtools per +(sample × region) and indexes each slice — ~1 s/anchor. For a 6-sample +cohort × 50 regions that's ~5 min on top of the structural verify (which +runs in seconds). Reach for this when sample swap or content regression +is a real concern; the structural verifier is sufficient for routine builds. + +Worked regression: `tests/integration/anchor_verify/scenarios.sh` builds a +2-sample cohort and asserts each of four content scenarios (tolerance +violation, min-bound violation, corrupted slice, missing anchor) triggers +the expected PASS / FAIL / SKIP outcome. + +## Output and workflow logging + +Every run logs to `logs/run_<YYYYMMDD_HHMMSS>.log` next to the reports dir. +The log captures: +- Resolved track paths (per genome, after databases_config.yaml lookup). +- The exact `create_report` command. +- The flanking value used (default **300 bp** — this is the value that's + baked into all the embedded data slices, so audit trails depend on it). +- Per-region embedded data sizes (extracted post-render so the user can + see which regions inflated the HTML). +- Total HTML size. + +This satisfies CLAUDE.md §"Logging and Audit Trail" — every run is +reproducible from the log alone. + +## Track choice nuances + +For gencode on hg38, the default points at +`gencode.v47.annotation.gff3.gz` (full annotation, bgzip + tabix). This +gives transcript models with exons / CDS / UTRs. The gene-level-only +companion (`gencode.v47.genes.annotation.sorted.gff3.gz`) renders only +solid gene boxes and is fine for high-zoom views, but the full annotation +is the right default for read-level inspection at integration / fusion / +SV junctions. + +For mouse genomes, `databases_config.yaml` ships `.gtf.gz` paths instead. +GTFs work in igv-reports if bgzip + tabix-indexed; **prep-track** converts +plain-gzip GTFs the same way it does GFF3s. 
+ +For T2T-CHM13, only the FASTA + GTF + CGI are indexed in our DB; rmsk is +absent and is auto-skipped by the driver. The variant table will load +without rmsk; flag this in the run log. + +## Common-case examples + +The `examples/` directory has runnable templates: + +- `single_sample.sh` — one BAM + one VCF + a sites BED → one HTML. +- `cohort_samplesheet.sh` — TSV-driven multi-sample run. +- `prep_track_demo.sh` — convert a plain-gzip gencode to bgzip+tabix. +- `methylation_ont/` — ONT 5mC/5hmC viewer (BAM with `colorBy: basemod2` + + per-sample bedGraph at fixed y-axis 0..100). End-to-end worked + example with pre-sliced data; recipe.md explains the slots. + +These are reference implementations; copy and edit them for new runs +rather than starting from scratch. + +## Tests + +Three-layer suite under `tests/`, orchestrated by `tests/run_all.sh`: + +| Layer | What it covers | Runtime | Needs | +|---|---|---|---| +| **unit** (`tests/unit/`) | parser layer of `verify_report.py` + `verify_anchors.py` — TSV loading, status decision, session-entry locator, balanced-brace JSON extractor, decode round-trip — all with synthetic inputs | ~1 s | pytest | +| **smoke** (`tests/smoke/`) | `samtools_count` / `samtools_index` / full slice-decode-and-count round-trip against the committed `tests/fixtures/tiny_colo829.hg38.bam` (457 KB, sliced from public ONT COLO829 release) | ~3 s | pytest + samtools (SIF or PATH) | +| **integration** (`tests/integration/`) | end-to-end: build a 2-/3-sample cohort, structural verify, anchor verify, run 4 corruption scenarios per verifier | ~7 min cold, ~30 s cached | full cohort BAMs (lab default OR `IGV_REPORTS_TEST_BAM_{1,2,3}` env override). 
SKIPs with exit 77 if neither is available | + +```bash +bash tests/run_all.sh # all three layers +bash tests/run_all.sh --unit-only # ~1 s — fastest feedback loop +bash tests/run_all.sh --no-integration # ~12 s — works on any machine +bash tests/run_all.sh --integration-only +``` + +The fixture provenance + regeneration recipe live in +[tests/fixtures/README.md](tests/fixtures/README.md). Anchor counts the +smoke layer expects (chr2=5, chr7=9) are the contract — any fixture +regeneration that changes them must also update the smoke test constants. + +## ONT methylation viewers (specialized path) + +For per-read 5mC/5hmC visualization the positional `--tracks` API does +not work — you need named tracks with `colorBy: "basemod2"` on the BAMs +and `min: 0, max: 100` on the bedGraph tracks (cross-sample y-axis lock, +see `rules/igv.md`). Use the `--track-config <json>` passthrough: + +```bash +# 1. Write a YAML spec listing samples (see tracks_spec.example.yaml). +# 2. Generate tracks.json with the right defaults baked in: +python scripts/generate_tracks_json.py \ + --spec tracks_spec.yaml --run-dir results/<run>/ \ + --out results/<run>/tracks.json + +# 3. Build the report: +python scripts/build_igvreports.py \ + --sites results/<run>/sites.hg38.bed \ + --track-config results/<run>/tracks.json \ + --genome hg38 --flanking 0 \ + --type mutation --info-columns name \ + --output results/<run>/methylation_report.hg38.html +``` + +### Annotation shortcuts in the YAML + +The default `--tracks` path (SV/variant viewers) auto-resolves CpG islands, +gencode, and RepeatMasker from a databases YAML when you pass +`--genome hg38`. On the `--track-config` (methylation) path you used to +have to hand-paste those paths into the YAML. As of the methylation-polish +round, you can use a `default:` shortcut for the same resolution: + +```yaml +genome: hg38 + +annotation: + # SHORTCUT — resolved from the databases YAML for the genome above. 
+ # Gets an Okabe-Ito color + sensible displayMode you can override per entry. + - default: gencode + - default: cgi + - default: repmasker + - default: epdnew_coding # hg38 only + - default: epdnew_noncoding # hg38 only + + # Mix with EXPLICIT entries when needed (e.g. a pre-sliced custom track): + - name: "My custom peak set" + url: peaks/promoter_slices.bed + format: bed +``` + +Valid `default:` keys: `cgi`, `gencode`, `repmasker`, `epdnew_coding`, +`epdnew_noncoding`. Mixing both forms is supported; order is preserved. +Override the canned `name`/`color`/`displayMode` per entry by adding the +field alongside `default:`. The shortcut needs a top-level `genome:` in +the spec, plus a databases YAML on `--db-config PATH` or +`$IGV_REPORTS_DB_CONFIG` (see `references/databases_config_paths.md` for +the schema). + +Key methylation-specific defaults: +- `--flanking 0` (sites BED already encodes the window — promoter/gene span). +- `--info-columns name` (surface the BED `name` column in the variant table). +- `--type mutation` (one-locus view per row; not split-screen). +- bedGraph not bigwig — igv-reports cannot slice `.bw` directly. + +When `--track-config` is set the driver bypasses the auto-resolved +default annotation tracks (CGI / gencode / rmsk) and the `--bam` / +`--vcf` / `--extra-track` flags — the JSON is the source of truth. +Build annotation slices into the JSON instead. + +**`--apptainer` is auto-detected**: the driver flips to the apptainer SIF +pointed to by `$IGV_REPORTS_SIF` (igv-reports 1.16.0, ~83 MB, pulled from +the Galaxy depot) when `SLURM_JOB_ID` is in the environment — i.e. running +on a compute node where the NFS conda cold-start tax matters. On the login +node or when `$IGV_REPORTS_SIF` is unset, the driver uses PATH +`create_report`. Override with `--apptainer` / `--no-apptainer`; the +decision lands in the run log. + +Full recipe and rationale: `references/methylation_ont.md`. Worked +example with real data: `examples/methylation_ont/`. 
+ +## Exporting HTML and PNG side-by-side (`--also-png`) + +The HTML report is the deep-dive view; sometimes you also need static +PNGs you can email, drop in a Slack channel, or paste into slides. The +driver's `--also-png` flag invokes the sister `igver` tool against the +**same sites BED and same track list** that drove `create_report`, so +both artifacts cover identical regions with matching content. + +```bash +python scripts/build_igvreports.py \ + --samplesheet samplesheet.tsv \ + --genome hg38 \ + --output-dir results/run/reports/ \ + --jobs 6 \ + --also-png \ + --png-dpi 600 --png-display-mode collapse +``` + +Output layout per sample: + +``` +results/run/reports/ +├── <sample>.hg38.html # interactive +├── png_<sample>.hg38/ +│ ├── igver_regions.bed # flanked BED with UIDs (igver -r) +│ ├── igver_input.txt # track paths, one per line (igver -i) +│ ├── manifest.tsv # bridge: BED row ↔ PNG ↔ HTML row +│ └── png/ +│ ├── chr1-100-500.alpha.png # one PNG per region +│ └── chr2-0-700.beta.png +└── index.html +``` + +### How consistency is guaranteed — five levers + +1. **Single sites BED with `--flanking` baked in.** The driver writes + `igver_regions.bed` with `start − flanking` and `end + flanking` + already applied (clamped to 0 on the low side); igver sees the same + coordinates create_report's igv.js viewer slices to. +2. **Single resolved track list.** On the default (positional) path the + exact `[BAMs, VCF, extras, defaults]` list passed to `create_report` is + also written to `igver_input.txt`. On the `--track-config` path the + local-path `url:` entries from the JSON are extracted (http(s) URLs are + skipped — igver can't consume them). +3. **Matched display mode.** Default is `--png-display-mode collapse` to + line up with the HTML's `BAM_DEFAULTS displayMode: COLLAPSED`. Override + to `expand` for per-read SV inspection on both artifacts. +4. 
**UID-based filenames.** The BED's `name` column (auto-assigned + `region_<idx>` when missing) becomes both the HTML table label (via + `--info-columns name`) and the PNG filename suffix + (`<chr-start-end>.<uid>.png`). A user finds the same region in either + artifact by the same string. +5. **`manifest.tsv` audit trail.** Per-sample TSV with columns: + `bed_row_idx, uid, chrom, start_orig, end_orig, start_flanked, + end_flanked, region, png_path, html_path, html_table_row`. One row + per region in BED order. `verify_cohort.py` reads this to run three + PNG-side checks (count matches, exist + non-empty, html-row contiguity). + +### Resolution of the `igver` invocation + +Order, first match wins: +1. `--igver-cmd '...'` (split on whitespace — supports `apptainer exec ... igver`). +2. `$IGVER_CMD` env var (same shape). +3. `igver` on PATH. + +If none resolve, the build exits before invoking create_report so you +don't pay the HTML cost before finding out PNGs are unavailable. Install +with `pip install igver` or pull a pre-built SIF and point `--igver-cmd` +at it. + +### Methylation caveat (bigwig vs bedGraph) + +The HTML methylation path uses **bedGraph** tracks (igv.js consumes +those directly); igver's per-read methylation view uses **BAMs** with +`--color-by BASE_MODIFICATION`, and igver's cross-sample comparison view +uses **bigwig** tracks. Content can be made identical only if both +formats trace back to the same `modkit pileup` output (`modkit bedmethyl +tobigwig` of the same bedGraph). The driver's `--also-png` passes the +JSON's `url:` entries through verbatim, so if your YAML lists bedGraphs +they'll go to igver as-is — igver will render them but the result may +look different from the HTML's color-coded per-read view. For +publication-quality methylation PNGs, supply a parallel `tracks.json` +that lists bigwigs and run `igver` separately. 
+ +For SV/variant viewers this caveat doesn't apply — both render the +identical BAMs and the result is content-equivalent. + +### Cross-artifact verification + +The driver runs an **inline existence check** right after igver returns: +walks each expected PNG path (`<chr>-<start>-<end>.<uid>.<ext>` derived +from the manifest) and fails the build with an actionable message if +any are missing or zero-byte. This catches igver's documented +silent-exit-0 failure mode (egg-link install without the IGV Java +binary) — `proc.returncode != 0` alone misses it. + +In addition, `verify_cohort.py` then runs three checks per sample: + +| Check | Catches | +|---|---| +| `png_count_matches_bed` | partial igver run (SIGKILL mid-batch), stale manifest from a previous build, filename collisions | +| `pngs_exist_and_nonempty` | empty IGV screenshots (< 10 KB threshold; useful screenshots are typically ≥ 50 KB) | +| `png_html_row_alignment` | manifest rows referencing a different HTML, html_table_row not contiguous 1..N | + +`--png-min-size-kb 5.0` lowers the threshold if you have legitimate +no-data regions where igver produces a near-empty PNG. + +## See also + +- `references/best_practices.md` — full create_report flag reference, + format gotchas, performance notes. Read this if a run fails in a way + not listed in the Pitfalls table above. +- `references/databases_config_paths.md` — per-genome track availability + matrix and exact YAML keys. Read this when adding a new genome or + diagnosing a missing-track warning. +- `references/methylation_ont.md` — ONT 5mC/5hmC cheat-sheet (colorBy, + min:0/max:100, flanking=0, bedGraph vs bigwig, EPDnew lookup). +- `scripts/build_igvreports.py` — the driver. Reads `--samplesheet` or + `--bam/--vcf` direct-args, resolves tracks, validates the sites BED, + writes the HTMLs and the run log. Supports `--track-config <json>` + passthrough for fully-styled track sets. 
+- `scripts/generate_tracks_json.py` — YAML spec → tracks.json with + ONT-methylation defaults baked in (colorBy=basemod2, min:0/max:100, + group-paired Okabe-Ito colors). +- `scripts/verify_report.py` — post-render structural verifier; parses + the HTML's embedded tableJson + sessionDictionary, confirms region + count / coordinates / track names match the inputs. Emits a verify.tsv + and gates on `--fail-on-fail`. +- `scripts/verify_cohort.py` — cohort-level verifier; layered on top of + verify_report's per-sample checks, adds cross-sample contamination + scanning + index.html / sample-id consistency. Auto-invoked at the end + of `build_igvreports.py --samplesheet`; standalone-runnable too. +- `scripts/verify_anchors.py` — content verifier; samtools-counts the + embedded BAM slices and compares to anchors frozen from the source BAMs + at build time. Catches sample swaps that share basenames and silent + empty slices. Opt-in via `--anchors-mode generate|verify` on the build + driver; slow (~1 s/anchor). See SKILL.md content-verification section. +- `scripts/prep_track.sh` — gunzip → sort → bgzip → tabix utility. +- `igv-screenshots` skill — the **static PNG/PDF/SVG** counterpart based + on igver. Use it instead of this one when the deliverable is a + publication-quality figure rather than a clickable viewer. +- Upstream development: https://github.com/sahuno/igv-reports-skill + — file issues there for skill-level bugs; file issues at + https://github.com/igvteam/igv-reports for `create_report` rendering bugs. diff --git a/igv-reports/examples/portable/README.md b/igv-reports/examples/portable/README.md new file mode 100644 index 0000000..ef14a21 --- /dev/null +++ b/igv-reports/examples/portable/README.md @@ -0,0 +1,20 @@ +# examples/portable + +Reference invocations using only paths and tools you control (no lab +`databases_config.yaml`).
Each script accepts environment-variable +overrides for input paths, with `${HOME}/data/...` defaults you can edit +in-place or override at call time: + +```bash +FASTA=/path/to/hg38.fa TUMOR_BAM=/path/to/tumor.bam \ + bash examples/portable/single_sample.sh +``` + +| Script | What it does | +|---|---| +| `single_sample.sh` | Builds one HTML for a tumor/normal pair at two SNV sites | +| `cohort_samplesheet.sh` | Builds per-sample HTMLs + index.html from a 2-row samplesheet | + +For more advanced examples (cohort orchestration, ONT methylation viewer +presets), see the upstream development repo at +https://github.com/sahuno/igv-reports-skill. diff --git a/igv-reports/examples/portable/cohort_samplesheet.sh b/igv-reports/examples/portable/cohort_samplesheet.sh new file mode 100644 index 0000000..5530f05 --- /dev/null +++ b/igv-reports/examples/portable/cohort_samplesheet.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# examples/portable/cohort_samplesheet.sh — generic cohort build. +# +# Builds one HTML per row of a TSV samplesheet, plus an index.html linking +# them all. Demonstrates the samplesheet format and the most common flags. + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" +WORKDIR="${WORKDIR:-${PWD}/igv_reports_cohort_demo}" +mkdir -p "$WORKDIR" && cd "$WORKDIR" + +# --- inputs (edit these) --- +FASTA="${FASTA:-${HOME}/data/hg38/hg38.fa}" +GENCODE_GFF="${GENCODE_GFF:-${HOME}/data/hg38/gencode.v47.annotation.gff3.gz}" + +# --- samplesheet (one row per sample) --- +# Required columns: sample, sites_bed +# Optional columns: bam_tumor, bam_normal, vcf, extra_tracks (comma-separated) +cat > cohort.tsv <<EOF +sample bam_tumor bam_normal vcf sites_bed +p001 ${HOME}/data/p001/tumor.bam ${HOME}/data/p001/normal.bam sites.hg38.bed +p002 ${HOME}/data/p002/tumor.bam ${HOME}/data/p002/normal.bam sites.hg38.bed +EOF + +# --- shared sites for both patients --- +cat > sites.hg38.bed <<'EOF' +#chrom start end name +chr2 25246499 25246500 DNMT3A_R882 +chr7 148884000 148884001 EZH2_Y646 +EOF + +python "${REPO_ROOT}/scripts/build_igvreports.py" \ + --genome hg38 \ + --samplesheet cohort.tsv \ + --fasta "${FASTA}" \ + --no-default-tracks \ + --extra-track "${GENCODE_GFF}" \ + --output-dir reports \ + --no-apptainer + +echo "Done. Open ${WORKDIR}/reports/index.html in a browser." +echo "Cohort verifier ran automatically; see reports/cohort_verify.summary.md." diff --git a/igv-reports/examples/portable/single_sample.sh b/igv-reports/examples/portable/single_sample.sh new file mode 100644 index 0000000..7296b48 --- /dev/null +++ b/igv-reports/examples/portable/single_sample.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# examples/portable/single_sample.sh — generic single-sample build. +# +# Builds one HTML for a tumor/normal pair at a handful of SNV sites. +# Assumes: +# - `pip install igv-reports` has put `create_report` on PATH. +# - You have your own hg38 FASTA (with .fai sibling) and BAMs. +# - You have your own gencode + CpG-islands track files (or skip them +# with --no-default-tracks alone). +# +# Set these to match your environment before running. + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" +WORKDIR="${WORKDIR:-${PWD}/igv_reports_demo}" +mkdir -p "$WORKDIR" && cd "$WORKDIR" + +# --- inputs (edit these) --- +FASTA="${FASTA:-${HOME}/data/hg38/hg38.fa}" # must have ${FASTA}.fai +TUMOR_BAM="${TUMOR_BAM:-${HOME}/data/tumor.bam}" +NORMAL_BAM="${NORMAL_BAM:-${HOME}/data/normal.bam}" +GENCODE_GFF="${GENCODE_GFF:-${HOME}/data/hg38/gencode.v47.annotation.gff3.gz}" # bgzip+tabix +CPG_ISLANDS="${CPG_ISLANDS:-${HOME}/data/hg38/hg38_CpGIslands.bed}" + +# --- sites BED (4 cols: chrom, start, end, name) --- +cat > sites.hg38.bed <<'EOF' +#chrom start end name +chr2 25246499 25246500 DNMT3A_R882 +chr7 148884000 148884001 EZH2_Y646 +EOF + +python "${REPO_ROOT}/scripts/build_igvreports.py" \ + --genome hg38 \ + --sites sites.hg38.bed \ + --bam "${TUMOR_BAM}" "${NORMAL_BAM}" \ + --fasta "${FASTA}" \ + --no-default-tracks \ + --extra-track "${GENCODE_GFF}" \ + --extra-track "${CPG_ISLANDS}" \ + --info-columns name \ + --output report.hg38.html \ + --no-apptainer + +echo "Done. Open ${WORKDIR}/report.hg38.html in a browser." diff --git a/igv-reports/references/best_practices.md b/igv-reports/references/best_practices.md new file mode 100644 index 0000000..4fe5c64 --- /dev/null +++ b/igv-reports/references/best_practices.md @@ -0,0 +1,158 @@ +# igv-reports best practices + +Authoritative companion to the skill. Read this when something fails in a +way the SKILL.md pitfalls table doesn't cover, or when introducing a new +input format / track type. + +## Sites/regions input + +Supported by `create_report`: +- **VCF** — variant table is built from CHROM/POS/ID/REF/ALT plus any + `--info-columns` you surface from INFO and `--sample-columns` from + FORMAT. Use `--idlink "https://url/$$"` to make ID a clickable link. +- **BED** — fields parsed by position: `chr / start / end [/ name]`. 
+ A **non-comment header row** (e.g., `chrom start end name`) crashes + `create_report` with `ValueError: invalid literal for int()` because + the parser tries to `int()` the string `start`. A `#`-prefixed comment + header (e.g., `#chrom\tstart\tend\tname`) IS accepted — `create_report` + skips lines starting with `#`. This matches the lab's "BED-like outputs + must have a `#`-prefixed header" convention in CLAUDE.md. +- **MAF** — Mutation Annotation Format (TCGA standard). +- **BEDPE** — paired-end / fusion / SV format. With `--type fusion` each + row is rendered as a multi-locus split-screen view. +- **Generic TSV** — any tab-delimited file. Requires `--sequence`, + `--begin`, `--end` to name the chrom/start/end columns. Add + `--zero_based` if 0-based. + +**File-extension dispatch**: igv-reports picks the parser by extension, +not content. `.bed` → BED parser (which IGNORES `--sequence/--begin/--end`). +If you want a TSV-with-header parsed by name, the extension must NOT be +`.bed`/`.vcf`/`.gff3`/`.maf` — use `.tsv` or `.txt`. + +The project's `enforce-genome-tag.sh` hook requires a genome tag in the +filename: `sites.hg38.bed`, not `sites.bed`. + +## Tracks + +Supported track formats: BAM, CRAM, VCF, BED, GFF3, GTF, WIG, BEDGRAPH. + +**Indexing**: +- BAM/CRAM/VCF MUST be indexed (`.bai`/`.crai`/`.tbi` sidecar). +- Large `.bed.gz` / `.gff3.gz` / `.gtf.gz` SHOULD be tabix-indexed + (`.tbi` sidecar) and **must be true bgzip** — not plain gzip. +- Check format with `file <name>` — true bgzip says + `gzip compressed data, extra field, original size 0`. Plain gzip + has no "extra field". igv-reports trips on plain-gzip .gff3.gz with + cryptic `UnicodeDecodeError: byte 0x8b at position 1` — that 0x8b is + the gzip magic byte the parser is reading as text. + +**Sortedness**: gencode and many other GFF/GTF distributions interleave +records by feature type at the same locus (gene → transcript → exon → CDS → +exon → CDS → ...) 
rather than strictly position-sorted within each +chromosome. tabix requires pos-sorted within chr. Fix: +`sort -k1,1 -k4,4n` on the body, then bgzip + tabix. The `prep-track` +script in this skill does the full pipeline with backup. + +**Track render order**: the order you pass to `--tracks` is the order +they appear in the IGV.js viewer (top-to-bottom). Convention: +1. BAM/CRAM (the data you want to evaluate) +2. VCF (the calls being inspected) +3. Annotation tracks (genes, regulatory, repeats, CGI) + +The skill defaults always render annotation tracks LAST so they sit at +the bottom and don't push the read evidence off-screen. + +## Reference + +One of `--fasta`, `--twobit`, or `--genome` is required. + +- `--fasta /path/to/local.fa` (with `.fai`) — fully offline, supports + custom or combined references (e.g., host + viral). +- `--genome hg38` — uses igv.js bundled IDs, but **requires internet at + view AND render time** because igv.js fetches the bundled genome. + Avoid for HPC/offline. +- `--twobit` — alternative reference in 2bit format. + +For combined viral+host references, the single FASTA must include all +contigs, and any per-contig tracks must align (e.g., HTLV1_features.bed +must use the same contig name as in the FASTA). + +## Window sizing + +`--flanking N` (igv-reports default 1000, this skill default **300**) +adds N bp on either side of each site. + +| Use case | Recommended flanking | +|---|---| +| Point variants (SNV/indel) | 50–200 bp | +| SV / integration breakpoints | 300–1000 bp (this skill: 300) | +| Whole-gene context | gene length + 5–10 kb | + +`--maxlen N` (default 10,000) — variants exceeding this length switch to +split-screen multilocus view automatically. Useful for SVs > 10 kb. + +`--window N` — initial visible window inside the embedded igv.js viewer +(if not supplied, igv.js defaults to 41 bp, which is too narrow for +read-level inspection). Set to ~`2 × flanking` so the user lands on the +full embedded slice. 
+ +## Output + +- `--standalone` embeds all igv.js JS in the HTML → fully offline, + 4–11 MB per patient typical for cohort runs. +- `--no-embed` keeps external URLs → smaller HTML but online required. + Avoid for HPC/sharing-by-email. + +Per-region BAM data is ALWAYS sliced and embedded by default; only the +flanking-sized portion of large BAMs ships in the HTML — so the HTML stays +manageable even when input BAMs are 100+ GB. + +## Variant table customization + +For VCF input: +- `--info-columns SVTYPE SVLEN ALIGNED_POS DR DV VAF` surfaces those + INFO fields as table columns. +- `--info-columns-prefixes ANN_ HTLV1_` includes any INFO field starting + with the listed prefixes. +- `--sample-columns DP AD GT` (with optional `--samples NAME`) surfaces + per-sample FORMAT fields. +- `--idlink "https://example.com/$$"` makes the VCF ID column clickable + with `$$` replaced by the ID value. + +Order of operations: include `--info-columns` for the call-quality fields +your reviewer needs to see at a glance; the rest is one click into the +variant detail. + +## Performance / size control + +- `--subsample 0.0-1.0` — keep a fraction of BAM alignments per region. + Use for very deep BAMs (>100×) where the rendered viewer would be + read-cluttered. +- `--exclude-flags 1536` (default) — excludes duplicates and QC-fail + reads. Set to 0 to keep everything. +- Render time scales roughly linearly with `n_regions × n_tracks`. The + ATLL cohort run (6 patients × 1–3 integrations + HTLV1 + EBV regions, + 6 tracks) took ~2 min/patient with the gene-level GFF and ~3 min/patient + with the full annotation. 
+ +## Pitfalls observed in production + +| Symptom | Root cause | Fix | +|---|---|---| +| `ValueError: invalid literal for int() with base 10: 'start'` | Non-comment header row in BED sites file | Prefix the header with `#` (skipped by create_report and matches lab convention); or strip it entirely | +| `UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b` | igv-reports reading gzip-compressed bytes as text (file is plain gzip, not bgzip, despite the `.gz` ext) | Convert with prep-track; verify with `file <name>` | +| `tabix: not BGZF` | Plain gzip masquerading as bgzip (`.gz` ext) | `gunzip → bgzip → tabix` | +| `tabix: out of order` | GFF/GTF/BED records not pos-sorted within chr | `sort -k1,1 -k4,4n` first | +| Empty annotation track in viewer | Tabix lookup returns nothing in window. Often correct biology (e.g., CGI-distal site) | Verify with `tabix file region` | +| Title shows weird characters | Unicode em-dash (`—`) in `--title` got mangled by shell escaping | Use plain ASCII `-` | +| HTML loads but viewer is blank | `--genome hg38` without internet at view time | Use `--fasta` + `--standalone` | +| `tabix` index missing for a track | igv-reports looked for `<track>.tbi`, not present | Re-run `tabix -p <gff\|gtf\|bed>` | +| `samtools index` errors mid-render | BAM index stale (BAM modified after `.bai`) | `samtools index -@ 4 file.bam` | +| Output HTML size much larger than expected | Some region accidentally spans Mb-scale (e.g., a row with start=0 end=chrom_length); flanking compounds this | Validate the sites BED — `awk '$3-$2 > 1e6'` to find offenders | + +## See also + +- Official docs: https://github.com/igvteam/igv-reports +- igv.js track config schema: https://github.com/igvteam/igv.js/wiki/Tracks-2.0 +- This skill's `references/databases_config_paths.md` for which YAML keys + hold which tracks per genome.
diff --git a/igv-reports/references/databases_config_paths.md b/igv-reports/references/databases_config_paths.md new file mode 100644 index 0000000..466d13f --- /dev/null +++ b/igv-reports/references/databases_config_paths.md @@ -0,0 +1,70 @@ +# Databases-config YAML schema (for `--db-config` / `$IGV_REPORTS_DB_CONFIG`) + +Optional. Without a databases YAML the driver still works — pass `--fasta` and +`--no-default-tracks` (plus any `--extra-track` you need) on every call. + +The YAML is convenient when running across many regions/cohorts on the same +genome build: one file maps a short `--genome <id>` flag to the FASTA + the +default annotation tracks (CpG islands, gencode, RepeatMasker), so each +invocation stays short. + +## Schema + +```yaml +reference_genomes: + local: + <genome_id>: + fasta: <path> # required + gtf: <path> # gencode .gtf.gz or .gff3.gz (bgzip + tabix preferred) + sizes: <path> # chrom.sizes (optional) + CpGIslands: <path> # .bed (uncompressed or bgzip) + repMaskerBed: <path> # .bed.gz (bgzip + tabix) +``` + +`<genome_id>` is the value you pass to `--genome`. Suggested IDs and aliases: + +| `--genome` value | YAML key | Common alias | +|----------------------|---------------------|--------------| +| `hg38` | `hg38` | GRCh38 | +| `mm10` | `mm10` | GRCm38 | +| `mm39` | `mm39` | GRCm39 | +| `t2t` / `chm13` | `t2t_CHM13v2_plusY` | T2T-CHM13v2 | +| `grch37` / `hg19` | `GRCh37` | hg19 | + +The driver normalizes the input alias to the canonical YAML key. Extend +`GENOME_ALIASES` in `scripts/build_igvreports.py` if you need additional builds. + +## Default-track resolution + +For the `--genome` you pass, the driver tries to load three default tracks: + +1. **CpG islands** → `CpGIslands` key +2. **Gene annotation** → `gtf` key (prefers a sibling `*.gff3.gz` if present) +3. **RepeatMasker** → `repMaskerBed` key + +Any track absent from the YAML for that genome is logged as a warning and +skipped — the report still builds, just without that track. 
+ +## Gencode preference: GFF3 over GTF + +If `gtf` points at `gencode.<version>.annotation.gtf.gz` and a sibling +`gencode.<version>.annotation.gff3.gz` exists in the same directory, the +driver prefers the GFF3 — it carries the full transcript / exon / CDS / UTR +detail that's most useful for read-level inspection at SV / fusion / integration +junctions. The GTF (gene-level) loads as a fallback. + +Override with `--gencode-from-yaml` to force the YAML's `gtf` path regardless. + +## EPDnew (methylation-specific) + +`EPDnewCoding` / `EPDnewNonCoding` keys (BED.gz, bgzip + tabix) are +**not** auto-loaded — methylation-specific. Reference them explicitly via a +`--track-config tracks.json` entry when building a methylation viewer (see +`references/methylation_ont.md`). + +## Missing tracks → workflow + +1. Build or locate the BED / GFF3 / GTF. +2. If it needs bgzip + tabix conversion, run `scripts/prep_track.sh <path>`. +3. Add the path to your `databases_config.yaml` under the appropriate key, or + pass it via `--extra-track <path>` for a one-off run. diff --git a/igv-reports/references/methylation_ont.md b/igv-reports/references/methylation_ont.md new file mode 100644 index 0000000..8764546 --- /dev/null +++ b/igv-reports/references/methylation_ont.md @@ -0,0 +1,243 @@ +--- +name: methylation_ont +genome: hg38 | mm10 | mm39 | t2t +assay: ONT 5mC + 5hmC (CpG) +worked_example: ../examples/methylation_ont/ +--- + +# ONT methylation viewer — cheat-sheet + +Targeted reference for building an igv-reports HTML that shows per-read +5mC/5hmC base-modification calls (BAM, basemod2 coloring) plus per-sample +methylation-fraction bedGraph tracks at fixed promoter / gene / DMR windows. + +When this skill needs to build a methylation viewer, the **default path +(positional `--tracks`) is wrong** — methylation viewers need named, +colored, y-axis-locked tracks. The right path is: + +```bash +build_igvreports.py --track-config tracks.json ... 
+``` + +with `tracks.json` either generated from a YAML spec (see worked example) +or hand-written from `tracks.template.json`. + +## The four-thing checklist + +### 1. BAM tracks need `colorBy: "basemod2"` + +```json +{ + "name": "<sample>", + "url": "<bam>", + "indexURL": "<bam>.bai", + "format": "bam", + "type": "alignment", + "colorBy": "basemod2", + "showSoftClips": false, + "displayMode": "COLLAPSED" +} +``` + +Without `colorBy: "basemod2"`, the BAM renders as plain alignments +without the per-base 5mC/5hmC colors that are the whole point of the +view. `displayMode: "COLLAPSED"` keeps the BAM panel short so the +bedGraph summary tracks below stay visible. + +### 2. bedGraph tracks need fixed `min: 0, max: 100` + +```json +{ + "name": "<sample> 5mC", + "url": "<bedgraph>", + "format": "bedgraph", + "type": "wig", + "color": "rgb(0,68,136)", + "min": 0, "max": 100 +} +``` + +modkit's bedmethyl output is **percent (0..100)**, not fraction (0..1) — +the y-axis ceiling must be 100. IGV's per-track autoscale defaults +differ per track and hide real cross-sample differences (one sample +might autoscale to 0..82, the next to 0..100; same bar height means +different methylation). Lock all samples' bedGraph tracks to the same +0..100 range. See `rules/igv.md` for the original incident. + +**Use bedGraph, not bigwig.** igv-reports' Python slicer (`utils.getreader`) +dispatches on file extension and has no `.bw` reader — runs fail with +`Exception: Unknown file format`. Pre-slice bigwigs over the report +regions with `bigWigToBedGraph -chrom=<chr> -start=<start> -end=<end> <bw> <bg>`, one +output per region, then `cat >>` them into a single bedGraph (UCSC +`bigWigToBedGraph` opens `/dev/stdout` with `O_TRUNC` between calls — +piping multiple invocations loses everything but the last region). + +### 3. `--flanking 0` when sites encode the desired window + +For methylation viewers the sites BED almost always carries the desired +window directly (a promoter span, a DMR, a gene body).
Adding 300 bp of +flanking adds nothing and shifts the initial viewer frame. Pass +`--flanking 0` and let the BED row coordinates be the frame. + +The 300 bp default is right for the SV/integration breakpoint workflow +this skill was extracted from — there the BED row is a one-base +breakpoint and you need flanking to see read support. + +### 4. Sites BED with `#chrom\tstart\tend\tname` comment header is fine + +The skill's older docs say "headerless" because non-`#` header rows +crash `create_report` with `ValueError: invalid literal for int()`. +A line starting with `#` is treated as a comment and is fine — and +matches CLAUDE.md's "BED-like outputs must have a `#`-prefixed +header" rule. Use: + +``` +#chrom start end name +chr2 25246000 25259000 DNMT3A_2_promoter +``` + +Pair this with `--info-columns name` so the `name` column shows up in +the report's variant table. + +## Track ordering + +Render order is top-to-bottom in the viewer; put annotation FIRST so +gene tracks anchor the user's eye at the top, then per-sample BAM + 5mC ++ 5hmC triplets stacked below in sample-group order. The worked example +follows: gencode → EPDnew → CpGIslands → RepeatMasker → (per-sample: +BAM, 5mC, 5hmC). + +## Colors (Okabe-Ito, group-paired) + +For two-group studies (e.g., normal vs tumor) pick two color pairs out +of the Okabe-Ito palette so groups are pre-attentively distinguishable: + +| Group | 5mC color | 5hmC color | +|--------|---------------------|----------------------| +| Group A (normal) | `rgb(0,68,136)` blue | `rgb(204,121,167)` reddish-purple | +| Group B (tumor) | `rgb(213,94,0)` vermillion | `rgb(230,159,0)` orange | + +Annotation track colors (also Okabe-Ito): EPDnew = vermillion +`rgb(213,94,0)`, CpG islands = bluish-green `rgb(0,158,115)`, +RepeatMasker = sky-blue `rgb(86,180,233)`. + +`scripts/generate_tracks_json.py` reads these from a `group_colors:` +map in the YAML spec, so a new group only needs one entry. 
+ +## EPDnew promoter track (hg38) + +If your `databases_config.yaml` carries EPDnew for hg38, the suggested +keys are: + +```yaml +reference_genomes: + local: + hg38: + EPDnewCoding: <path-to>/Hs_EPDnew.hg38.bed.gz + EPDnewNonCoding: <path-to>/HsNC_EPDnew.hg38.bed.gz +``` + +Source: <https://epd.expasy.org/epd/human/human_database.php?db=human> + +The skill driver doesn't load these by default — they're a methylation- +specific track. Either reference them directly from `tracks.json` or add +an `EPDnew` entry to a custom `annotation:` section in your YAML spec. +mm10 / mm39 / t2t builds don't ship with EPDnew. + +## Reference-fasta vs `--genome hg38` + +Always pass `--fasta` (skill driver default), never `--genome hg38`. +The igv.js bundled genome IDs require internet at view + render time; +`--fasta` + `--standalone` produces a fully-offline HTML. See +`references/best_practices.md` Reference section. + +## When to use the apptainer SIF (mostly automatic) + +The driver auto-detects whether to run via an apptainer SIF or PATH +`create_report` based on `SLURM_JOB_ID` and `$IGV_REPORTS_SIF`: + +| Environment | Default | Why | +|---|---|---| +| Local / login node (`SLURM_JOB_ID` unset) | PATH `create_report` | No cold-start tax; simplest path. | +| Compute node under SLURM (`SLURM_JOB_ID` set) AND `$IGV_REPORTS_SIF` points at an existing SIF | apptainer + SIF | Fresh node = cold NFS cache = 1-2 M page faults on conda init (~2.5 us each). The SIF reads once into RAM, then stays warm. | +| Compute node BUT no SIF set/found | falls back to PATH `create_report` (logged) | Safe default; no surprise SIF-not-found error. | + +Override either way with `--apptainer` / `--no-apptainer`. The decision +(auto vs. explicit) is logged at run start so post-mortems are unambiguous. 
+ +To set up the SIF once, pull from the Galaxy depot: + +```bash +export IGV_REPORTS_SIF=/path/to/igv-reports_1.16.0.sif +wget -O "$IGV_REPORTS_SIF" \ + 'https://depot.galaxyproject.org/singularity/igv-reports:1.16.0--pyh7cba7a3_0' +``` + +**Mandatory `--cleanenv` for the SIF (driver handles it).** Host RHEL 8 +exports `SSL_CERT_FILE=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem` +which doesn't exist inside the Galaxy-depot SIF. `create_report`'s +standalone build path makes an HTTPS GET (likely for the IGV.js +ideogram CDN) that crashes with `[SSL: CERTIFICATE_VERIFY_FAILED]` +mid-render. The driver always invokes `singularity exec --cleanenv ...` +to scrub host env vars before they enter the SIF, so users don't need +to remember the flag. If you call create_report from the SIF directly +(bypassing the driver), include `--cleanenv` yourself. See +`rules/apptainer_env_leak.md` for the full pattern. + +## Worked example + +`../examples/methylation_ont/` is the canonical end-to-end run: +- 4 COLO829 ONT samples (2 normal-blood × 2 tumor) +- 2 promoter windows (DNMT3A_2 + EZH2) +- 5mC + 5hmC bedGraph per sample (8 bedGraph files, pre-sliced) +- gencode + EPDnew + CGI + rmsk annotation slices + +Run `bash examples/methylation_ont/build.sh` to regenerate the HTML; +read `examples/methylation_ont/recipe.md` for the slot-by-slot guide +to adapting it. + +## Post-render verification + +After building the HTML, run `scripts/verify_report.py` to confirm the +embedded content matches your inputs (region count, coordinates, track +names). For methylation viewers this catches the worst silent failure +mode — a render that succeeded for the wrong samples — which the input- +side validation alone can't catch. 
+ +```bash +python scripts/verify_report.py \ + --html methylation_report.hg38.html \ + --sites sites.hg38.bed \ + --track-config tracks.json \ + --min-size-mb 1.0 \ + --out methylation_report.verify.tsv \ + --fail-on-fail +``` + +For `--track-config` builds the check uses the JSON's `name` fields; in +the YAML spec consumed by `generate_tracks_json.py`, those names are the +`name:` keys in `annotation:` and the auto-generated `<sample>`, +`<sample> 5mC`, `<sample> 5hmC` labels per sample. Picking specific +sample names in the YAML therefore drives the verifier's coverage — +generic names like "sample1" weaken the check. + +**For cohort methylation runs** (multi-patient × per-sample HTMLs + +`index.html`), the cohort verifier (`scripts/verify_cohort.py`) is the +more relevant tool: it additionally catches sample-swap bugs (sample-2's +BAMs accidentally ending up in sample-1's HTML), missing samples, and +`index.html` drift. The methylation workflow is especially vulnerable to +sample-swap typos because each patient has multiple ONT runs with similar- +looking flowcell IDs (e.g., `PAU59807` vs `PAU61427`). Auto-invoked by +`build_igvreports.py --samplesheet`; see SKILL.md "Cohort-level +verification" for details. + +## Cross-references + +- `rules/igv.md` — bigwig-can't-be-sliced, y-axis-autoscale, UCSC + `/dev/stdout` truncation; the rules that motivate this cheat-sheet. +- `rules/apptainer_vs_conda.md` — when the `--apptainer` flag pays off. +- `references/best_practices.md` — generic create_report flag reference; + sites BED, tracks, reference, performance, pitfalls table. +- `examples/methylation_ont/recipe.md` — full slot-by-slot example doc. +- `CLAUDE.md` §3A — upstream ONT methylation pipeline (pod5 → dorado → + modkit pileup → bedGraph + bigwig). 
diff --git a/igv-reports/scripts/build_igvreports.py b/igv-reports/scripts/build_igvreports.py new file mode 100755 index 0000000..d3f5ff7 --- /dev/null +++ b/igv-reports/scripts/build_igvreports.py @@ -0,0 +1,1234 @@ +#!/usr/bin/env python3 +"""build_igvreports.py — generic driver for the igv-reports skill. + +Author: Samuel Ahuno +Purpose: + Build self-contained HTML genomic-region reports with create_report + (igv-reports). Two run modes: + + 1. Single — direct CLI: --sites BED + --bam BAM(s) [+--vcf VCF] + → one HTML at --output. + + 2. Cohort — TSV samplesheet: one HTML per row + an index.html. + Samplesheet columns (tab-separated, with header): + sample bam_tumor bam_normal vcf sites_bed + Optional sixth column: extra_tracks (comma-separated paths). + + Either way, the driver: + - Resolves CpG islands, gencode, and RepeatMasker paths from + databases_config.yaml for the chosen genome (skipping any not + configured for that genome, with a warning). + - Validates that the sites BED is well-formed (`#`-prefixed header lines are allowed; un-prefixed header rows are rejected). + - Calls create_report with --flanking 300 --standalone by default. + - Writes a logs/ entry capturing the resolved track list, the full + command, the flanking value, and per-region embedded data sizes. + +Usage: + python build_igvreports.py --sites SITES.hg38.bed \\ + --bam tumor.bam normal.bam --vcf calls.vcf \\ + --genome hg38 --fasta /path/to/hg38.fa \\ + --no-default-tracks --output report.hg38.html + + python build_igvreports.py --samplesheet sheet.tsv \\ + --genome hg38 --fasta /path/to/hg38.fa \\ + --no-default-tracks --output-dir results/cohort/ + +Defaults can be shipped via a YAML at IGV_REPORTS_DB_CONFIG (see the +references/databases_config_paths.md schema), in which case --fasta / +--no-default-tracks are not needed. 
+""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import shutil +import subprocess +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime +from pathlib import Path + +try: + import yaml # PyYAML +except ImportError: + print("ERROR: PyYAML not available. Install with: pip install pyyaml", file=sys.stderr) + sys.exit(2) + +# Optional: point IGV_REPORTS_DB_CONFIG at a YAML mapping genome IDs to +# FASTA + CGI + gencode + RepeatMasker paths so --genome resolves tracks +# automatically. Without it, pass --fasta and --no-default-tracks explicitly. +_DB_CONFIG_ENV = os.environ.get("IGV_REPORTS_DB_CONFIG") +DEFAULT_DBCONFIG = Path(_DB_CONFIG_ENV) if _DB_CONFIG_ENV else None +DEFAULT_FLANKING = 300 +# Optional: point IGV_REPORTS_SIF at an apptainer SIF for offline / HPC runs. +# Galaxy depot: https://depot.galaxyproject.org/singularity/igv-reports:1.16.0--pyh7cba7a3_0 +_SIF_ENV = os.environ.get("IGV_REPORTS_SIF") +IGVREPORTS_SIF = Path(_SIF_ENV) if _SIF_ENV else None + + +def apptainer_bind_args() -> list[str]: + """Build `--bind <path>` tokens for singularity, skipping paths that don't + exist. Source: `$IGV_REPORTS_BIND` (colon-separated). 
Empty / unset = no + binds (singularity tolerates this; you only need binds when your data + lives outside the container's default-visible filesystem).""" + raw = os.environ.get("IGV_REPORTS_BIND", "") + if not raw: + return [] + tokens: list[str] = [] + for p in raw.split(":"): + if p and Path(p).exists(): + tokens.extend(["--bind", p]) + return tokens + +GENOME_ALIASES = { + "hg38": "hg38", + "GRCh38": "hg38", + "mm10": "mm10", + "GRCm38": "mm10", + "mm39": "mm39", + "GRCm39": "mm39", + "t2t": "t2t_CHM13v2_plusY", + "chm13": "t2t_CHM13v2_plusY", + "T2T": "t2t_CHM13v2_plusY", + "T2T-CHM13": "t2t_CHM13v2_plusY", + "t2t_CHM13v2_plusY": "t2t_CHM13v2_plusY", + "GRCh37": "GRCh37", + "hg19": "GRCh37", +} + + +def setup_logger(log_path: Path) -> logging.Logger: + """Dual-handler logger: file + stderr, with timestamp prefix.""" + log_path.parent.mkdir(parents=True, exist_ok=True) + fmt = logging.Formatter( + "[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S" + ) + log = logging.getLogger("igv_reports") + log.setLevel(logging.INFO) + log.handlers.clear() + fh = logging.FileHandler(log_path) + fh.setFormatter(fmt) + log.addHandler(fh) + sh = logging.StreamHandler(sys.stderr) + sh.setFormatter(fmt) + log.addHandler(sh) + return log + + +def resolve_genome(genome: str) -> str: + canon = GENOME_ALIASES.get(genome) + if not canon: + raise SystemExit( + f"ERROR: unknown genome '{genome}'. Supported: {sorted(set(GENOME_ALIASES.values()))}" + ) + return canon + + +def load_db_config(path: Path) -> dict: + """Load the databases YAML. Returns {} (with a warning to stderr) if the + file is missing — callers must handle empty cfg gracefully. 
def _gencode_gff3_candidates(gtf: str) -> list[Path]:
    """Return candidate GFF3 siblings of a Gencode GTF, most specific first.

    The first candidate swaps the ``.gtf`` / ``.gtf.gz`` suffix of ``gtf`` for
    ``.gff3.gz`` so the annotation release always matches the configured GTF
    (``gencode.v44.annotation.gtf.gz`` -> ``gencode.v44.annotation.gff3.gz``).
    The historical hard-coded ``gencode.v47.annotation.gff3.gz`` name is kept
    as a last-resort fallback so directory layouts that relied on it still
    resolve exactly as before.
    """
    gtf_path = Path(gtf)
    candidates: list[Path] = []
    for suffix in (".gtf.gz", ".gtf"):
        if gtf_path.name.endswith(suffix):
            stem = gtf_path.name[: -len(suffix)]
            candidates.append(gtf_path.with_name(stem + ".gff3.gz"))
            break
    legacy = gtf_path.parent / "gencode.v47.annotation.gff3.gz"
    if legacy not in candidates:
        candidates.append(legacy)
    return candidates


def resolve_default_tracks(cfg: dict, genome: str, log: logging.Logger) -> list[str]:
    """Return the default annotation tracks that exist on disk for ``genome``.

    The returned order is CpG islands -> Gencode -> RepeatMasker. Tracks are
    looked up under ``reference_genomes.local.<genome>`` in ``cfg`` (keys
    ``CpGIslands``, ``gtf``, ``repMaskerBed``); any track that is not
    configured or not present on disk is skipped with a warning.

    For hg38, a bgzip+tabix GFF3 sibling of the configured GTF (same
    directory, ``.gff3.gz`` plus ``.gff3.gz.tbi``) is preferred over the GTF
    itself; see ``_gencode_gff3_candidates`` for the lookup order.

    Args:
        cfg: parsed databases YAML; may be empty or contain ``null`` sections.
        genome: canonical genome key used in the YAML (e.g. ``"hg38"``).
        log: logger that receives the skip warnings.

    Returns:
        Paths (as strings) of the tracks that were found, possibly empty.
    """
    # `or {}` at every level: a YAML key with an empty value parses to None,
    # which would otherwise raise AttributeError on the next .get().
    local = ((cfg or {}).get("reference_genomes") or {}).get("local") or {}
    g = local.get(genome) or {}
    if not g:
        log.warning(
            f"no entry for genome '{genome}' in db-config — skipping default tracks. "
            "Pass --extra-track or --track-config for annotation tracks."
        )
        return []

    tracks: list[str] = []

    # CpG islands.
    cgi = g.get("CpGIslands")
    if cgi and Path(cgi).exists():
        tracks.append(cgi)
    else:
        log.warning(f"CpG islands track missing for {genome} (key=CpGIslands, value={cgi})")

    # Gencode. For hg38 prefer the bgzip+tabix .gff3.gz sibling if present.
    gtf = g.get("gtf")
    gencode_track: str | None = None
    if genome == "hg38" and gtf:
        for sibling in _gencode_gff3_candidates(gtf):
            tabix_index = sibling.with_name(sibling.name + ".tbi")
            if sibling.exists() and tabix_index.exists():
                gencode_track = str(sibling)
                log.info(f" hg38: using full gencode annotation: {sibling}")
                break
    if gencode_track is None and gtf and Path(gtf).exists():
        gencode_track = gtf
    if gencode_track:
        tracks.append(gencode_track)
    else:
        log.warning(f"Gencode track missing for {genome}")

    # RepeatMasker.
    rmsk = g.get("repMaskerBed")
    if rmsk and Path(rmsk).exists():
        tracks.append(rmsk)
    else:
        # Report the configured value too: the path may be set but absent on disk.
        log.warning(f"RepeatMasker track missing for {genome} (key=repMaskerBed, value={rmsk})")

    return tracks


def fasta_for(cfg: dict, genome: str) -> str:
    """Resolve the reference FASTA path for ``genome`` from the db-config.

    Users without a YAML can bypass this by passing --fasta PATH on the driver.

    Returns:
        The configured FASTA path.

    Raises:
        SystemExit: if the config has no usable entry for ``genome``, the
            FASTA file does not exist, or its ``.fai`` index is missing.
    """
    try:
        fasta = cfg["reference_genomes"]["local"][genome].get("fasta")
    except (KeyError, TypeError, AttributeError):
        # AttributeError: the genome key exists but its value is null / not a mapping.
        raise SystemExit(
            f"ERROR: db-config has no '{genome}' entry to resolve FASTA from.\n"
            " Pass --fasta PATH explicitly, or set $IGV_REPORTS_DB_CONFIG\n"
            " to a YAML that defines reference_genomes.local.<genome>.fasta."
        ) from None
    if not fasta or not Path(fasta).exists():
        raise SystemExit(f"ERROR: FASTA missing for {genome}: {fasta}")
    if not Path(fasta + ".fai").exists():
        raise SystemExit(
            f"ERROR: FASTA index missing for {fasta} — run `samtools faidx {fasta}`"
        )
    return fasta
def validate_bams(bams: list[Path]) -> None:
    """Sanity-check that every BAM exists and has a sibling index (.bai or .csi).

    `create_report` needs a random-access index to slice BAMs at each region;
    a BAM with no sibling index produces an obscure pysam error several
    layers in. Catch it up front with an actionable message.

    Raises:
        SystemExit: if a BAM is missing or none of the accepted index names
            (`x.bam.bai`, `x.bam.csi`, `x.bai`, `x.csi`) exists next to it.
    """
    for bam in bams:
        if not bam.exists():
            raise SystemExit(f"ERROR: BAM not found: {bam}")
        index_candidates = (
            bam.with_suffix(bam.suffix + ".bai"),  # sample.bam.bai
            bam.with_suffix(bam.suffix + ".csi"),  # sample.bam.csi
            bam.with_suffix(".bai"),               # sample.bai
            bam.with_suffix(".csi"),               # sample.csi
        )
        if not any(idx.exists() for idx in index_candidates):
            raise SystemExit(
                f"ERROR: BAM index missing for {bam} — create_report cannot slice it.\n"
                f" Fix: samtools index {bam}\n"
                f" (or `samtools index -c {bam}` for a .csi index on contigs >512 Mb)"
            )


def validate_sites_bed(bed: Path) -> None:
    """Sanity-check the sites BED before invoking create_report.

    Blank lines and lines starting with `#` or `track ` are skipped (so a
    `#chrom\\tstart\\tend\\tname` header is fine). Every other line must have
    at least three tab-separated columns with integer start/end and
    start < end; a non-comment header row like `chrom\\tstart\\tend` is
    reported as an error here instead of surfacing later as
    `ValueError: invalid literal for int()`.

    NOTE(review): `_read_sites_bed_rows` additionally skips `browser ` lines;
    confirm which rule create_report itself applies before aligning the two.

    Raises:
        SystemExit: if the file is missing or any data row is malformed.
    """
    if not bed.exists():
        raise SystemExit(f"ERROR: sites BED not found: {bed}")
    with bed.open() as fh:
        for i, line in enumerate(fh, start=1):
            line = line.rstrip("\n")
            if not line or line.startswith("#") or line.startswith("track "):
                continue
            cols = line.split("\t")
            if len(cols) < 3:
                raise SystemExit(f"ERROR: {bed}:{i}: BED needs >=3 tab-separated columns; got {cols!r}")
            try:
                start = int(cols[1])
                end = int(cols[2])
            except ValueError:
                raise SystemExit(
                    f"ERROR: {bed}:{i}: non-numeric start/end — likely a header row.\n"
                    " igv-reports' BED parser is positional and chokes on non-comment\n"
                    " headers. Prefix the header with `#` (skipped by create_report\n"
                    " and matches the lab's BED-output convention) or strip it."
                ) from None
            if start >= end:
                raise SystemExit(f"ERROR: {bed}:{i}: start ({start}) >= end ({end})")


# Single source of truth for the Galaxy-depot SIF so the install hints below
# cannot drift apart (one message previously carried a different build hash).
_GALAXY_DEPOT_SIF_URL = (
    "https://depot.galaxyproject.org/singularity/igv-reports:1.16.0--pyh7cba7a3_0"
)


def find_create_report() -> str:
    """Resolve `create_report` on PATH (provided by `pip install igv-reports`
    or any conda env that activated it).

    Raises:
        SystemExit: with install instructions when the executable is not found.
    """
    cr = shutil.which("create_report")
    if cr:
        return cr
    raise SystemExit(
        "ERROR: create_report not on PATH.\n"
        " Install: pip install -U 'igv-reports>=1.16.0'\n"
        " Offline / air-gapped: point IGV_REPORTS_SIF at an igv-reports SIF\n"
        " and rerun with --apptainer (Galaxy depot:\n"
        f" {_GALAXY_DEPOT_SIF_URL})"
    )


def apptainer_create_report_prefix(sif: Path | None) -> list[str]:
    """Return the `singularity exec --cleanenv [--bind <path> ...] <sif>
    create_report` argv prefix used when the driver runs via a container.

    --cleanenv scrubs host env vars so they don't leak into the SIF.
    Specifically: host SSL_CERT_FILE / SSL_CERT_DIR on RHEL 8 point at paths
    that don't exist inside Galaxy-depot SIFs, and create_report's standalone-
    HTML build path performs an HTTPS GET that aborts with
    `[SSL: CERTIFICATE_VERIFY_FAILED]`. See rules/apptainer_env_leak.md.

    Binds come from `apptainer_bind_args()` (conditional on path existence).

    Args:
        sif: path to the igv-reports SIF, or None when $IGV_REPORTS_SIF is unset.

    Raises:
        SystemExit: if `sif` is None or does not exist; the message includes
            the commands to pull the image.
    """
    if sif is None:
        raise SystemExit(
            "ERROR: --apptainer requested but $IGV_REPORTS_SIF is not set.\n"
            " Set IGV_REPORTS_SIF to a SIF path and rerun, e.g.:\n"
            " export IGV_REPORTS_SIF=/path/to/igv-reports_1.16.0.sif\n"
            " Pull the SIF first if needed:\n"
            " wget -O \"$IGV_REPORTS_SIF\" \\\n"
            f" '{_GALAXY_DEPOT_SIF_URL}'"
        )
    if not sif.exists():
        raise SystemExit(
            f"ERROR: apptainer SIF not found: {sif}\n"
            " Pull with one of:\n"
            f" apptainer pull {sif} \\\n"
            " docker://igv-org/igv-reports:1.16.0\n"
            f" wget -O {sif} \\\n"
            f" '{_GALAXY_DEPOT_SIF_URL}'\n"
            " Or set $IGV_REPORTS_SIF to a SIF you already have."
        )
    return ["singularity", "exec", "--cleanenv", *apptainer_bind_args(), str(sif), "create_report"]
The 4th column (`name`) becomes the UID + when present; otherwise an auto-generated `region_<idx>` is used.""" + rows: list[dict] = [] + with sites.open() as fh: + for line in fh: + line = line.rstrip("\n") + if not line or line.startswith("#") or line.startswith("track ") or line.startswith("browser "): + continue + cols = line.split("\t") + if len(cols) < 3: + continue + chrom, start_s, end_s = cols[0], cols[1], cols[2] + name = cols[3].strip() if len(cols) >= 4 and cols[3].strip() else "" + rows.append({ + "chrom": chrom, + "start": int(start_s), + "end": int(end_s), + "name": name, + }) + for idx, r in enumerate(rows, start=1): + if not r["name"]: + r["name"] = f"region_{idx:03d}" + r["bed_row_idx"] = idx + return rows + + +def _write_igver_regions_bed(rows: list[dict], flanking: int, out: Path) -> None: + """Emit a BED with `--flanking` baked into start/end and UID in col 4. + Filename collisions in igver's `chr-start-end.<uid>.png` are avoided by + the auto-assigned UIDs in `_read_sites_bed_rows`.""" + with out.open("w") as fh: + for r in rows: + start = max(0, r["start"] - flanking) + end = r["end"] + flanking + fh.write(f"{r['chrom']}\t{start}\t{end}\t{r['name']}\n") + + +def _write_igver_input_list(tracks: list[str], out: Path) -> None: + """One path per line — igver's `-i FOO.txt` consumes this verbatim.""" + with out.open("w") as fh: + for t in tracks: + fh.write(f"{t}\n") + + +def _resolve_igver_cmd(override: str | None) -> list[str]: + """Return the argv prefix used to invoke igver. Resolution order: + 1. Explicit override (split on whitespace — supports `apptainer exec ... igver`). + 2. $IGVER_CMD env var (same shape as the override). + 3. 
`igver` on PATH.""" + if override: + return override.split() + env_cmd = os.environ.get("IGVER_CMD") + if env_cmd: + return env_cmd.split() + on_path = shutil.which("igver") + if on_path: + return [on_path] + raise SystemExit( + "ERROR: igver not found.\n" + " Install: pip install igver\n" + " Override: --igver-cmd 'apptainer exec /path/to/igver.sif igver'\n" + " Or set $IGVER_CMD" + ) + + +def build_pngs_with_igver( + sites: Path, + tracks: list[str], + genome: str, + flanking: int, + out_dir: Path, + log: logging.Logger, + html_path: Path, + igver_cmd: str | None = None, + dpi: int = 300, + display_mode: str = "collapse", + panel_height: int | None = None, + fmt: str = "png", +) -> Path: + """Invoke igver against the same sites + track list that drove + create_report, write a manifest mapping each BED row to its PNG path + and HTML row, return the manifest path. + + Consistency contract (the five levers from the design): + 1. Same sites BED + same `flanking` baked into the BED rows we pass. + 2. Same resolved track list. + 3. `display_mode` chosen to match HTML defaults (collapse). + 4. UID-based filenames let a user pair PNG ↔ HTML by string match. + 5. The manifest TSV is the audit trail; verify_cohort.py reads it. 
+ + Output layout (caller controls `out_dir`): + out_dir/ + igver_regions.bed - flanked BED with UIDs in col 4 (igver -r) + igver_input.txt - track paths, one per line (igver -i) + png/ - actual PNGs (igver -o); filenames are + chr-start-end.<uid>.<png|svg|pdf> + manifest.tsv - cross-artifact bridge to the HTML + """ + out_dir.mkdir(parents=True, exist_ok=True) + png_dir = out_dir / "png" + png_dir.mkdir(parents=True, exist_ok=True) + + rows = _read_sites_bed_rows(sites) + if not rows: + raise SystemExit(f"ERROR: no data rows found in sites BED: {sites}") + + regions_bed = out_dir / "igver_regions.bed" + _write_igver_regions_bed(rows, flanking, regions_bed) + input_txt = out_dir / "igver_input.txt" + _write_igver_input_list(tracks, input_txt) + + cmd = list(_resolve_igver_cmd(igver_cmd)) + [ + "-i", str(input_txt), + "-r", str(regions_bed), + "-o", str(png_dir), + "-g", genome, + "-d", display_mode, + "--dpi", str(dpi), + "-f", fmt, + "--no-singularity", + ] + if panel_height is not None: + cmd.extend(["-p", str(panel_height)]) + + log.info(f" igver: dpi={dpi} display={display_mode} fmt={fmt} regions={len(rows)}") + log.info(f" igver cmd: {' '.join(cmd)}") + proc = subprocess.run(cmd, capture_output=True, text=True) + if proc.returncode != 0: + log.error(f"igver FAILED for {sites}") + log.error(f"stdout: {proc.stdout}") + log.error(f"stderr: {proc.stderr}") + raise SystemExit(proc.returncode) + + # PNG filename convention is set by igver's _parse_bed_file: + # `<chrom>-<start>-<end>.<uid>.<ext>`. We reconstruct it here. 
+ ext = "svg" if fmt in ("svg", "pdf") else fmt + manifest = out_dir / "manifest.tsv" + with manifest.open("w") as fh: + fh.write( + "#bed_row_idx\tuid\tchrom\tstart_orig\tend_orig\t" + "start_flanked\tend_flanked\tregion\tpng_path\thtml_path\thtml_table_row\n" + ) + for r in rows: + start_f = max(0, r["start"] - flanking) + end_f = r["end"] + flanking + fname = f"{r['chrom']}-{start_f}-{end_f}.{r['name']}.{ext}" + png_rel = (png_dir / fname).resolve() + html_rel = html_path.resolve() + fh.write( + f"{r['bed_row_idx']}\t{r['name']}\t{r['chrom']}\t" + f"{r['start']}\t{r['end']}\t{start_f}\t{end_f}\t" + f"{r['chrom']}:{start_f}-{end_f}\t{png_rel}\t{html_rel}\t" + f"{r['bed_row_idx']}\n" + ) + + log.info(f" png manifest: {manifest} ({len(rows)} rows)") + + # Inline existence check — igver exits 0 even when it fails to render + # (silent exit-0 failure documented in the upstream skill's notes). + # We can't trust the exit code, so verify every expected PNG path is + # on disk and non-empty before returning. Without this a "successful" + # build silently ships an empty png/ dir. + missing: list[str] = [] + empty: list[str] = [] + for r in rows: + start_f = max(0, r["start"] - flanking) + end_f = r["end"] + flanking + fname = f"{r['chrom']}-{start_f}-{end_f}.{r['name']}.{ext}" + p = png_dir / fname + if not p.exists(): + missing.append(fname) + elif p.stat().st_size == 0: + empty.append(fname) + if missing or empty: + log.error( + f"igver returned exit 0 but {len(missing)} expected PNG(s) are missing " + f"and {len(empty)} are zero-byte (out of {len(rows)} regions). " + "This is a documented silent-failure mode of igver." + ) + if missing: + log.error(f" missing: {missing[:5]}{'...' if len(missing) > 5 else ''}") + if empty: + log.error(f" empty: {empty[:5]}{'...' if len(empty) > 5 else ''}") + raise SystemExit( + f"ERROR: igver produced {len(rows) - len(missing) - len(empty)} of " + f"{len(rows)} PNGs (silent exit-0 failure). 
Check the igver install " + "path — `pip install igver` egg-link lacks the IGV Java binary; use " + "an apptainer SIF via --igver-cmd or $IGVER_CMD." + ) + + return manifest + + +def build_one( + sites: Path, + bams: list[Path], + vcf: Path | None, + extra_tracks: list[Path], + fasta: str, + default_tracks: list[str], + output: Path, + title: str, + flanking: int, + log: logging.Logger, + track_config: Path | None = None, + report_type: str | None = None, + info_columns: list[str] | None = None, + use_apptainer: bool = False, + also_png: bool = False, + igver_cmd: str | None = None, + png_dpi: int = 300, + png_display_mode: str = "collapse", + png_out_dir: Path | None = None, +) -> Path: + """Run create_report for one site set and return the HTML path. + + Two track modes: + * Default — positional `--tracks <path> <path> ...`. Used when + `track_config` is None. BAM + VCF + extra + default annotations, + in render order top-to-bottom. + * track-config — `--track-config <json>`. Used when `track_config` + is provided. The JSON is the source of truth; default_tracks, + bams, vcf, extra_tracks are IGNORED (they go in the JSON instead). + This is the path required for ONT methylation viewers (named + tracks, per-track color/min/max/colorBy/displayMode). + """ + validate_sites_bed(sites) + # Only validate BAMs on the positional --tracks path. The --track-config + # JSON has its own track-resolution semantics and may reference BAMs by + # arbitrary url:; create_report itself will fail loudly there if needed. 
+ if track_config is None: + validate_bams(bams) + output.parent.mkdir(parents=True, exist_ok=True) + + create_report_cmd = ( + apptainer_create_report_prefix(IGVREPORTS_SIF) if use_apptainer + else [find_create_report()] + ) + + cmd: list[str] = list(create_report_cmd) + [ + str(sites), + "--fasta", fasta, + "--flanking", str(flanking), + ] + + if track_config is not None: + cmd.extend(["--track-config", str(track_config)]) + log.info(f" track-config: {track_config} (defaults+bams+vcf bypassed)") + if bams or vcf or extra_tracks or default_tracks: + log.warning( + "--track-config supplied; ignoring --bam/--vcf/--extra-track and " + "auto-resolved default tracks. Put everything in the JSON instead." + ) + else: + # Track ordering: BAMs (data) -> VCF (calls) -> extra -> defaults (annotation, last). + tracks: list[str] = [str(b) for b in bams] + if vcf: + tracks.append(str(vcf)) + tracks.extend(str(t) for t in extra_tracks) + tracks.extend(default_tracks) + cmd.extend(["--tracks", *tracks]) + log.info(f" tracks (in render order):") + for i, t in enumerate(tracks, start=1): + log.info(f" {i:>2}. {t}") + + if report_type: + cmd.extend(["--type", report_type]) + if info_columns: + cmd.extend(["--info-columns", *info_columns]) + + cmd.extend([ + "--standalone", + "--title", title, + "--output", str(output), + ]) + + log.info(f" cmd: {' '.join(cmd)}") + log.info(f" flanking_bp: {flanking}") + + proc = subprocess.run(cmd, capture_output=True, text=True) + if proc.returncode != 0: + log.error(f"create_report FAILED for {sites}") + log.error(f"stdout: {proc.stdout}") + log.error(f"stderr: {proc.stderr}") + raise SystemExit(proc.returncode) + + if output.exists(): + log.info(f" HTML: {output} ({output.stat().st_size / 1024 / 1024:.2f} MB)") + + # PNG sidecar — same regions, same tracks, written next to the HTML. 
def parse_samplesheet(path: Path) -> list[dict]:
    """Parse a tab-separated cohort samplesheet into one dict per sample row.

    The first line is the header (an optional leading `#` is dropped). Rows
    whose first column is blank are skipped. Rows shorter than the header are
    padded with empty strings so every returned dict has every header key;
    cells beyond the header width are ignored.

    The required columns (`sample`, `sites_bed`) are checked on the header
    itself, so a header-only or ragged sheet is diagnosed correctly rather
    than failing later with a KeyError.

    Raises:
        SystemExit: if a required column is absent from the header, or a
            data row has no value for a required column.
    """
    required = ("sample", "sites_bed")
    rows: list[dict] = []
    with path.open() as fh:
        # rstrip("\r\n") so CRLF sheets don't produce a `sites_bed\r` key.
        header = [h.strip() for h in fh.readline().lstrip("#").rstrip("\r\n").split("\t")]
        if any(col not in header for col in required):
            raise SystemExit(
                f"ERROR: samplesheet must have columns: sample, sites_bed (got {header}).\n"
                " Optional columns: bam_tumor, bam_normal, vcf, extra_tracks (comma-separated)."
            )
        for lineno, ln in enumerate(fh, start=2):
            cols = ln.rstrip("\r\n").split("\t")
            if not cols[0].strip():
                continue
            if len(cols) < len(header):
                cols.extend([""] * (len(header) - len(cols)))
            row = dict(zip(header, cols))
            blank = [col for col in required if not row[col].strip()]
            if blank:
                raise SystemExit(
                    f"ERROR: {path}:{lineno}: missing value for required column(s): "
                    f"{', '.join(blank)}"
                )
            rows.append(row)
    return rows


def derive_log_path(out_dir: Path, override: Path | None = None) -> Path:
    """Choose the log directory and return a timestamped log-file path in it.

    Without `override`, prefer a `logs/` directory that is a sibling of
    `out_dir` (the `results/<run>/{reports,logs}/` layout); if that sibling
    cannot be created, fall back to `out_dir/logs/`. An explicit `override`
    is honored unconditionally. The chosen directory is created if needed.

    Returns:
        `<log_dir>/run_<YYYYmmdd_HHMMSS>.log` (the file itself is not created).
    """
    if override is not None:
        log_dir = override
    else:
        out_dir = out_dir.resolve()
        sibling = out_dir.parent / "logs"
        try:
            sibling.mkdir(parents=True, exist_ok=True)
            log_dir = sibling
        except OSError:  # includes PermissionError (read-only / root parent)
            log_dir = out_dir / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)
    return log_dir / f"run_{datetime.now():%Y%m%d_%H%M%S}.log"

def write_index(report_paths: dict[str, Path], out: Path, title: str) -> Path:
    """Write a minimal HTML index linking to every per-sample report.

    Links use only each report's file name (``p.name``), so the index is
    expected to live in the same directory as the reports. Entries are
    sorted by sample name.

    Args:
        report_paths: sample name -> path of that sample's report HTML.
        out: path of the index file to (over)write.
        title: text used for both ``<title>`` and the page heading.

    Returns:
        ``out``, unchanged, for convenient logging by the caller.
    """
    # Local imports: keeps this block self-contained without touching the
    # file-level import list.
    from html import escape
    from urllib.parse import quote

    # Sample names and titles are free text; escape them so "<", "&" or quotes
    # cannot break the markup, and percent-encode file names used in hrefs.
    safe_title = escape(title)
    items = "\n".join(
        f' <li><a href="{escape(quote(p.name))}">{escape(s)}</a></li>'
        for s, p in sorted(report_paths.items())
    )
    out.write_text(
        '<!doctype html>\n<html><head><meta charset="utf-8"><title>'
        + safe_title
        + "</title></head>\n"
        f"<body>\n<h1>{safe_title}</h1>\n<ul>\n{items}\n</ul>\n</body></html>\n",
        encoding="utf-8",  # matches the declared charset regardless of locale
    )
    return out
def run_anchors_generate(
    samplesheet: Path,
    sites_files: list[Path],
    out: Path,
    fail_on_fail: bool,
    log: logging.Logger,
) -> None:
    """Run `verify_anchors.py generate` once per sites BED and merge the results.

    The script is looked up next to this file; if it is absent the step is
    skipped with a warning. Each invocation writes to a temporary
    ``<out stem>.part<i>.tsv`` which is merged into ``out`` and then removed.
    The first *successful* part is copied whole (header included); later
    parts contribute only their non-empty, non-``#`` lines.

    Args:
        samplesheet: cohort samplesheet passed through to the script.
        sites_files: distinct sites BED files, one script call each.
        out: merged anchors TSV (truncated at start).
        fail_on_fail: if true, a nonzero script exit aborts with that code;
            otherwise it is logged and the remaining sites files still run.
        log: run logger; the script's stdout/stderr are mirrored into it.

    Raises:
        SystemExit: with the script's exit code when it fails and
            ``fail_on_fail`` is set.
    """
    script = Path(__file__).resolve().parent / "verify_anchors.py"
    if not script.exists():
        log.warning(f"anchors generate: script not found at {script} — skipping")
        return
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text("")  # truncate; per-sites blocks appended below
    # Track "has anything been merged yet" instead of testing i == 0: when the
    # first sites file fails in non-strict mode, the next successful part must
    # still contribute the header.
    merged_any = False
    for i, sites in enumerate(sites_files):
        block = out.with_suffix(f".part{i}.tsv")
        cmd = [
            sys.executable, str(script), "generate",
            "--samplesheet", str(samplesheet),
            "--sites", str(sites),
            "--out", str(block),
        ]
        log.info(f"anchors generate: {' '.join(cmd)}")
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True)
            for line in (proc.stdout or "").splitlines():
                log.info(f" anchors > {line}")
            for line in (proc.stderr or "").splitlines():
                log.info(f" anchors (stderr) > {line}")
            if proc.returncode != 0:
                if fail_on_fail:
                    raise SystemExit(proc.returncode)
                log.warning(f"anchors generate exited {proc.returncode}; continuing")
                continue
            if not block.exists():
                log.warning(f"anchors generate: no output produced for {sites}; continuing")
                continue
            with out.open("a") as fh:
                for ln in block.read_text().splitlines():
                    if merged_any and (not ln or ln.startswith("#")):
                        continue  # header/comment lines come from the first part only
                    fh.write(ln + "\n")
            merged_any = True
        finally:
            # Never leave part files behind, including on failure paths.
            block.unlink(missing_ok=True)
    log.info(f"anchors generate: wrote {out}")
def run_anchors_verify(
    samplesheet: Path,
    reports_dir: Path,
    genome: str,
    anchors: Path,
    fail_on_fail: bool,
    log: logging.Logger,
) -> None:
    """Run `verify_anchors.py verify-cohort` against a finished cohort build.

    The script is resolved next to this file. The step is skipped with a
    warning when either the script or the anchors TSV does not exist. The
    result TSV path is ``reports_dir/cohort_verify_anchors.tsv``, and the
    script's stdout and stderr are both mirrored into ``log`` at INFO level.

    Raises:
        SystemExit: with the script's exit code when it is nonzero and
            ``fail_on_fail`` is set; otherwise a nonzero exit is only warned.
    """
    verifier = Path(__file__).resolve().parent / "verify_anchors.py"
    if not verifier.exists():
        log.warning(f"anchors verify: script not found at {verifier} — skipping")
        return
    if not anchors.exists():
        log.warning(f"anchors verify: anchors TSV missing: {anchors} — skipping")
        return

    out = reports_dir / "cohort_verify_anchors.tsv"
    argv = [sys.executable, str(verifier), "verify-cohort"]
    for flag, value in (
        ("--samplesheet", samplesheet),
        ("--reports-dir", reports_dir),
        ("--genome", genome),
        ("--anchors", anchors),
        ("--out", out),
    ):
        argv.extend((flag, str(value)))
    if fail_on_fail:
        argv.append("--fail-on-fail")
    log.info(f"anchors verify: {' '.join(argv)}")

    result = subprocess.run(argv, capture_output=True, text=True)
    # stdout first, then stderr, each line tagged with its origin.
    for tag, captured in ((" anchors > ", result.stdout), (" anchors (stderr) > ", result.stderr)):
        for line in (captured or "").splitlines():
            log.info(f"{tag}{line}")
    log.info(f"anchors verify: TSV={out} exit={result.returncode}")

    if result.returncode == 0:
        return
    if fail_on_fail:
        raise SystemExit(result.returncode)
    log.warning(f"anchors verify exited {result.returncode}; --fail-on-fail not set, continuing")
def run_cohort_verify(
    samplesheet: Path,
    reports_dir: Path,
    genome: str,
    db_config: Path,
    fail_on_fail: bool,
    log: logging.Logger,
) -> None:
    """Run verify_cohort.py once a cohort build has finished.

    The script is resolved next to this file and skipped with a warning when
    absent. It is asked to write ``cohort_verify.tsv`` and
    ``cohort_verify.summary.md`` into ``reports_dir``. Its stdout is mirrored
    into ``log`` at INFO and its stderr at WARNING, so the run log is the
    single audit trail.

    Raises:
        SystemExit: with the verifier's exit code when it is nonzero and
            ``fail_on_fail`` is set; otherwise a nonzero exit is only warned.
    """
    verifier = Path(__file__).resolve().parent / "verify_cohort.py"
    if not verifier.exists():
        log.warning(f"verify_cohort: script not found at {verifier} — skipping")
        return

    tsv_out = reports_dir / "cohort_verify.tsv"
    md_out = reports_dir / "cohort_verify.summary.md"
    argv = [sys.executable, str(verifier)]
    for flag, value in (
        ("--samplesheet", samplesheet),
        ("--reports-dir", reports_dir),
        ("--genome", genome),
        ("--db-config", db_config),
        ("--out", tsv_out),
        ("--summary", md_out),
    ):
        argv.extend((flag, str(value)))
    if fail_on_fail:
        argv.append("--fail-on-fail")
    log.info(f"verify_cohort: running {' '.join(argv)}")

    result = subprocess.run(argv, capture_output=True, text=True)
    for line in (result.stdout or "").splitlines():
        log.info(f" verify_cohort > {line}")
    for line in (result.stderr or "").splitlines():
        log.warning(f" verify_cohort (stderr) > {line}")
    log.info(f"verify_cohort: TSV={tsv_out} summary={md_out} exit={result.returncode}")

    if result.returncode == 0:
        return
    if fail_on_fail:
        raise SystemExit(result.returncode)
    log.warning(f"verify_cohort: exited {result.returncode} but --fail-on-fail not set; continuing")
def _build_arg_parser() -> argparse.ArgumentParser:
    """Declare the build_igvreports.py command line; parsing happens in main()."""
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--genome", required=True, help="hg38 | mm10 | mm39 | t2t | GRCh37 (alias-tolerant)")
    ap.add_argument("--db-config", default=(str(DEFAULT_DBCONFIG) if DEFAULT_DBCONFIG else None), help=(
        "YAML resolving genome -> {fasta, CpGIslands, gtf, repMaskerBed}. "
        "Schema: see references/databases_config_paths.md. "
        "Override via $IGV_REPORTS_DB_CONFIG, or skip entirely with --fasta + --no-default-tracks."
    ))
    ap.add_argument("--fasta", help=(
        "Explicit FASTA path; bypasses --db-config for FASTA lookup. "
        "Required when --db-config is not set or doesn't list the chosen genome. "
        "Requires a sibling .fai (run `samtools faidx`)."
    ))
    ap.add_argument("--no-default-tracks", action="store_true", help=(
        "Skip the CpG-islands/gencode/RepeatMasker auto-tracks from --db-config. "
        "Combine with --fasta and --extra-track to operate without a databases YAML."
    ))
    ap.add_argument("--flanking", type=int, default=DEFAULT_FLANKING)
    ap.add_argument("--extra-track", action="append", default=[], help="(repeat) extra track path; rendered above default annotations")

    mode = ap.add_mutually_exclusive_group(required=True)
    mode.add_argument("--samplesheet", help="TSV: sample, [bam_tumor, bam_normal, vcf,] sites_bed[, extra_tracks]")
    mode.add_argument("--sites", help="path to sites BED for single-sample mode")

    ap.add_argument("--bam", nargs="*", default=[], help="BAM/CRAM tracks (single-sample mode)")
    ap.add_argument("--vcf", help="VCF track (single-sample mode)")

    ap.add_argument("--output", help="output HTML path (single-sample mode)")
    ap.add_argument("--output-dir", help="output dir for cohort mode (default: ./reports)")
    ap.add_argument("--title", default=None, help="report title; defaults to sample name + genome")

    ap.add_argument(
        "--track-config",
        help="path to a tracks.json (igv.js track config). When set, the JSON is "
             "passed straight to create_report --track-config and all default "
             "tracks / --bam / --vcf / --extra-track are bypassed. Use this for "
             "ONT methylation viewers — see examples/methylation_ont/.",
    )
    ap.add_argument(
        "--type",
        dest="report_type",
        choices=["mutation", "fusion", "junction"],
        default=None,
        help="create_report --type. Sets viewer behaviour at each site.",
    )
    ap.add_argument(
        "--info-columns",
        nargs="*",
        default=[],
        help="VCF INFO or BED columns to surface in the variant table. "
             "For BED sites, 'name' is the most useful.",
    )
    ap.add_argument(
        "--also-png",
        action="store_true",
        help="After create_report finishes, invoke igver against the same "
             "sites BED + track list to produce per-region PNGs alongside "
             "the HTML. PNGs land in <output_dir>/png_<html_stem>/png/ with "
             "filename `<uid>.<genome>.png` (uid = BED `name` col, "
             "auto-assigned `region_<N>` when missing). A manifest TSV "
             "bridges PNG ↔ HTML rows. Requires `igver` on PATH or "
             "$IGVER_CMD / --igver-cmd override.",
    )
    ap.add_argument(
        "--igver-cmd",
        default=None,
        help="Override the igver invocation. Resolution order: this flag, "
             "$IGVER_CMD, `igver` on PATH. Pass the full command including "
             "any apptainer wrapper, e.g. 'apptainer exec /path/to/igver.sif igver'.",
    )
    ap.add_argument(
        "--png-dpi",
        type=int,
        default=300,
        help="DPI for igver PNG output (default 300; bump to 600 for slide-quality).",
    )
    ap.add_argument(
        "--png-display-mode",
        choices=["expand", "collapse", "squish"],
        default="collapse",
        help="igver `-d` flag. Default 'collapse' to match the HTML's BAM "
             "BAM_DEFAULTS displayMode. Use 'expand' for per-read SV inspection.",
    )
    ap.add_argument(
        "--apptainer",
        action=argparse.BooleanOptionalAction,
        default=None,
        help="Run create_report from inside the apptainer SIF pointed to "
             "by $IGV_REPORTS_SIF (dedicated igv-reports 1.16.0 SIF, ~83 MB; "
             "pull from the Galaxy depot). Skips the NFS conda cold-start "
             "tax on HPC. Default: auto-detect — on if SLURM_JOB_ID is set "
             "AND $IGV_REPORTS_SIF points at an existing SIF, off otherwise. "
             "Override either way with --apptainer / --no-apptainer.",
    )
    ap.add_argument(
        "--log-dir",
        help="explicit log directory. Default: sibling 'logs/' of the output "
             "dir (matches results/<run>/{reports,logs}/ lab layout); falls "
             "back to <output_dir>/logs/ when the sibling is unwritable.",
    )
    ap.add_argument(
        "--jobs",
        "-j",
        type=int,
        default=1,
        help="Number of parallel per-sample builds in cohort (--samplesheet) "
             "mode. Each worker invokes create_report in a subprocess, so the "
             "win comes from running multiple slicers concurrently against "
             "different BAMs. I/O-bound on the BAM-slice step, so threads "
             "scale well to ~min(N_samples, N_cores). Default 1 (sequential, "
             "preserves prior behavior). Has no effect in single-sample mode.",
    )
    ap.add_argument(
        "--verify",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Run scripts/verify_cohort.py at the end of cohort builds "
             "(--samplesheet mode). Single-sample (--sites) mode is unaffected "
             "and emits no cohort verify TSV. Default: on. Use --no-verify to "
             "skip. The verifier inherits --fail-on-fail.",
    )
    ap.add_argument(
        "--fail-on-fail",
        action="store_true",
        help="Propagated to verify_cohort.py and verify_anchors.py: exit "
             "nonzero if any verifier check is FAIL. Only meaningful with "
             "--verify / --anchors-mode and --samplesheet.",
    )
    ap.add_argument(
        "--anchors-mode",
        choices=["off", "generate", "verify"],
        default="off",
        help="Content (read-count) verification — opt-in because it shells "
             "out to samtools per (sample, region) and is slow. 'generate' "
             "runs samtools view -c against source BAMs at build time and "
             "freezes the counts to --anchors (becomes regression fixture). "
             "'verify' decodes each BAM slice from the built HTMLs and "
             "compares to --anchors. 'off' (default) skips. See "
             "examples/anchor_verify_demo/.",
    )
    ap.add_argument(
        "--anchors",
        help="Path to anchors TSV. With --anchors-mode generate: output. "
             "With --anchors-mode verify: input. Ignored when mode=off.",
    )
    return ap


def main() -> None:
    """CLI entry point: build one report (--sites) or a cohort (--samplesheet).

    Single-sample mode calls build_one() once. Cohort mode builds one HTML per
    samplesheet row (optionally in a thread pool), writes ``index.html``, then
    runs the cohort verifier and, if requested, the anchors generate/verify
    step.

    Raises:
        SystemExit: on invalid argument combinations, missing inputs, any
            per-sample build failure (exit 1), or a verifier failure when
            --fail-on-fail is set.
    """
    args = _build_arg_parser().parse_args()

    # Fail fast on argument combinations that would otherwise only be noticed
    # after the whole (potentially long) cohort build has finished.
    if args.samplesheet and args.anchors_mode != "off" and not args.anchors:
        raise SystemExit("ERROR: --anchors PATH required when --anchors-mode != off")
    if args.samplesheet and args.verify and args.fail_on_fail and not args.db_config:
        raise SystemExit(
            "ERROR: --verify with --fail-on-fail needs a databases YAML for verify_cohort.py.\n"
            " Pass --db-config / set $IGV_REPORTS_DB_CONFIG, or disable the verifier with --no-verify."
        )

    genome = resolve_genome(args.genome)
    # Only load db-config when something actually needs it (fasta lookup or
    # default tracks). Saves the warning noise + lets a fully-explicit
    # --fasta + --no-default-tracks invocation run with no YAML at all.
    need_db_config = (not args.fasta) or (not args.no_default_tracks)
    if need_db_config and not args.db_config:
        raise SystemExit(
            "ERROR: need a databases YAML to resolve FASTA / default tracks.\n"
            " Either:\n"
            " - pass --fasta /path/to/genome.fa --no-default-tracks "
            "[--extra-track ...], or\n"
            " - set $IGV_REPORTS_DB_CONFIG / --db-config to a YAML matching\n"
            " the schema in references/databases_config_paths.md."
        )
    cfg = load_db_config(Path(args.db_config)) if need_db_config else {}
    if args.fasta:
        fasta = args.fasta
        if not Path(fasta).exists():
            raise SystemExit(f"ERROR: --fasta path not found: {fasta}")
        if not Path(fasta + ".fai").exists():
            raise SystemExit(
                f"ERROR: FASTA index missing for {fasta} — run `samtools faidx {fasta}`"
            )
    else:
        fasta = fasta_for(cfg, genome)

    # Logger placed alongside the output. See derive_log_path docstring.
    if args.samplesheet:
        out_dir = Path(args.output_dir or "reports")
    else:
        if not args.output:
            raise SystemExit("ERROR: --output required in single-sample mode")
        out_dir = Path(args.output).parent
    out_dir.mkdir(parents=True, exist_ok=True)
    log_path = derive_log_path(out_dir, Path(args.log_dir) if args.log_dir else None)
    log = setup_logger(log_path)

    log.info(f"=== igv-reports skill, genome={genome} ===")
    log.info(f"db_config: {args.db_config}")
    log.info(f"fasta: {fasta}")
    log.info(f"flanking: {args.flanking} bp (default {DEFAULT_FLANKING})")

    # Resolve --apptainer auto-detect. Tri-state:
    #   user said --apptainer     -> True
    #   user said --no-apptainer  -> False
    #   user said nothing (None)  -> True iff SLURM_JOB_ID is set AND
    #                                $IGV_REPORTS_SIF points at an existing SIF
    # The existence check protects users from a confusing SIF-not-found error
    # when they didn't ask for apptainer. See rules/apptainer_vs_conda.md.
    slurm_job = os.environ.get("SLURM_JOB_ID")
    if args.apptainer is None:
        sif_ok = IGVREPORTS_SIF is not None and IGVREPORTS_SIF.exists()
        args.apptainer = bool(slurm_job) and sif_ok
        if args.apptainer:
            decision = f"auto-enabled (SLURM_JOB_ID={slurm_job}, SIF={IGVREPORTS_SIF})"
        elif slurm_job and IGVREPORTS_SIF is None:
            decision = (
                f"auto-disabled (SLURM_JOB_ID={slurm_job} set, but $IGV_REPORTS_SIF unset; "
                f"falling back to PATH create_report)"
            )
        elif slurm_job:
            decision = (
                f"auto-disabled (SLURM_JOB_ID={slurm_job} set, but SIF not found at "
                f"{IGVREPORTS_SIF}; falling back to PATH create_report)"
            )
        else:
            decision = "auto-disabled (no SLURM_JOB_ID; PATH create_report path)"
        log.info(f"apptainer: {decision}")
    else:
        log.info(f"apptainer: {args.apptainer} (explicit)")

    if args.no_default_tracks:
        default_tracks: list[str] = []
        log.info("default tracks: skipped (--no-default-tracks)")
    else:
        default_tracks = resolve_default_tracks(cfg, genome, log)
        log.info(f"default tracks resolved: {len(default_tracks)}")
        for t in default_tracks:
            log.info(f" - {t}")

    extra_tracks = [Path(p) for p in args.extra_track]

    track_config = Path(args.track_config) if args.track_config else None
    if track_config is not None and not track_config.exists():
        raise SystemExit(f"ERROR: --track-config file not found: {track_config}")

    if args.sites:
        title = args.title or f"{Path(args.sites).stem} ({genome})"
        build_one(
            sites=Path(args.sites),
            bams=[Path(b) for b in args.bam],
            vcf=Path(args.vcf) if args.vcf else None,
            extra_tracks=extra_tracks,
            fasta=fasta,
            default_tracks=default_tracks,
            output=Path(args.output),
            title=title,
            flanking=args.flanking,
            log=log,
            track_config=track_config,
            report_type=args.report_type,
            info_columns=args.info_columns,
            use_apptainer=args.apptainer,
            also_png=args.also_png,
            igver_cmd=args.igver_cmd,
            png_dpi=args.png_dpi,
            png_display_mode=args.png_display_mode,
        )
    else:
        rows = parse_samplesheet(Path(args.samplesheet))
        n_jobs = max(1, args.jobs)
        n_workers = min(n_jobs, len(rows)) if rows else 1
        mode_desc = "sequential" if n_workers == 1 else f"parallel ({n_workers} workers)"
        log.info(f"cohort: {len(rows)} samples from {args.samplesheet} — {mode_desc}")
        report_paths: dict[str, Path] = {}
        failures: list[tuple[str, str]] = []  # (sample, error_message)

        def _build_row(row: dict) -> tuple[str, Path]:
            """Build one sample. Runs in a worker thread when --jobs > 1.

            Returns (sample, out_html). Raises on build failure — the caller
            records it via the future / try-except and keeps going."""
            sample = row["sample"]
            sites = Path(row["sites_bed"])
            bams = [Path(row[k]) for k in ("bam_tumor", "bam_normal") if row.get(k)]
            vcf = Path(row["vcf"]) if row.get("vcf") else None
            sample_extras = list(extra_tracks)
            if row.get("extra_tracks"):
                sample_extras += [Path(p.strip()) for p in row["extra_tracks"].split(",") if p.strip()]
            out_html = out_dir / f"{sample}.{genome}.html"
            title = args.title or f"{sample} ({genome})"
            log.info(f"=== {sample} ===")
            build_one(
                sites=sites, bams=bams, vcf=vcf, extra_tracks=sample_extras,
                fasta=fasta, default_tracks=default_tracks,
                output=out_html, title=title, flanking=args.flanking, log=log,
                track_config=track_config,
                report_type=args.report_type,
                info_columns=args.info_columns,
                use_apptainer=args.apptainer,
                also_png=args.also_png,
                igver_cmd=args.igver_cmd,
                png_dpi=args.png_dpi,
                png_display_mode=args.png_display_mode,
            )
            return sample, out_html

        # ThreadPoolExecutor is the right primitive here: build_one() spends
        # nearly all its wall time inside subprocess.run(create_report), which
        # releases the GIL — so threads scale with the number of concurrent
        # create_report processes the host can support. Don't use
        # ProcessPoolExecutor: build_one() captures a non-picklable logger.
        if n_workers == 1:
            for row in rows:
                try:
                    sample, out_html = _build_row(row)
                    report_paths[sample] = out_html
                except SystemExit as exc:
                    failures.append((row.get("sample", "?"), f"exit={exc.code}"))
                except Exception as exc:
                    failures.append((row.get("sample", "?"), f"{type(exc).__name__}: {exc}"))
        else:
            with ThreadPoolExecutor(max_workers=n_workers) as pool:
                future_to_sample = {
                    pool.submit(_build_row, row): row.get("sample", "?") for row in rows
                }
                # as_completed lets failures surface immediately while other
                # samples continue building. We collect all errors and decide
                # whether to fail the whole run at the end.
                for fut in as_completed(future_to_sample):
                    sample_name = future_to_sample[fut]
                    try:
                        sample, out_html = fut.result()
                        report_paths[sample] = out_html
                    except SystemExit as exc:
                        failures.append((sample_name, f"exit={exc.code}"))
                    except Exception as exc:
                        failures.append((sample_name, f"{type(exc).__name__}: {exc}"))

        if failures:
            log.error(f"cohort: {len(failures)} of {len(rows)} samples FAILED:")
            for s, err in failures:
                log.error(f" - {s}: {err}")
            # Always raise on build failures — these aren't verifier soft-fails,
            # they're missing HTMLs. --fail-on-fail is for verifier behavior.
            raise SystemExit(1)

        idx = write_index(report_paths, out_dir / "index.html", f"igv-reports cohort ({genome})")
        log.info(f"Wrote cohort index: {idx}")

        if not args.verify:
            log.info("verify_cohort: skipped (--no-verify)")
        elif not args.db_config:
            # Reachable with --fasta + --no-default-tracks and no YAML. The
            # verifier is always handed --db-config, and Path(None) would raise
            # TypeError here, after every report had already been built.
            log.warning(
                "verify_cohort: skipped — no --db-config / $IGV_REPORTS_DB_CONFIG available; "
                "pass one to enable the cohort verifier"
            )
        else:
            run_cohort_verify(
                samplesheet=Path(args.samplesheet),
                reports_dir=out_dir,
                genome=genome,
                db_config=Path(args.db_config),
                fail_on_fail=args.fail_on_fail,
                log=log,
            )

        if args.anchors_mode != "off":
            anchors_path = Path(args.anchors)  # presence validated right after parsing
            if args.anchors_mode == "generate":
                sites_files = sorted({Path(r["sites_bed"]) for r in rows if r.get("sites_bed")})
                run_anchors_generate(
                    samplesheet=Path(args.samplesheet),
                    sites_files=sites_files,
                    out=anchors_path,
                    fail_on_fail=args.fail_on_fail,
                    log=log,
                )
            else:  # verify
                run_anchors_verify(
                    samplesheet=Path(args.samplesheet),
                    reports_dir=out_dir,
                    genome=genome,
                    anchors=anchors_path,
                    fail_on_fail=args.fail_on_fail,
                    log=log,
                )

    log.info("=== DONE: build_igvreports.py completed successfully ===")
0000000..baedd42 --- /dev/null +++ b/igv-reports/scripts/generate_tracks_json.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +"""generate_tracks_json.py — build an igv-reports tracks.json from a YAML spec. + +Author: Samuel Ahuno +Purpose: + ONT methylation viewers need named, colored, y-axis-locked tracks that + the positional `create_report --tracks` API cannot express. The path is + `--track-config `, but hand-writing that JSON for 4-8 samples + with 5mC + 5hmC bedGraph pairs each is tedious and error-prone. + + This helper consumes a small YAML spec (see + examples/methylation_ont/tracks_spec.example.yaml) and emits the JSON + with the right defaults baked in: + + * BAM tracks -> colorBy=basemod2, showSoftClips=false, displayMode=COLLAPSED + * bedGraph -> type=wig, min=0, max=100 (methylation percent) + * Annotation -> displayMode honored, color honored + * Group color -> reads from `group_colors:` map keyed by sample.group + +Usage: + python generate_tracks_json.py \ + --spec examples/methylation_ont/tracks_spec.example.yaml \ + --run-dir examples/methylation_ont \ + --out examples/methylation_ont/tracks.json + + --run-dir is prepended to any relative `url:` path in the spec, so the + emitted JSON has absolute paths that create_report can resolve from any + working directory. +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print("ERROR: PyYAML not available. Install with: pip install pyyaml", file=sys.stderr) + sys.exit(2) + + +BAM_DEFAULTS = { + "format": "bam", + "type": "alignment", + "colorBy": "basemod2", + "showSoftClips": False, + "displayMode": "COLLAPSED", +} + +BEDGRAPH_DEFAULTS = { + "format": "bedgraph", + "type": "wig", + "min": 0, + "max": 100, +} + + +# YAML shortcut keys (annotation: - default: ) map to the +# databases_config.yaml field for each genome plus display metadata. 
# Colors are Okabe-Ito where chosen — colorblind-safe. format/displayMode
# match what build_igvreports.py emits on the non-track-config path.
ANNOTATION_DEFAULTS = {
    "cgi": {
        "display_name": "CpG islands",
        "yaml_key": "CpGIslands",
        "format": "bed",
        "displayMode": "EXPANDED",
        "color": "rgb(0,158,115)",  # Okabe-Ito green
    },
    "gencode": {
        "display_name": "Gencode",
        "yaml_key": "gtf",
        "format": "gff",  # works for .gtf.gz and .gff3.gz
        "displayMode": "EXPANDED",
        "color": None,  # IGV.js renders its own gene-track palette
    },
    "repmasker": {
        "display_name": "RepeatMasker",
        "yaml_key": "repMaskerBed",
        "format": "bed",
        "displayMode": "COLLAPSED",
        "color": None,
    },
    "epdnew_coding": {
        "display_name": "EPDnew (coding)",
        "yaml_key": "EPDnewCoding",
        "format": "bed",
        "displayMode": "EXPANDED",
        "color": "rgb(213,94,0)",  # Okabe-Ito vermillion
    },
    "epdnew_noncoding": {
        "display_name": "EPDnew (non-coding)",
        "yaml_key": "EPDnewNonCoding",
        "format": "bed",
        "displayMode": "EXPANDED",
        "color": "rgb(86,180,233)",  # Okabe-Ito sky blue
    },
}


def load_db_config(path: Path) -> dict:
    """Load the databases YAML at ``path`` as a dict.

    A missing file is not fatal: a warning is written to stderr and ``{}`` is
    returned, so only specs that actually use ``default:`` shortcuts fail
    later. An empty (or otherwise falsy) document also yields ``{}``.

    NOTE(review): intended to mirror the same-named function in
    build_igvreports.py — that twin is not visible here; confirm they agree.

    Raises:
        SystemExit: if the YAML parses to something other than a mapping,
            which every consumer's ``.get(...)`` lookups would choke on.
    """
    if not path.exists():
        sys.stderr.write(
            f"[generate_tracks_json] WARNING: db-config not found at {path}\n"
            " Annotation entries using `default:` shortcuts will fail to resolve.\n"
            " Use explicit `url:` paths, or set $IGV_REPORTS_DB_CONFIG.\n"
        )
        return {}
    with path.open(encoding="utf-8") as fh:  # YAML streams are UTF-8; don't depend on locale
        loaded = yaml.safe_load(fh) or {}
    if not isinstance(loaded, dict):
        raise SystemExit(
            f"ERROR: db-config {path} must be a YAML mapping at top level "
            f"(got {type(loaded).__name__})."
        )
    return loaded
def resolve_annotation_default(default_key: str, genome: str, cfg: dict) -> dict:
    """Resolve a built-in annotation shortcut (`cgi`, `gencode`, ...) to a track.

    The path is read from ``cfg["reference_genomes"]["local"][genome]`` under
    the shortcut's ``yaml_key`` in ANNOTATION_DEFAULTS. For hg38 ``gencode`` a
    ``gencode.v47.annotation.gff3.gz`` sibling with a ``.tbi`` next to it is
    preferred when present.

    Args:
        default_key: shortcut name; must be a key of ANNOTATION_DEFAULTS.
        genome: genome name as it appears in the databases YAML.
        cfg: parsed databases YAML (may be empty).

    Returns:
        A partial track dict with ``display_name``, ``url``, ``format`` and
        ``displayMode``; ``indexURL`` only if a ``.tbi``/``.csi`` file exists
        next to the track, and ``color`` only if the shortcut defines one. The
        caller merges it with overrides from the spec.

    Raises:
        SystemExit: if the shortcut is unknown, the genome or its ``yaml_key``
            entry is absent (or null) in ``cfg``, or the resolved path does
            not exist on disk.
    """
    if default_key not in ANNOTATION_DEFAULTS:
        valid = ", ".join(sorted(ANNOTATION_DEFAULTS))
        raise SystemExit(
            f"ERROR: unknown annotation default '{default_key}'. Valid: {valid}"
        )
    meta = ANNOTATION_DEFAULTS[default_key]

    # Walk the nesting defensively: a YAML key that is present but empty
    # (`local:` with nothing under it) parses to None, which a chained
    # `.get(key, {})` does not guard against.
    node = cfg
    for key in ("reference_genomes", "local", genome):
        node = node.get(key) if isinstance(node, dict) else None
    g = node if isinstance(node, dict) else {}
    if not g:
        raise SystemExit(
            f"ERROR: db-config has no entry for genome '{genome}' "
            f"(needed to resolve `default: {default_key}`)."
        )
    yaml_key = meta["yaml_key"]
    raw = g.get(yaml_key)
    if not raw:
        raise SystemExit(
            f"ERROR: db-config has no '{yaml_key}' for genome '{genome}' "
            f"(needed to resolve `default: {default_key}`)."
        )
    # For hg38 gencode, prefer the bgzip+tabix .gff3.gz sibling if present
    # (NOTE(review): said to mirror build_igvreports.py:resolve_default_tracks
    # — not visible here; confirm).
    url = str(raw)
    if default_key == "gencode" and genome == "hg38":
        sibling = Path(url).parent / "gencode.v47.annotation.gff3.gz"
        if sibling.exists() and (sibling.parent / (sibling.name + ".tbi")).exists():
            url = str(sibling)
    if not Path(url).exists():
        raise SystemExit(
            f"ERROR: resolved path missing on disk for `default: {default_key}` "
            f"({genome}): {url}"
        )
    # indexURL: include only if it actually exists on disk.
    index_url = next(
        (cand for cand in (url + ".tbi", url + ".csi") if Path(cand).exists()),
        None,
    )

    track: dict = {
        "display_name": meta["display_name"],
        "url": url,
        "format": meta["format"],
        "displayMode": meta["displayMode"],
    }
    if index_url is not None:
        track["indexURL"] = index_url
    if meta["color"] is not None:
        track["color"] = meta["color"]
    return track
def abspath_relative_to(p: str, run_dir: Path) -> str:
    """Turn a track location from the spec into something usable from any cwd.

    * A remote URL (``scheme://...``, e.g. ``https://`` or ``s3://``) is
      returned untouched — it is not a filesystem path and joining it onto
      ``run_dir`` would corrupt it.
    * An absolute filesystem path is returned as-is.
    * A relative path is resolved against ``run_dir``.
    """
    text = str(p)
    scheme, sep, _rest = text.partition("://")
    looks_like_url = (
        bool(sep)
        and bool(scheme)
        and scheme[0].isalpha()
        and all(ch.isalnum() or ch in "+.-" for ch in scheme)
    )
    if looks_like_url:
        return text
    pp = Path(text)
    if pp.is_absolute():
        return str(pp)
    return str((run_dir / pp).resolve())
def build_annotation_tracks(spec: dict, run_dir: Path, cfg: dict | None = None) -> list[dict]:
    """Build the annotation-track list from ``spec["annotation"]``.

    Each entry is a mapping in one of two forms:

    Explicit:
        - name: "Gencode v47"
          url: /abs/or/relative/path.gff3.gz
          indexURL: /abs/or/relative/path.gff3.gz.tbi   (optional)
          format: gff                                   (optional, default bed)
          displayMode: EXPANDED                         (optional)
          color: "rgb(...)"                             (optional)

    Shortcut (needs top-level `genome:` in spec and a loaded `cfg`):
        - default: gencode        # a key of ANNOTATION_DEFAULTS
          name: "Gencode v47"     # optional override of the canned display name
          color: "rgb(...)"       # optional override of the canned color
          displayMode: COLLAPSED  # optional override

    Shortcut entries are resolved through resolve_annotation_default() for the
    spec's ``genome``; explicit ``url``/``indexURL`` values go through
    abspath_relative_to() with ``run_dir``. An override key that is present
    but empty falls back to the canned value.

    Returns:
        igv.js-style annotation track dicts, in spec order. A missing or null
        ``annotation:`` section yields ``[]``.

    Raises:
        SystemExit: if an entry is not a mapping, a shortcut is used without
            ``genome:``, or an explicit entry lacks ``name`` / ``url``.
    """
    out: list[dict] = []
    genome = spec.get("genome")
    for a in spec.get("annotation") or []:  # `annotation:` with no value parses to None
        if not isinstance(a, dict):
            raise SystemExit(
                f"ERROR: each `annotation:` entry must be a mapping (got {a!r})."
            )
        if "default" in a:
            if not genome:
                raise SystemExit(
                    "ERROR: annotation entry uses `default:` but spec is missing "
                    "top-level `genome:` — add e.g. `genome: hg38` to the YAML."
                )
            resolved = resolve_annotation_default(a["default"], genome, cfg or {})
            track = {
                "name": a.get("name") or resolved["display_name"],
                "url": resolved["url"],
                "format": a.get("format") or resolved["format"],
                "type": "annotation",
                "displayMode": a.get("displayMode") or resolved["displayMode"],
            }
            if "indexURL" in resolved:
                track["indexURL"] = resolved["indexURL"]
            # `a.get("color", fallback)` returns the entry's own value whenever
            # the key exists — including an empty `color:` — which used to emit
            # "color": null. Fall back on any falsy override instead.
            color = a.get("color") or resolved.get("color")
            if color:
                track["color"] = color
            out.append(track)
            continue
        # Explicit-path entry.
        missing = [k for k in ("name", "url") if not a.get(k)]
        if missing:
            raise SystemExit(
                f"ERROR: annotation entry {a!r} is missing required key(s): {', '.join(missing)}."
            )
        track = {
            "name": a["name"],
            "url": abspath_relative_to(a["url"], run_dir),
            "format": a.get("format", "bed"),
            "type": "annotation",
            "displayMode": a.get("displayMode", "EXPANDED"),
        }
        if a.get("indexURL"):
            track["indexURL"] = abspath_relative_to(a["indexURL"], run_dir)
        if a.get("color"):
            track["color"] = a["color"]
        out.append(track)
    return out
def _alignment_index(aln: str) -> str:
    """Return the index path to advertise for alignment file ``aln``.

    CRAM files use ``.crai`` and everything else ``.bai``. The appended form
    (``sample.bam.bai``) is the default; the suffix-replacing form
    (``sample.bai``) is used only when it exists on disk and the appended
    form does not.
    """
    suffix = ".crai" if aln.lower().endswith(".cram") else ".bai"
    appended = aln + suffix
    replaced = str(Path(aln).with_suffix(suffix))
    if not Path(appended).exists() and Path(replaced).exists():
        return replaced
    return appended


def build_sample_tracks(spec: dict, run_dir: Path) -> list[dict]:
    """Build the per-sample tracks from ``spec["samples"]``.

    For each sample, in spec order, up to three tracks are emitted:

    * ``bam`` -> an alignment track named after the sample, with
      BAM_DEFAULTS applied (``format`` switched to ``cram`` for ``.cram``
      inputs) and an ``indexURL`` from _alignment_index().
    * ``bedgraph_5mC`` / ``bedgraph_5hmC`` -> ``"<name> 5mC"`` /
      ``"<name> 5hmC"`` tracks with BEDGRAPH_DEFAULTS applied, colored from
      ``spec["group_colors"][<sample group>]`` when that map has an entry
      for the modification.

    All paths go through abspath_relative_to() with ``run_dir``. Missing or
    null ``samples:`` / ``group_colors:`` sections are treated as empty.

    Raises:
        KeyError: if a sample entry has no ``name``.
    """
    group_colors = spec.get("group_colors") or {}
    out: list[dict] = []
    for s in spec.get("samples") or []:
        name = s["name"]
        group = s.get("group", "default")
        gc = group_colors.get(group) or {}

        # Alignment (per-read basemod2 view).
        if s.get("bam"):
            aln = abspath_relative_to(s["bam"], run_dir)
            track = {"name": name, "url": aln, "indexURL": _alignment_index(aln)}
            track.update(BAM_DEFAULTS)
            if aln.lower().endswith(".cram"):
                track["format"] = "cram"  # BAM_DEFAULTS hard-codes "bam"
            out.append(track)

        # 5mC then 5hmC bedGraph, same construction for both.
        for mod in ("5mC", "5hmC"):
            rel = s.get(f"bedgraph_{mod}")
            if not rel:
                continue
            track = {
                "name": f"{name} {mod}",
                "url": abspath_relative_to(rel, run_dir),
            }
            track.update(BEDGRAPH_DEFAULTS)
            if gc.get(mod):
                track["color"] = gc[mod]
            out.append(track)

    return out
def main() -> None:
    """CLI entry point: read the YAML spec and write the igv.js tracks JSON.

    Annotation tracks come first, then per-sample tracks. The databases YAML
    is loaded only when at least one annotation entry uses the ``default:``
    shortcut. An existing ``--out`` is never overwritten without ``--force``.

    Exit status:
        2 when ``--out`` exists and ``--force`` was not given (as documented
        in ``--force``'s help); 1 (via ``SystemExit`` with a message) for a
        missing spec / run-dir, a spec that is not a YAML mapping, or
        shortcuts used without a db-config.
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--spec", required=True, help="YAML spec (see tracks_spec.example.yaml)")
    ap.add_argument("--run-dir", required=True, help="dir that relative urls in spec are resolved against")
    ap.add_argument("--out", required=True, help="output tracks.json path")
    ap.add_argument("--db-config", default=os.environ.get("IGV_REPORTS_DB_CONFIG"), help=(
        "Databases YAML used to resolve `annotation: - default: <key>` shortcuts "
        "(cgi/gencode/repmasker/epdnew_coding/epdnew_noncoding) for the spec's "
        "`genome:`. Defaults to $IGV_REPORTS_DB_CONFIG. "
        "Not loaded if no shortcut entries appear. The YAML schema is "
        "`reference_genomes.local.<genome>.{CpGIslands,gtf,repMaskerBed,"
        "EPDnewCoding,EPDnewNonCoding}` — see references/databases_config_paths.md."
    ))
    ap.add_argument("--force", action="store_true",
                    help="overwrite --out if it already exists (default: refuse and exit 2 so hand-edits aren't clobbered)")
    args = ap.parse_args()

    spec_path = Path(args.spec)
    if not spec_path.exists():
        raise SystemExit(f"ERROR: spec not found: {spec_path}")
    run_dir = Path(args.run_dir).resolve()
    if not run_dir.exists():
        raise SystemExit(f"ERROR: run-dir not found: {run_dir}")

    with spec_path.open(encoding="utf-8") as fh:
        spec = yaml.safe_load(fh)
    # An empty file parses to None and a bare list to a list; both would fail
    # below with an opaque AttributeError on `.get`.
    if not isinstance(spec, dict):
        raise SystemExit(
            f"ERROR: spec {spec_path} must be a YAML mapping at top level "
            f"(got {type(spec).__name__})."
        )

    # Only load the db-config if any annotation entry uses the shortcut form;
    # specs that hand-paste paths remain self-contained.
    needs_cfg = any(
        isinstance(a, dict) and "default" in a for a in spec.get("annotation") or []
    )
    if needs_cfg:
        if not args.db_config:
            raise SystemExit(
                "ERROR: spec has `default:` annotation shortcuts but --db-config "
                "is not set and $IGV_REPORTS_DB_CONFIG is empty.\n"
                " Pass --db-config /path/to/databases.yaml, or convert the "
                "shortcuts to explicit `url:` entries."
            )
        cfg = load_db_config(Path(args.db_config))
    else:
        cfg = {}

    tracks = build_annotation_tracks(spec, run_dir, cfg) + build_sample_tracks(spec, run_dir)

    out_path = Path(args.out)
    if out_path.exists() and not args.force:
        # SystemExit("message") exits with status 1; the --force help promises
        # status 2, so print the message ourselves and exit explicitly.
        print(
            f"ERROR: {out_path} already exists. A user may have hand-edited it after generation.\n"
            " Pass --force to overwrite, or move the existing file aside and rerun.",
            file=sys.stderr,
        )
        sys.exit(2)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as fh:
        json.dump(tracks, fh, indent=2)
        fh.write("\n")

    print(f"Wrote {len(tracks)} tracks to {out_path}", file=sys.stderr)
#
# Usage:
#   prep_track.sh <input.gz>
#   prep_track.sh <input.gz> --out <output.gz>
#
# In-place mode (default):
#   <input.gz>                     (replaced with new bgzip)
#   <input.gz>.tbi                 (new tabix index)
#   <input.gz>.bak.original_gzip   (backup of the original .gz)
#
# Sibling mode (--out PATH; non-destructive):
#   <input.gz>    (unchanged)
#   <PATH>        (new bgzip — same extension family as input; must end in .gz)
#   <PATH>.tbi    (new tabix index)
#   (no backup created — original is left as-is)

set -euo pipefail

# Timestamped progress line on stdout.
log() { echo "[$(date '+%F %T')] $*"; }

# Portable file size in bytes (`stat -c` is GNU-only).
fsize() { wc -c < "$1" | tr -d '[:space:]'; }

# Print the leading comment header (everything after the shebang up to the
# first non-comment line), so --help never depends on hard-coded line numbers.
usage() { awk 'NR == 1 { next } !/^#/ { exit } { print }' "$0"; }

INPUT=""
OUT=""

# Record the single positional argument; a second one is an error.
set_input() {
  if [[ -n "$INPUT" ]]; then
    echo "ERROR: unexpected positional arg: $1" >&2
    exit 2
  fi
  INPUT=$1
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --out)
      if [[ $# -lt 2 ]]; then
        echo "ERROR: --out requires a path" >&2
        exit 2
      fi
      OUT=$2; shift 2 ;;
    --out=*)
      OUT=${1#--out=}; shift ;;
    -h|--help)
      usage >&2; exit 0 ;;
    --)
      # Everything after `--` is positional (previously it was dropped).
      shift
      for arg in "$@"; do set_input "$arg"; done
      break ;;
    -*)
      echo "ERROR: unknown flag: $1" >&2; exit 2 ;;
    *)
      set_input "$1"; shift ;;
  esac
done

if [[ -z "$INPUT" ]]; then
  echo "Usage: $0 <input.gz> [--out <output.gz>]" >&2
  exit 2
fi
if [[ ! -f "$INPUT" ]]; then
  echo "ERROR: file not found: $INPUT" >&2
  exit 2
fi
if [[ -n "$OUT" ]]; then
  # tabix/igv-reports pick the format from the extension, and the output is
  # always bgzip, so a non-.gz target is never what the caller wants.
  if [[ "$OUT" != *.gz ]]; then
    echo "ERROR: --out must end in .gz (output is bgzip-compressed): $OUT" >&2
    exit 2
  fi
  if [[ -e "$OUT" ]]; then
    echo "ERROR: --out target already exists: $OUT — refusing to overwrite. Move it aside and rerun." >&2
    exit 2
  fi
fi

# Detect format by suffix.
case "$INPUT" in
  *.gff3.gz|*.gff.gz) FMT=gff ;;
  *.gtf.gz) FMT=gff ;;   # tabix preset for GTF is named "gff"
  *.bed.gz|*.bedgraph.gz) FMT=bed ;;
  *) echo "ERROR: unsupported extension: $INPUT (need .gff3.gz, .gtf.gz, .bed.gz, .bedgraph.gz)" >&2; exit 2 ;;
esac

# Need bgzip / tabix plus a handful of POSIX tools.
for tool in bgzip tabix sort gunzip awk file grep wc; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "ERROR: $tool not on PATH. Install htslib and add bgzip/tabix to PATH first." >&2
    exit 2
  fi
done

# Resolve where the final bgzip + .tbi will land. In sibling mode we never
# touch the original. In in-place mode the target IS the original, with a
# backup taken first.
if [[ -n "$OUT" ]]; then
  TARGET=$OUT
  mkdir -p "$(dirname "$TARGET")"
else
  TARGET=$INPUT
fi

# Detect if already bgzip — skip the whole conversion if it is and just
# rebuild the index. (In sibling mode this means: copy + index, leaving
# the original untouched.)
if file "$INPUT" | grep -q "extra field"; then
  if [[ "$TARGET" != "$INPUT" ]]; then
    cp -p "$INPUT" "$TARGET"
    log "$INPUT already bgzip; copied to $TARGET, rebuilding tabix index."
  else
    log "$INPUT is already bgzip; rebuilding tabix index only."
  fi
  rm -f "${TARGET}.tbi"
  tabix -p "$FMT" "$TARGET"
  log "DONE: ${TARGET}.tbi"
  exit 0
fi

# In-place mode: take a backup of the original. In sibling mode no backup is
# needed since the original is never modified.
if [[ "$TARGET" == "$INPUT" ]]; then
  BACKUP="${INPUT}.bak.original_gzip"
  if [[ -f "$BACKUP" ]]; then
    log "backup already exists: $BACKUP — refusing to overwrite. Move it aside and rerun if you want a fresh backup."
  else
    cp -p "$INPUT" "$BACKUP"
    log "backed up to $BACKUP"
  fi
fi

# Scratch space lives next to TARGET (the one directory we must be able to
# write to anyway), so sibling mode never writes into the input's directory,
# and the EXIT trap removes it on success and on failure alike.
WORKDIR=$(mktemp -d "$(dirname "$TARGET")/.prep_track.XXXXXX")
trap 'rm -rf "$WORKDIR"' EXIT
TMP="$WORKDIR/unsorted"
SORTED="$WORKDIR/sorted"
BGZ="$WORKDIR/sorted.gz"

gunzip -c "$INPUT" > "$TMP"
log "decompressed to $TMP ($(fsize "$TMP") bytes)"

# Sort: preserve any # header lines, sort body by chr (column 1) then numeric
# pos (column 4 for GFF/GTF; column 2 for BED).
case "$FMT" in
  gff) POS_COL=4 ;;
  bed) POS_COL=2 ;;
esac

# -S / --parallel are GNU extensions; use them only where sort accepts them.
SORT_OPTS=()
if sort -S 2G --parallel=4 </dev/null >/dev/null 2>&1; then
  SORT_OPTS=(-S 2G --parallel=4)
fi

# grep exits 1 when it selects no lines (no header, or header-only input);
# under `set -e -o pipefail` that must not abort the run. Real errors (>1)
# still propagate. Fields are split on TAB explicitly so that spaces inside a
# column cannot shift the sort keys; LC_ALL=C keeps the ordering byte-wise and
# locale-independent (tabix only needs each contig contiguous and sorted).
{ grep '^#' "$TMP" || [[ $? -eq 1 ]]; } > "$SORTED"
{ grep -v '^#' "$TMP" || [[ $? -eq 1 ]]; } \
  | LC_ALL=C sort -t $'\t' -k1,1 -k"${POS_COL},${POS_COL}n" ${SORT_OPTS[@]+"${SORT_OPTS[@]}"} \
  >> "$SORTED"
log "sorted by chr,pos (col $POS_COL) into $SORTED"

# bgzip into the scratch dir, then rename onto TARGET. Streaming with -c avoids
# creating "${TARGET%.gz}" on disk, which used to clobber an unrelated
# uncompressed sibling of the same name.
bgzip -@ 4 -c "$SORTED" > "$BGZ"
mv -f "$BGZ" "$TARGET"
log "bgzipped: $TARGET ($(fsize "$TARGET") bytes)"

rm -f "${TARGET}.tbi"
tabix -p "$FMT" "$TARGET"
log "indexed: ${TARGET}.tbi ($(fsize "${TARGET}.tbi") bytes)"

# Sanity check: pull the first contig's first 100 kb and confirm tabix returns
# rows. The contig comes from the index listing and awk consumes its whole
# input, so no upstream process is killed by SIGPIPE (which `pipefail` +
# `set -e` would turn into a script failure after all the work succeeded).
FIRST_CONTIG=$(tabix -l "$TARGET" | awk 'NR == 1')
if [[ -n "$FIRST_CONTIG" ]]; then
  N=$(tabix "$TARGET" "${FIRST_CONTIG}:1-100000" | wc -l | tr -d '[:space:]')
  log "sanity: ${FIRST_CONTIG}:1-100000 returns $N row(s)"
fi

if [[ "$TARGET" == "$INPUT" ]]; then
  log "DONE — track ready for igv-reports. Original preserved at $BACKUP"
else
  log "DONE — sibling track ready at $TARGET. Original $INPUT untouched."
fi
+ Regression across create_report versions — flanking/slicing logic + changes silently between releases. + + This verifier closes the gap by re-running `samtools view -c` against + both the source BAM (at generate time) and the embedded slice (at + verify time), then comparing counts. + + Anchor TSV format (`#`-prefixed header, lab BED-output convention): + + #sample track_name track_type chrom start end expected tolerance min max notes + + - `track_type` is `bam` or `bedgraph`; anchor files without the column + are read as `bam`. + - `tolerance` and `min`/`max` are mutually exclusive per row; if `min` + or `max` is non-empty it wins. Blank tolerance falls back to + --tolerance flag default (0.05). + - `expected` is the count from `samtools view -c -F 1536 source.bam + chrom:start-end` at generate time (for `bedgraph` rows: the number of + source data rows overlapping the interval). Generate writes it; verify reads it. + +Subcommands: + generate — walk (sample × region) grid, count reads from source BAMs, + write an anchors.tsv that becomes a regression fixture. + verify — given one HTML + anchors.tsv, decode each anchor's BAM + slice and count it, compare to expected. + verify-cohort — apply `verify` across all HTMLs in a cohort. + +Container resolution (samtools): + 1. --samtools-sif PATH + 2. $SAMTOOLS_SIF env var, then $SAMTOOLS_SIF_DEFAULT env var + 3. `samtools` on PATH + 4. 
# Optional default samtools SIF, read once at import time from
# $SAMTOOLS_SIF_DEFAULT; None when the variable is unset or empty.
_SAMTOOLS_SIF_ENV = os.environ.get("SAMTOOLS_SIF_DEFAULT")
DEFAULT_SAMTOOLS_SIF = Path(_SAMTOOLS_SIF_ENV) if _SAMTOOLS_SIF_ENV else None


def _apptainer_bind_args() -> list[str]:
    """Build the optional `--bind <path>` argv tokens for the container call.

    Source: $IGV_REPORTS_BIND, a colon-separated list of host paths. An empty
    or unset variable yields no tokens. Entries that are empty or do not exist
    on this host are silently skipped.

    Returns:
        A flat list such as ["--bind", "/data", "--bind", "/scratch"].
    """
    raw = os.environ.get("IGV_REPORTS_BIND", "")
    if not raw:
        return []
    tokens: list[str] = []
    for p in raw.split(":"):
        if p and Path(p).exists():
            tokens.extend(["--bind", p])
    return tokens


# `samtools view -F` exclusion mask. 1536 = 0x200 | 0x400, i.e. reads that
# fail platform/vendor QC (0x200) and PCR/optical duplicates (0x400) are not
# counted. Supplementary alignments (0x800) are NOT part of this mask.
# The original note says the value mirrors the igv-reports BamReader default
# (igv_reports/bam.py) — confirm against the installed igv-reports version.
EXCLUDE_FLAGS = "1536"
# Default relative tolerance for |observed - expected| / expected.
DEFAULT_TOLERANCE = 0.05
# Column order of the anchors TSV as written by write_anchors().
ANCHOR_HEADER = [
    "sample", "track_name", "track_type", "chrom", "start", "end",
    "expected", "tolerance", "min", "max", "notes",
]
@dataclasses.dataclass
class AnchorRow:
    """One row of the anchors TSV: the expected count for one track of one
    sample over one interval.

    `start`/`end` are stored exactly as read from the sites BED / anchors TSV.
    The optional numeric fields (`tolerance`, `min_count`, `max_count`) stay
    as strings so that "blank" remains distinguishable from a value; they are
    converted where they are used (see decide_status)."""
    sample: str
    track_name: str
    chrom: str
    start: int
    end: int
    expected: int
    track_type: str = "bam"  # bam | bedgraph; bam keeps backwards compat
    tolerance: str = ""  # blank => fall back to --tolerance flag
    min_count: str = ""  # blank => not used
    max_count: str = ""  # blank => not used
    notes: str = ""

    @property
    def region(self) -> str:
        """Region string `chrom:start-end` built from the stored coordinates.

        NOTE(review): `start` is used verbatim. locate_session_entry() treats
        it as a 0-based BED start (it compares against start+1), while
        samtools region strings are 1-based inclusive — so this string covers
        one extra base upstream. generate and verify build the same string,
        which keeps their counts comparable; confirm whether start+1 is
        intended before changing either side."""
        return f"{self.chrom}:{self.start}-{self.end}"


@dataclasses.dataclass
class AnchorCheck:
    """One output row of a verification run (see write_checks for the TSV
    layout). `observed` and `expected` are strings so they can be left blank
    for SKIP rows and for failures that happen before anything is counted."""
    sample: str
    track_name: str
    region: str
    status: str  # PASS | FAIL | SKIP
    observed: str = ""
    expected: str = ""
    details: str = ""
def samtools_count(samtools_cmd: list[str], bam: Path, region: str) -> int:
    """Run `samtools view -c -F 1536 <bam> <region>` and return the count.

    Args:
        samtools_cmd: argv prefix that invokes samtools (see resolve_samtools).
        bam: alignment file to query; it must already be indexed.
        region: samtools region string, e.g. "chr1:100-200".

    Returns:
        The number of alignments reported by samtools.

    Raises:
        RuntimeError: samtools exited non-zero, or its stdout was not a single
            integer (e.g. empty output, or a container runtime writing a
            banner to stdout). Both cases use RuntimeError so that callers
            which catch only RuntimeError do not crash on unparsable output.
    """
    proc = subprocess.run(
        samtools_cmd + ["view", "-c", "-F", EXCLUDE_FLAGS, str(bam), region],
        capture_output=True, text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(
            f"samtools view -c failed (exit {proc.returncode}) for {bam} {region}: "
            f"{proc.stderr.strip()}"
        )
    out = proc.stdout.strip()
    try:
        return int(out)
    except ValueError:
        raise RuntimeError(
            f"samtools view -c returned non-integer output for {bam} {region}: {out!r}"
        ) from None
# A line in a wig/bedGraph file is either a header/directive (track, browser,
# fixedStep, variableStep, or a `#` comment) or a data row. Only data rows
# are counted.
_WIG_HEADER_PREFIXES = ("track", "browser", "fixedStep", "variableStep", "#")


def _is_wig_data_line(line: str) -> bool:
    """True iff `line` is a non-empty wig/bedGraph data row (not header,
    not comment, not blank)."""
    s = line.strip()
    if not s:
        return False
    if s.startswith(_WIG_HEADER_PREFIXES):
        return False
    return True


def bedgraph_count_source(track_path: Path, chrom: str, start: int, end: int) -> int:
    """Count data rows in `track_path` (bedGraph or wig) overlapping the
    0-based half-open region [start, end) on `chrom`.

    Handles three input shapes:
      - `.gz` file with a sibling `.tbi` and `tabix` on PATH: delegate the
        lookup to `tabix`.
      - Any other `.gz` file: stream-decompress, linear scan.
      - Plain text: linear scan.

    Overlap rule (identical on both paths): row [r_start, r_end) overlaps
    query [q_start, q_end) iff r_start < q_end AND r_end > q_start.

    The linear scan only considers TAB-separated rows with at least three
    columns and integer start/end; anything else is skipped.

    Raises:
        FileNotFoundError: `track_path` does not exist.
        RuntimeError: `tabix` exited non-zero.

    Returns 0 for a region with no overlapping rows."""
    import gzip
    if not track_path.exists():
        raise FileNotFoundError(f"bedGraph track not found: {track_path}")

    tbi = track_path.with_suffix(track_path.suffix + ".tbi")
    if track_path.suffix == ".gz" and tbi.exists() and shutil.which("tabix"):
        # Fast path — tabix-indexed bgzip. tabix region strings are 1-based
        # and inclusive, so the 0-based half-open query [start, end) is
        # `start+1`-`end`. Passing `start` unshifted also returned a row
        # ending exactly at `start`, disagreeing with the linear scan below.
        region = f"{chrom}:{start + 1}-{end}"
        proc = subprocess.run(
            ["tabix", str(track_path), region],
            capture_output=True, text=True,
        )
        if proc.returncode != 0:
            raise RuntimeError(
                f"tabix failed for {track_path} {region}: {proc.stderr.strip()}"
            )
        return sum(1 for ln in proc.stdout.splitlines() if _is_wig_data_line(ln))

    # Slow path — linear scan. Open with gzip if .gz, else text. Filter on
    # chrom first (cheap) before parsing positions.
    opener = gzip.open if track_path.suffix == ".gz" else open
    count = 0
    with opener(track_path, "rt") as fh:
        for line in fh:
            if not _is_wig_data_line(line):
                continue
            cols = line.rstrip("\n").split("\t")
            if len(cols) < 3:
                continue
            if cols[0] != chrom:
                continue
            try:
                r_start = int(cols[1])
                r_end = int(cols[2])
            except ValueError:
                continue
            if r_start < end and r_end > start:
                count += 1
    return count
def write_anchors(anchors: list[AnchorRow], out: Path) -> None:
    """Serialise `anchors` to `out` as a TSV.

    The first line is the `#`-prefixed ANCHOR_HEADER; each following line is
    one anchor with its fields in ANCHOR_HEADER order. Parent directories of
    `out` are created if needed, and an existing file is overwritten."""
    def _as_line(anchor: AnchorRow) -> str:
        fields = (
            anchor.sample, anchor.track_name, anchor.track_type,
            anchor.chrom, str(anchor.start), str(anchor.end),
            str(anchor.expected), anchor.tolerance, anchor.min_count,
            anchor.max_count, anchor.notes,
        )
        return "\t".join(fields)

    out.parent.mkdir(parents=True, exist_ok=True)
    rendered = ["#" + "\t".join(ANCHOR_HEADER)]
    rendered.extend(_as_line(anchor) for anchor in anchors)
    out.write_text("".join(row + "\n" for row in rendered))
def parse_samplesheet(path: Path) -> list[dict]:
    """Read a tab-separated samplesheet into a list of row dicts.

    The first line is the header; leading `#` characters are removed from it.
    Every later line whose first column is non-blank becomes one dict mapping
    header names to cell values (extra cells or extra header names are
    dropped by the pairing). Lines with a blank first column are ignored.

    Written as a local reader rather than importing build_igvreports, so that
    code paths which don't need PyYAML don't pull it in."""
    with path.open() as fh:
        columns = fh.readline().lstrip("#").rstrip("\n").split("\t")
        split_lines = (raw.rstrip("\n").split("\t") for raw in fh)
        # str.split always yields at least one element, so checking the first
        # cell is enough to drop blank lines and rows with no sample id.
        return [
            dict(zip(columns, cells))
            for cells in split_lines
            if cells[0].strip()
        ]
# wig/bedGraph extensions that we count rows for. `.wig` is included because,
# per the original note, igv-reports treats both as "wig" format.
_BEDGRAPH_EXTS = (".bedgraph", ".bedgraph.gz", ".bg", ".bg.gz", ".wig", ".wig.gz")
# Format suffixes removed from the stem once a trailing `.gz` has been dropped.
_BEDGRAPH_FORMAT_SUFFIXES = (".bedgraph", ".bg", ".wig")


def _is_bedgraph(path: str) -> bool:
    """True iff `path` ends in one of _BEDGRAPH_EXTS (case-insensitive)."""
    p = path.lower()
    return any(p.endswith(ext) for ext in _BEDGRAPH_EXTS)


def sample_bedgraph_paths(row: dict) -> list[tuple[str, Path]]:
    """Return [(track_name, bedgraph_path), ...] for the bedGraph/wig
    entries in a row's `extra_tracks` (comma-separated).

    track_name is the file name with its format extension removed:

        foo.bedgraph      -> "foo"
        foo.bedgraph.gz   -> "foo"   (`.gz` and the format suffix both go)
        foo.wig.bedgraph  -> "foo.wig"  (only ONE level for uncompressed files)

    The suffix comparison is case-insensitive, matching _is_bedgraph. Entries
    that are not bedGraph/wig files are ignored; a missing or empty
    `extra_tracks` yields an empty list.

    NOTE(review): the "foo" form is intended to equal the track label
    igv-reports renders — confirm against the installed version."""
    out: list[tuple[str, Path]] = []
    extras = row.get("extra_tracks") or ""
    for entry in extras.split(","):
        entry = entry.strip()
        if not _is_bedgraph(entry):
            continue
        p = Path(entry)
        stem = p.stem
        # Path.stem removes only the last suffix. For a compressed file that
        # was `.gz`, so peel the format suffix as well. Doing this only after
        # a `.gz` keeps names like `foo.wig.bedgraph` from being over-stripped.
        if p.suffix.lower() == ".gz":
            inner = Path(stem)
            if inner.suffix.lower() in _BEDGRAPH_FORMAT_SUFFIXES:
                stem = inner.stem
        out.append((stem, p))
    return out
def decide_status(anchor: AnchorRow, observed: int, default_tol: float) -> tuple[str, str]:
    """Compare `observed` with an anchor's expectation; return (status, details).

    Two modes, chosen per anchor:
      * Bounds mode — used when `min_count` and/or `max_count` is non-blank.
        Each present bound is checked (inclusive) and the tolerance is
        ignored. Details look like "min=5 OK; max=10 FAIL".
      * Ratio mode — otherwise. The row's `tolerance` (or `default_tol` when
        blank) bounds |observed - expected| / expected. An expected count of
        0 passes only when observed is also 0.

    status is "PASS" or "FAIL"."""
    if anchor.min_count or anchor.max_count:
        outcomes: list[tuple[str, str, bool]] = []
        if anchor.min_count:
            outcomes.append(("min", anchor.min_count, observed >= int(anchor.min_count)))
        if anchor.max_count:
            outcomes.append(("max", anchor.max_count, observed <= int(anchor.max_count)))
        verdict = "PASS" if all(passed for _, _, passed in outcomes) else "FAIL"
        summary = "; ".join(
            f"{label}={bound} {'OK' if passed else 'FAIL'}"
            for label, bound, passed in outcomes
        )
        return verdict, summary

    # The tolerance is parsed before the zero check, as in the original, so a
    # malformed value raises regardless of `expected`.
    if anchor.tolerance:
        tol = float(anchor.tolerance)
    else:
        tol = default_tol

    if anchor.expected == 0:
        verdict = "PASS" if observed == 0 else "FAIL"
        return verdict, f"expected=0, observed={observed}"

    diff_ratio = abs(observed - anchor.expected) / anchor.expected
    verdict = "PASS" if diff_ratio <= tol else "FAIL"
    return verdict, f"diff_ratio={diff_ratio:.3f} (tol={tol:.3f})"
def verify_one_html(
    html_path: Path, anchors: list[AnchorRow], samtools_cmd: list[str],
    default_tol: float,
) -> list[AnchorCheck]:
    """Verify all anchors against one HTML and return one AnchorCheck each.

    The caller is trusted to pass only the anchors that belong to this HTML
    (verify-cohort filters by sample before calling).

    Outcome per anchor:
      SKIP — the HTML file is missing, the anchor's region is not in the
             HTML's table, or the named track is not in that region's session.
      FAIL — tableJson / sessionDictionary cannot be parsed, the region's
             session entry is missing or undecodable, the slice cannot be
             decoded or counted, or the count is outside the allowed range.
      PASS — the count is within tolerance or bounds (see decide_status).

    NOTE(review): a missing HTML yields SKIP rows, and --fail-on-fail only
    reacts to FAIL, so a missing report does not fail the run by itself —
    confirm this is intended.

    Args:
        html_path: report HTML to inspect.
        anchors: anchor rows to check against this HTML.
        samtools_cmd: argv prefix for samtools; only used for `bam` anchors.
        default_tol: ratio tolerance for anchors with blank tolerance/min/max.
    """
    checks: list[AnchorCheck] = []
    if not html_path.is_file():
        for a in anchors:
            checks.append(AnchorCheck(
                a.sample, a.track_name, a.region, "SKIP",
                details=f"HTML missing: {html_path}",
            ))
        return checks
    # NOTE(review): read_text() decodes with the locale's default encoding —
    # confirm the HTML is always written in a compatible encoding.
    html_text = html_path.read_text()
    table_json = vr.parse_table_json(html_text)
    session_dict = vr.parse_session_dictionary(html_text)
    if table_json is None or session_dict is None:
        for a in anchors:
            checks.append(AnchorCheck(
                a.sample, a.track_name, a.region, "FAIL",
                details="tableJson or sessionDictionary missing from HTML",
            ))
        return checks
    # Decoded BAM slices are written to a scratch directory that is removed
    # when the block exits.
    with tempfile.TemporaryDirectory(prefix="verify_anchors_") as td:
        tmp = Path(td)
        for a in anchors:
            outcome, session, locate_detail = locate_session_entry(
                session_dict, table_json, a.chrom, a.start, a.end,
            )
            if outcome == "absent":
                # Region never rendered in this HTML: nothing to compare.
                checks.append(AnchorCheck(
                    a.sample, a.track_name, a.region, "SKIP",
                    details=locate_detail,
                ))
                continue
            if outcome == "broken":
                # Region is listed but its session is missing/undecodable.
                checks.append(AnchorCheck(
                    a.sample, a.track_name, a.region, "FAIL",
                    expected=str(a.expected),
                    details=locate_detail,
                ))
                continue
            assert session is not None  # outcome == "ok"
            track = find_track(session, a.track_name)
            if track is None:
                checks.append(AnchorCheck(
                    a.sample, a.track_name, a.region, "SKIP",
                    details=f"track '{a.track_name}' not in HTML session",
                ))
                continue
            url = track.get("url", "")
            if a.track_type == "bedgraph":
                # wig/bedGraph slices are gzip(text) base64-encoded by
                # igv_reports/datauri.py. Count data rows in the embedded
                # slice; no samtools needed.
                # NOTE(review): every data row in the slice is counted, with
                # no filtering to the anchor interval, whereas `expected` was
                # counted over the interval only. If the embedded slice spans
                # more than the interval, observed can exceed expected —
                # confirm how create_report cuts wig slices.
                try:
                    m = _DATA_URL_RE.match(url)
                    if not m:
                        raise ValueError("track url is not a data: base64 URL")
                    raw = base64.b64decode(m.group(1))
                    observed = bedgraph_count_slice(raw)
                except (ValueError, RuntimeError) as e:
                    checks.append(AnchorCheck(
                        a.sample, a.track_name, a.region, "FAIL",
                        expected=str(a.expected),
                        details=f"bedGraph slice decode/count failed: {e}",
                    ))
                    continue
            else:
                # BAM (default). The slice must be written to disk and
                # indexed before samtools can answer a region query on it.
                slice_path = tmp / f"{a.sample}__{a.track_name}__{a.chrom}_{a.start}_{a.end}.bam"
                try:
                    decode_track_slice(url, slice_path)
                    samtools_index(samtools_cmd, slice_path)
                    observed = samtools_count(samtools_cmd, slice_path, a.region)
                except (ValueError, RuntimeError) as e:
                    checks.append(AnchorCheck(
                        a.sample, a.track_name, a.region, "FAIL",
                        expected=str(a.expected),
                        details=f"slice decode/count failed: {e}",
                    ))
                    continue
            status, details = decide_status(a, observed, default_tol)
            checks.append(AnchorCheck(
                a.sample, a.track_name, a.region, status,
                observed=str(observed), expected=str(a.expected),
                details=details,
            ))
    return checks
def write_checks(checks: list[AnchorCheck], out: Path | None) -> None:
    """Render `checks` as a TSV and emit it.

    The table (header row plus one row per check) is always written to
    stdout. When `out` is given it is also written there, creating parent
    directories as needed and overwriting any existing file."""
    columns = ("sample", "track_name", "region", "status", "observed", "expected", "details")
    table = ["sample\ttrack_name\tregion\tstatus\tobserved\texpected\tdetails"]
    table.extend(
        "\t".join(getattr(check, column) for column in columns)
        for check in checks
    )
    text = "".join(row + "\n" for row in table)
    if out:
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(text)
    sys.stdout.write(text)
ap.add_subparsers(dest="cmd", required=True) + + # generate + g = sub.add_parser("generate", help="freeze samtools view -c counts into anchors.tsv") + g.add_argument("--samplesheet", required=True) + g.add_argument("--sites", required=True) + g.add_argument("--out", required=True, help="path to write anchors TSV") + g.add_argument("--samtools-sif", help="explicit samtools SIF path") + g.set_defaults(func=cmd_generate) + + # verify + v = sub.add_parser("verify", help="audit one HTML against anchors.tsv") + v.add_argument("--html", required=True) + v.add_argument("--anchors", required=True) + v.add_argument("--out", help="write checks TSV here in addition to stdout") + v.add_argument("--samtools-sif") + v.add_argument("--tolerance", type=float, default=DEFAULT_TOLERANCE, + help=f"default ratio tolerance when row tolerance/min/max blank (default {DEFAULT_TOLERANCE})") + v.add_argument("--fail-on-fail", action="store_true") + v.set_defaults(func=cmd_verify) + + # verify-cohort + vc = sub.add_parser("verify-cohort", help="audit all HTMLs in a cohort against anchors.tsv") + vc.add_argument("--samplesheet", required=True) + vc.add_argument("--reports-dir", required=True) + vc.add_argument("--genome", required=True) + vc.add_argument("--anchors", required=True) + vc.add_argument("--out") + vc.add_argument("--samtools-sif") + vc.add_argument("--tolerance", type=float, default=DEFAULT_TOLERANCE) + vc.add_argument("--fail-on-fail", action="store_true") + vc.set_defaults(func=cmd_verify_cohort) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/igv-reports/scripts/verify_cohort.py b/igv-reports/scripts/verify_cohort.py new file mode 100755 index 0000000..18b3a07 --- /dev/null +++ b/igv-reports/scripts/verify_cohort.py @@ -0,0 +1,632 @@ +#!/usr/bin/env python3 +"""verify_cohort.py — cohort-level structural verifier for create_report runs. + +Author: Samuel Ahuno +Purpose: + Catches sample-to-HTML mismatches in cohort mode. 
Per-sample verification + (verify_report.py) confirms each HTML is internally consistent, but it + has no notion of WHICH sample an HTML *should* belong to. This verifier + re-reads the samplesheet and cross-checks every HTML against the row that + produced it, plus scans for cross-sample contamination. + +Threat model — failure modes this catches that per-sample verify cannot: + * Wrong BAM embedded under right filename (samplesheet typo, copy-paste). + * Tumor/normal slot swap. + * Missing HTML for a samplesheet row (cohort loop silently skipped). + * Index.html lying — links to a sample that doesn't exist, or omits one. + * Sample-2's BAM accidentally winding up inside sample-1's HTML. + +Checks emitted (per sample, plus two cohort-global rows tagged sample="*"): + Per-sample (delegated to verify_report.py for the structural ones): + * html_exists, html_min_size, region_count, region_coords, + region_sessions, tracks_present -- run verify_report.py against + each sample's HTML using that sample's row as input + Cohort-specific (added here): + C2 sample_tracks_match -- the HTML's session contains every + track basename declared in this row + C3 no_cross_sample_contamination -- the HTML's session contains NO + basename that belongs to another + row's track columns but not this + row (default-track basenames from + databases_config.yaml are excluded) + C4 sample_id_embedded -- the `sample` column value appears in + the HTML's <title> (the filename is + intentionally not checked) + Cohort-global (one row each, sample='*'): + C1 cohort_html_coverage -- every samplesheet sample has exactly + one matching HTML; flag missing+extras + C5 index_consistency -- index.html (if present) links exactly + the samplesheet sample set; each link + target exists and is non-empty + +Output: + TSV with columns: sample / check / status / observed / expected / details + (also printed to stdout). 
Optional --summary <path>.md emits a one-page + rollup: total samples, PASS/FAIL counts per check, contamination incidents + listed by sample. + +Exit code: 0, or 1 if --fail-on-fail is set and any row is FAIL. + +Typical use (auto-invoked by build_igvreports.py --samplesheet, but can be +run standalone too): + + python verify_cohort.py \\ + --samplesheet samplesheet.tsv \\ + --reports-dir results/<run>/reports/ \\ + --genome hg38 \\ + --out results/<run>/reports/cohort_verify.tsv \\ + --summary results/<run>/reports/cohort_verify.summary.md \\ + --fail-on-fail + +Skill location: + <repo-root>/ +""" + +from __future__ import annotations + +import argparse +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path + +# Same-dir imports — both verify_report.py and build_igvreports.py live here. +sys.path.insert(0, str(Path(__file__).resolve().parent)) +import verify_report as vr +import build_igvreports as bir + + +_DBCONFIG_ENV = os.environ.get("IGV_REPORTS_DB_CONFIG") +DEFAULT_DBCONFIG = Path(_DBCONFIG_ENV) if _DBCONFIG_ENV else None +DEFAULT_TRACK_COLUMNS = ["bam_tumor", "bam_normal", "vcf", "extra_tracks"] + + +@dataclass +class CohortCheck: + sample: str # "*" for cohort-global checks + name: str + status: str # PASS | FAIL | SKIP + observed: str = "" + expected: str = "" + details: str = "" + + +# --------------------------------------------------------------------------- +# Samplesheet inspection +# --------------------------------------------------------------------------- + +def row_track_paths(row: dict, track_columns: list[str]) -> list[str]: + """Extract all track paths from a samplesheet row. 
Honors `extra_tracks` + being a comma-separated list (per build_igvreports.py convention).""" + paths: list[str] = [] + for col in track_columns: + val = row.get(col) + if not val or not val.strip(): + continue + if col == "extra_tracks": + paths.extend(p.strip() for p in val.split(",") if p.strip()) + else: + paths.append(val.strip()) + return paths + + +def track_labels_of(paths: list[str]) -> set[str]: + """Return the names igv-reports auto-assigns to positional --tracks for + these paths. igv-reports strips ONE final suffix (verified against + create_report 1.16.2 — see verify_report.expected_track_labels).""" + return {Path(p).stem for p in paths} + + +# --------------------------------------------------------------------------- +# Cohort-global checks (C1, C5) +# --------------------------------------------------------------------------- + +def check_html_coverage(rows: list[dict], reports_dir: Path, genome: str) -> CohortCheck: + expected_files = {f"{r['sample']}.{genome}.html" for r in rows} + actual_files = {p.name for p in reports_dir.glob(f"*.{genome}.html")} + missing = sorted(expected_files - actual_files) + extras = sorted(actual_files - expected_files) + if not missing and not extras: + return CohortCheck( + "*", "cohort_html_coverage", "PASS", + observed=f"{len(actual_files)} HTMLs", + expected=f"{len(expected_files)} HTMLs", + ) + details = [] + if missing: + details.append(f"missing: {', '.join(missing[:5])}" + (" ..." if len(missing) > 5 else "")) + if extras: + details.append(f"unexpected: {', '.join(extras[:5])}" + (" ..." 
def check_index_consistency(rows: list[dict], reports_dir: Path) -> CohortCheck:
    """C5: compare the links in ``index.html`` with the samplesheet samples.

    Returns SKIP when ``reports_dir/index.html`` does not exist. Otherwise
    every ``<a href="...">label</a>`` is read as (target, sample id) and the
    check FAILs when a samplesheet sample has no link, a link label is not a
    samplesheet sample, or a link target is not a regular file of at least
    1024 bytes. If a label occurs more than once, the last href wins.

    Args:
        rows: Parsed samplesheet rows; only the ``sample`` key is read.
        reports_dir: Directory holding ``index.html`` and the link targets.
    """
    def _preview(items: list[str], limit: int = 5) -> str:
        # Mark truncation the same way check_html_coverage does, so a reader
        # can tell a 5-item list from a longer one that was cut off.
        return ", ".join(items[:limit]) + (" ..." if len(items) > limit else "")

    index = reports_dir / "index.html"
    if not index.exists():
        return CohortCheck(
            "*", "index_consistency", "SKIP",
            details=f"no {index.name} present (cohort write_index() not invoked)",
        )
    # Decode explicitly so parsing does not depend on the process locale;
    # undecodable bytes are replaced rather than aborting the whole audit.
    text = index.read_text(encoding="utf-8", errors="replace")
    # build_igvreports.write_index() emits <li><a href="<file>">SAMPLE</a></li>.
    # Match <a href="..."> ... </a> and pull both the href and the link text.
    found: dict[str, str] = {}  # sample -> href
    for m in re.finditer(r'<a href="([^"]+)">([^<]+)</a>', text):
        href, label = m.group(1), m.group(2).strip()
        found[label] = href

    expected_samples = {r["sample"] for r in rows}
    indexed_samples = set(found)
    missing = sorted(expected_samples - indexed_samples)
    extras = sorted(indexed_samples - expected_samples)
    broken_links = []
    for sample, href in found.items():
        target = reports_dir / href
        # is_file(), not exists(): an href that resolves to a directory
        # (e.g. "./") must count as broken, not as a valid report.
        if not target.is_file() or target.stat().st_size < 1024:
            broken_links.append(f"{sample}->{href}")

    if not missing and not extras and not broken_links:
        return CohortCheck(
            "*", "index_consistency", "PASS",
            observed=f"{len(found)} links",
            expected=f"{len(expected_samples)} samples",
        )
    details = []
    if missing:
        details.append(f"missing from index: {_preview(missing)}")
    if extras:
        details.append(f"unexpected in index: {_preview(extras)}")
    if broken_links:
        details.append(f"broken: {_preview(broken_links)}")
    return CohortCheck(
        "*", "index_consistency", "FAIL",
        observed=f"{len(found)} links",
        expected=f"{len(expected_samples)} samples",
        details="; ".join(details),
    )
def session_track_names(html_path: Path) -> set[str]:
    """Return the track names of the first sessionDictionary entry.

    "First" means the lexicographically smallest key of the embedded
    sessionDictionary; only that one session is decoded.

    Returns an empty set when the HTML is missing or unreadable, when the
    sessionDictionary is absent or is not valid JSON, when the entry cannot
    be decoded, or when the decoded session has no usable ``tracks`` list.
    """
    if not html_path.is_file():
        return set()
    try:
        sessions = vr.parse_session_dictionary(html_path.read_text())
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, the
        # two ways a damaged HTML breaks parsing. Honour the "empty set on
        # any decode failure" contract instead of aborting the cohort run.
        return set()
    if not isinstance(sessions, dict) or not sessions:
        return set()
    session = vr.decode_session_entry(sessions[min(sessions)])
    if not isinstance(session, dict):
        return set()
    tracks = session.get("tracks")
    if not isinstance(tracks, list):
        return set()
    # Skip entries that are not objects or carry no (non-empty) name.
    return {t["name"] for t in tracks if isinstance(t, dict) and t.get("name")}
def check_sample_id_embedded(sample: str, html_path: Path) -> CohortCheck:
    """C4: the sample id appears in the HTML's embedded <title>.

    Filename is intentionally NOT checked. The filename is what the cohort
    loop named the file; the title is what `create_report --title` baked
    INTO the HTML at render time. For swap detection, only the title is a
    real signal — a copy-paste of sample_2.html over sample_1.html leaves
    the filename as `sample_1.hg38.html` but the title still says
    `sample_2 (hg38)`. Build_igvreports.py's default title pattern is
    `<sample> (<genome>)`, so this works out of the box.

    The id must appear as a whole token: it may not be directly preceded or
    followed by a letter, digit, underscore or hyphen. A plain substring
    test would let sample `S1` pass on a report titled `S10 (hg38)`, which
    is exactly the kind of swap this check exists to catch.

    If --title is overridden and omits the sample id, this check will FAIL
    — which is the right behavior for a verifier that doesn't know the
    user's intent.

    Returns SKIP when the HTML is missing or no <title> is found in the
    first 16384 characters, otherwise PASS or FAIL.
    """
    if not html_path.is_file():
        return CohortCheck(sample, "sample_id_embedded", "SKIP", details="HTML missing")
    # Read just the head so we don't load a multi-megabyte report for one
    # string; read(n) stops after n characters instead of slicing the file.
    with html_path.open(encoding="utf-8", errors="replace") as fh:
        head = fh.read(16384)
    m = re.search(r"<title>([^<]*)", head, flags=re.IGNORECASE)
    if not m:
        return CohortCheck(
            sample, "sample_id_embedded", "SKIP",
            details="no <title> tag in HTML head; cannot verify",
        )
    title = m.group(1)
    # re.escape: sample ids may contain regex metacharacters such as '.' or '+'.
    token = re.compile(rf"(?<![\w-]){re.escape(sample)}(?![\w-])")
    if token.search(title):
        return CohortCheck(sample, "sample_id_embedded", "PASS",
                           observed=f"in <title>: {title!r}")
    return CohortCheck(
        sample, "sample_id_embedded", "FAIL",
        observed=f"title={title!r}",
        details=f"sample id {sample!r} not in <title> — likely a swap or wrong --title",
    )
def check_pngs_exist_and_nonempty(
    sample: str, manifest: Path, min_size_kb: float = 10.0,
) -> CohortCheck:
    """P2 — every PNG path in the manifest must exist and be larger than the
    threshold. igver can produce a near-empty file on a region with no data
    in any track; we want those flagged rather than silently shipped.

    Args:
        sample: Sample id recorded on the returned check row.
        manifest: PNG manifest; each data row's ``png_path`` column is used.
        min_size_kb: Minimum acceptable PNG size, in units of 1024 bytes.

    Returns:
        FAIL if the manifest cannot be read, or if any PNG is missing (or is
        not a regular file) or is below the threshold; PASS otherwise.
    """
    try:
        rows = _parse_png_manifest(manifest)
    except Exception as e:
        return CohortCheck(sample, "pngs_exist_and_nonempty", "FAIL",
                           details=f"manifest unreadable: {e}")
    floor_bytes = min_size_kb * 1024
    missing: list[str] = []
    tiny: list[str] = []
    for r in rows:
        raw_path = r.get("png_path") or ""
        p = Path(raw_path)
        # A blank/absent png_path becomes Path("."), which exists() — so test
        # for a regular file and treat a blank entry as missing.
        if not raw_path or not p.is_file():
            missing.append(p.name or "<blank png_path>")
            continue
        size = p.stat().st_size
        if size < floor_bytes:
            tiny.append(f"{p.name} ({size} B)")
    if not missing and not tiny:
        return CohortCheck(sample, "pngs_exist_and_nonempty", "PASS",
                           observed=f"{len(rows)} pngs all present and >= {min_size_kb:.1f} kB")
    parts = []
    if missing:
        parts.append(f"missing: {missing[:3]}{'...' if len(missing) > 3 else ''}")
    if tiny:
        parts.append(f"below threshold: {tiny[:3]}{'...' if len(tiny) > 3 else ''}")
    return CohortCheck(sample, "pngs_exist_and_nonempty", "FAIL",
                       observed=f"missing={len(missing)} tiny={len(tiny)}",
                       # Report the threshold actually applied; with the
                       # default of 10.0 this still renders as "10 kB".
                       expected=f"all PNGs present, >= {min_size_kb:g} kB",
                       details="; ".join(parts))
This is + the audit-trail check: a user clicking row N in the HTML should be able to + find the PNG named in manifest row N.""" + try: + rows = _parse_png_manifest(manifest) + except Exception as e: + return CohortCheck(sample, "png_html_row_alignment", "FAIL", + details=f"manifest unreadable: {e}") + if not rows: + return CohortCheck(sample, "png_html_row_alignment", "FAIL", + details="manifest has no data rows") + html_resolved = str(html_path.resolve()) + wrong_html = [r for r in rows if r.get("html_path") != html_resolved] + try: + indices = [int(r["html_table_row"]) for r in rows] + except (KeyError, ValueError) as e: + return CohortCheck(sample, "png_html_row_alignment", "FAIL", + details=f"manifest html_table_row malformed: {e}") + expected_indices = list(range(1, len(rows) + 1)) + if wrong_html: + return CohortCheck(sample, "png_html_row_alignment", "FAIL", + details=f"{len(wrong_html)} manifest rows reference a different HTML") + if indices != expected_indices: + return CohortCheck(sample, "png_html_row_alignment", "FAIL", + observed=f"{indices[:5]}{'...' if len(indices) > 5 else ''}", + expected=f"contiguous 1..{len(rows)}", + details="html_table_row indices not contiguous") + return CohortCheck(sample, "png_html_row_alignment", "PASS", + observed=f"{len(rows)} aligned rows") + + +def resolve_default_track_labels(db_config: Path, genome: str) -> set[str]: + """Reuse the driver's logic so the allow-list stays in sync with what was + actually loaded. 
def write_summary(checks: list[CohortCheck], rows: list[dict], out: Path) -> None:
    """Write a one-page Markdown rollup of the cohort verification.

    The page lists the sample count, overall PASS/FAIL/SKIP totals, a
    per-check totals table (sorted by check name) and a table of every FAIL
    row. Parent directories of ``out`` are created as needed.

    Args:
        checks: Result records exposing ``sample``, ``name``, ``status``,
            ``observed``, ``expected`` and ``details``.
        rows: Parsed samplesheet rows; only their count is used.
        out: Destination Markdown file.
    """
    def _cell(value: object) -> str:
        # A literal '|' or a line break inside a cell would break the
        # Markdown table layout, so escape / flatten them.
        return (str(value).replace("|", "\\|").replace("\r\n", " ")
                          .replace("\n", " ").replace("\r", " "))

    by_status = {"PASS": 0, "FAIL": 0, "SKIP": 0}
    by_check: dict[str, dict[str, int]] = {}
    fail_rows = []
    for c in checks:
        by_status[c.status] = by_status.get(c.status, 0) + 1
        # Tolerate statuses other than PASS/FAIL/SKIP here as well; indexing
        # the per-check dict directly raised KeyError for them.
        per_check = by_check.setdefault(c.name, {"PASS": 0, "FAIL": 0, "SKIP": 0})
        per_check[c.status] = per_check.get(c.status, 0) + 1
        if c.status == "FAIL":
            fail_rows.append(c)

    n_samples = len(rows)
    lines = [
        "# Cohort verification summary\n",
        f"- samples: **{n_samples}**",
        f"- total checks: {sum(by_status.values())} (PASS={by_status['PASS']}, FAIL={by_status['FAIL']}, SKIP={by_status['SKIP']})",
        "",
        "## Per-check totals",
        "",
        "| check | PASS | FAIL | SKIP |",
        "|---|---:|---:|---:|",
    ]
    for check_name in sorted(by_check):
        s = by_check[check_name]
        lines.append(f"| {_cell(check_name)} | {s['PASS']} | {s['FAIL']} | {s['SKIP']} |")
    lines.append("")
    if fail_rows:
        lines.append("## Failures")
        lines.append("")
        lines.append("| sample | check | observed | expected | details |")
        lines.append("|---|---|---|---|---|")
        for c in fail_rows:
            lines.append(
                f"| {_cell(c.sample)} | {_cell(c.name)} | {_cell(c.observed)} "
                f"| {_cell(c.expected)} | {_cell(c.details)} |"
            )
    else:
        lines.append("## Failures\n\nNone — cohort verified clean.\n")
    out.parent.mkdir(parents=True, exist_ok=True)
    # The page contains a non-ASCII dash, so do not rely on the locale encoding.
    out.write_text("\n".join(lines) + "\n", encoding="utf-8")
" + "`extra_tracks` is parsed comma-separated if present.", + ) + ap.add_argument("--min-size-mb", type=float, default=0.5, help="per-sample HTML min size (passed through to verify_report)") + ap.add_argument("--png-min-size-kb", type=float, default=10.0, + help="PNG min size threshold (only used when --also-png manifests are present). " + "Defaults to 10 KB — empty IGV screenshots are typically <2 KB, " + "useful ones >= 50 KB.") + ap.add_argument("--out", help="write the TSV report here in addition to stdout") + ap.add_argument("--summary", help="write a one-page markdown rollup here") + ap.add_argument("--fail-on-fail", action="store_true", help="exit nonzero if any check is FAIL") + args = ap.parse_args() + + samplesheet = Path(args.samplesheet) + reports_dir = Path(args.reports_dir) + if not samplesheet.exists(): + raise SystemExit(f"ERROR: samplesheet not found: {samplesheet}") + if not reports_dir.is_dir(): + raise SystemExit(f"ERROR: reports-dir not found: {reports_dir}") + + rows = bir.parse_samplesheet(samplesheet) + if not rows: + raise SystemExit(f"ERROR: samplesheet has no data rows: {samplesheet}") + + # When neither --db-config nor $IGV_REPORTS_DB_CONFIG is set, skip the + # contamination allow-list (no false positives, no false negatives — we + # just can't claim a track is a "known annotation" without the YAML). + allow_list = (resolve_default_track_labels(Path(args.db_config), args.genome) + if args.db_config else set()) + + # Pre-compute track-label sets per sample for the contamination check. + # Labels are Path.stem of each track path, matching igv-reports's auto- + # naming (see track_labels_of). 
+ per_sample_labels: dict[str, set[str]] = { + r["sample"]: track_labels_of(row_track_paths(r, args.track_columns)) for r in rows + } + all_labels = set().union(*per_sample_labels.values()) if per_sample_labels else set() + + checks: list[CohortCheck] = [] + # C1 cohort_html_coverage + checks.append(check_html_coverage(rows, reports_dir, args.genome)) + + # Per-sample: 6 structural (verify_report) + C2 + C3 + C4 + for r in rows: + sample = r["sample"] + html_path = reports_dir / f"{sample}.{args.genome}.html" + sites_path = Path(r["sites_bed"]) + tracks = row_track_paths(r, args.track_columns) + + checks.extend(per_sample_structural(sample, html_path, sites_path, tracks, args.min_size_mb)) + checks.append(check_sample_tracks_match(sample, html_path, tracks)) + + this_labels = per_sample_labels[sample] + other_labels = all_labels - this_labels + checks.append(check_no_cross_sample_contamination(sample, html_path, this_labels, other_labels, allow_list)) + checks.append(check_sample_id_embedded(sample, html_path)) + + # PNG-side checks fire only when build_igvreports.py was run with + # --also-png (detected via the per-sample manifest). Cohorts without + # PNGs see no extra rows; cohorts with PNGs get three more checks. 
+ manifest = find_png_manifest(reports_dir, sample, args.genome) + if manifest is not None: + checks.append(check_png_count_matches_bed(sample, manifest, sites_path)) + checks.append(check_pngs_exist_and_nonempty(sample, manifest, args.png_min_size_kb)) + checks.append(check_png_html_row_alignment(sample, manifest, html_path)) + + # C5 index_consistency + checks.append(check_index_consistency(rows, reports_dir)) + + out_path = Path(args.out) if args.out else None + write_tsv(checks, out_path) + if args.summary: + write_summary(checks, rows, Path(args.summary)) + + if args.fail_on_fail and any(c.status == "FAIL" for c in checks): + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/igv-reports/scripts/verify_report.py b/igv-reports/scripts/verify_report.py new file mode 100755 index 0000000..744920e --- /dev/null +++ b/igv-reports/scripts/verify_report.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python3 +"""verify_report.py — post-render structural verifier for create_report HTMLs. + +Author: Samuel Ahuno +Purpose: + Validates that a self-contained create_report HTML actually contains what + its inputs declared. Catches the "silent garbage" failure mode where the + HTML builds (exit 0, plausible file size) but the content doesn't match the + user's intent: wrong region count, wrong coordinates, missing tracks, or a + catastrophic empty render. + +Dual role: + - CLI: `python verify_report.py --html ... --sites ... [--track-config ...]` + - Library: importable helpers (parse_table_json, parse_session_dictionary, + decode_session_entry, load_sites_bed, expected_track_labels, the + `check_*` functions, and the Check dataclass). verify_cohort.py imports + these to do per-sample checks + add cross-sample assertions. + +Checks emitted (one TSV row per check, ordered): + 1. html_exists Output file is a regular file. + 2. html_min_size Output >= --min-size-mb (default 0.5 MB). + 3. region_count tableJson rows count == sites BED data-row count. + 4. 
region_coords Each BED row finds a matching (chrom, start+1, end[, name]) + in the embedded tableJson. BED is 0-based half-open; + create_report stores 1-based start in the table. + 5. region_sessions sessionDictionary has an entry for each tableJson row. + 6. tracks_present For --track-config <json>: each track's `name` field + appears in the decoded session's tracks[].name list. + For --tracks <path...>: each path's Path.stem appears + in the decoded session's tracks[].name list. igv- + reports strips ONE final suffix when auto-naming + positional tracks (e.g. `x.5mC.bedgraph` -> `x.5mC`, + `gencode.v47.annotation.gff3.gz` -> `gencode.v47. + annotation.gff3`). Skipped if neither flag is given. + NOTE: --standalone embeds slices as data: URLs, so + original URL paths are absent from the session — we + match on track NAMES, which are preserved. + +Output: + TSV with columns: check / status / observed / expected / details + status is one of PASS / FAIL / SKIP. + +Exit code: + 0 always, unless --fail-on-fail is set and at least one row is FAIL. 
def load_sites_bed(path: Path) -> list[dict]:
    """Parse a sites BED file into ``{chrom, start, end, name}`` records.

    Empty lines, lines starting with ``#`` and lines starting with
    ``track `` are ignored. Columns are tab-separated; ``start`` and ``end``
    are converted to ``int`` and ``name`` is the 4th column, or ``None``
    when the row has only three columns.

    Raises:
        SystemExit: with a ``<path>:<line>:`` prefixed message when a data
            row has fewer than 3 columns or a non-numeric start/end.
    """
    records: list[dict] = []
    with path.open() as handle:
        for lineno, raw in enumerate(handle, start=1):
            text = raw.rstrip("\n")
            if text == "" or text.startswith(("#", "track ")):
                continue
            fields = text.split("\t")
            if len(fields) < 3:
                raise SystemExit(f"{path}:{lineno}: BED row has <3 columns")
            try:
                begin, stop = int(fields[1]), int(fields[2])
            except ValueError as e:
                raise SystemExit(f"{path}:{lineno}: non-numeric start/end: {e}")
            label = fields[3] if len(fields) > 3 else None
            records.append({"chrom": fields[0], "start": begin, "end": stop, "name": label})
    return records
def expected_track_labels(tracks: list[str] | None, track_config: Path | None) -> list[str]:
    """Return the track NAMES we expect to see in the embedded igv.js session.

    `--standalone` replaces every track URL with an inlined `data:...` URL after
    slicing, so URL paths are unrecoverable from the embedded session — we have
    to match on track names instead, which the standalone build preserves.

    - For --track-config <json>: use the `name` field of each entry verbatim.
      Entries that are not JSON objects, or that have no (truthy) `name`, are
      ignored. A top-level object is treated as a single track entry.
    - For positional --tracks <path...>: use Path(p).stem (igv-reports strips
      ONE final suffix when auto-naming positional tracks — verified 2026-05-16
      against create_report 1.16.2: `colo829bl_PAU59807.5mC.bedgraph` ->
      `colo829bl_PAU59807.5mC`, `gencode.v47.annotation.gff3.gz` ->
      `gencode.v47.annotation.gff3`, `x.bam` -> `x`).

    An existing track config always wins over `tracks`. If `track_config` is
    given but the file does not exist, a warning is written to stderr and the
    function falls back to `tracks`, so a mistyped path is visible instead of
    silently turning the tracks check into a SKIP.

    Args:
        tracks: positional track paths, or None/empty.
        track_config: path to the track-config JSON, or None.

    Returns:
        Expected track names in input order. Empty list means 'check skipped'.

    Raises:
        json.JSONDecodeError: if the track config exists but is not valid JSON.
    """
    if track_config is not None:
        if track_config.exists():
            with track_config.open() as fh:
                cfg = json.load(fh)
            # Normalise to a list so a single-object config does not get
            # iterated key by key.
            entries = cfg if isinstance(cfg, list) else [cfg]
            return [
                entry["name"]
                for entry in entries
                if isinstance(entry, dict) and entry.get("name")
            ]
        # stdout carries the TSV report, so diagnostics go to stderr.
        print(
            f"WARNING: --track-config {track_config} does not exist; "
            "expected track names cannot be read from it",
            file=sys.stderr,
        )
    return [Path(t).stem for t in (tracks or [])]
def check_region_coords(bed_rows: list[dict], table_json: dict | None) -> Check:
    """Verify that every sites-BED row has a matching row in the embedded tableJson.

    A BED row matches on (chrom, start + 1, end): BED is 0-based half-open,
    while the HTML table stores a 1-based start. When the table has a `Name`
    column and the BED row carries a name, that name must also be present
    among the table rows at those coordinates.

    Args:
        bed_rows: region records with keys `chrom`, `start`, `end`, `name`.
        table_json: parsed tableJson (reads `headers` and `rows`), or None when
            it was not found in the HTML.

    Returns:
        A `region_coords` Check. PASS when every BED row matched; FAIL when
        tableJson is missing, a required header is absent, a table row is
        malformed, or any BED row has no match (up to five listed in `details`).
    """
    if table_json is None:
        return Check("region_coords", "FAIL", details="tableJson not found")
    headers = table_json.get("headers", [])
    rows = table_json.get("rows", [])
    try:
        col_chrom = headers.index("Chrom")
        col_start = headers.index("Start")
        col_end = headers.index("End")
        col_name = headers.index("Name") if "Name" in headers else None
    except ValueError as e:
        return Check("region_coords", "FAIL", details=f"missing column in tableJson headers: {e}")

    # Coordinates -> every name seen at those coordinates. Several regions can
    # share the same coordinates; keeping only one name per key would report a
    # false "name mismatch" for all but the last of them.
    html_names: dict[tuple, list] = {}
    for r in rows:
        try:
            key = (r[col_chrom], int(r[col_start]), int(r[col_end]))
            name = r[col_name] if col_name is not None else None
            html_names.setdefault(key, []).append(name)
        except (IndexError, KeyError, TypeError, ValueError) as e:
            # A truncated or corrupted table is a verification failure, not a crash.
            return Check("region_coords", "FAIL", details=f"malformed tableJson row {r!r}: {e}")

    misses: list[str] = []
    for b in bed_rows:
        region = f"{b['chrom']}:{b['start']}-{b['end']}"
        names = html_names.get((b["chrom"], b["start"] + 1, b["end"]))
        if names is None:
            misses.append(region)
            continue
        # If both have a name, the BED name must be one of the HTML names.
        if col_name is not None and b["name"] is not None and b["name"] not in names:
            shown = names[0] if len(names) == 1 else names
            misses.append(f"{region} name mismatch (BED={b['name']!r}, HTML={shown!r})")
    if misses:
        return Check(
            "region_coords", "FAIL",
            observed=f"{len(bed_rows) - len(misses)}/{len(bed_rows)} matched",
            expected=f"{len(bed_rows)}/{len(bed_rows)} matched",
            details="; ".join(misses[:5]) + (" ..." if len(misses) > 5 else ""),
        )
    return Check("region_coords", "PASS", observed=f"{len(bed_rows)}/{len(bed_rows)} matched")
def write_tsv(checks: list[Check], out: Path | None) -> None:
    """Render the checks as a TSV report and emit it.

    The report has a header row (check / status / observed / expected /
    details) followed by one row per check. It is always written to stdout;
    when `out` is given it is also written there, creating parent directories
    as needed.

    Tabs, carriage returns and newlines inside a cell are replaced by a single
    space each, so that free-text values (e.g. track names or parser error
    messages placed in `details`) cannot shift columns or split a row.

    Args:
        checks: objects exposing `name`, `status`, `observed`, `expected`
            and `details`.
        out: optional destination file for the report.
    """
    flatten = str.maketrans({"\t": " ", "\r": " ", "\n": " "})
    lines = ["check\tstatus\tobserved\texpected\tdetails"]
    for c in checks:
        cells = (c.name, c.status, c.observed, c.expected, c.details)
        lines.append("\t".join(str(cell).translate(flatten) for cell in cells))
    text = "\n".join(lines) + "\n"
    if out:
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(text)
    # Always also emit to stdout for piping / inspection.
    sys.stdout.write(text)
+ if checks[0].status == "FAIL": + checks.append(Check("html_min_size", "SKIP", details="HTML missing")) + checks.append(Check("region_count", "SKIP", details="HTML missing")) + checks.append(Check("region_coords", "SKIP", details="HTML missing")) + checks.append(Check("region_sessions", "SKIP", details="HTML missing")) + checks.append(Check("tracks_present", "SKIP", details="HTML missing")) + write_tsv(checks, out_path) + if args.fail_on_fail: + sys.exit(1) + return + + checks.append(check_html_min_size(html_path, args.min_size_mb)) + + html_text = html_path.read_text() + table_json = parse_table_json(html_text) + session_dict = parse_session_dictionary(html_text) + bed_rows = load_sites_bed(sites_path) + + checks.append(check_region_count(bed_rows, table_json)) + checks.append(check_region_coords(bed_rows, table_json)) + checks.append(check_region_sessions(table_json, session_dict)) + checks.append(check_tracks_present(session_dict, expected_track_labels(args.tracks, track_config))) + + write_tsv(checks, out_path) + + if args.fail_on_fail and any(c.status == "FAIL" for c in checks): + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/igv-reports/tests/.gitignore b/igv-reports/tests/.gitignore new file mode 100644 index 0000000..ae835ac --- /dev/null +++ b/igv-reports/tests/.gitignore @@ -0,0 +1,11 @@ +# pytest + Python cache directories +__pycache__/ +.pytest_cache/ + +# Integration scratch outputs (per scenarios.sh trap cleanup, these only +# linger on test failure or when KEEP_REPORTS=1) +integration/*/reports/ +integration/*/samplesheet*.tsv +integration/*/sites*.bed +integration/*/anchors*.tsv +integration/*/logs/ diff --git a/igv-reports/tests/fixtures/README.md b/igv-reports/tests/fixtures/README.md new file mode 100644 index 0000000..1c0078b --- /dev/null +++ b/igv-reports/tests/fixtures/README.md @@ -0,0 +1,51 @@ +# tests/fixtures + +Committed test fixtures derived from publicly released bioinformatics data. 
+Used by the smoke + integration test layers; safe to redistribute. + +## tiny_colo829.hg38.bam + +A 457 KB BAM (+ 85 KB `.bai`) sliced from Oxford Nanopore Technologies' +publicly released COLO829BL (matched normal) ONT reference dataset. + +| Property | Value | +|---|---| +| **Source dataset** | ONT COLO829 / COLO829BL R10.4.1 5kHz sup basecalls | +| **ENA project** | PRJEB57425 | +| **Source flowcell** | PAU59807 (COLO829BL) | +| **Basecaller** | Dorado, model `dna_r10.4.1_e8.2_400bps_sup@v5.0.0`, `5mCG_5hmCG@latest,6mA@latest` | +| **Reference** | hg38 (`Homo_sapiens_assembly38.fasta`) | +| **Slice regions** | `chr2:25245000-25248000` (around DNMT3A), `chr7:148882000-148886000` (around EZH2) | +| **Subsample** | 20% reads, seed 42 (`samtools view --subsample 0.2 --subsample-seed 42`) | +| **Filtering** | `-F 1536` (drops QC-failed reads, flag 0x200, and PCR/optical duplicates, flag 0x400; supplementary alignments, flag 0x800, are kept — matches igv-reports' BamReader default) | +| **License** | The source data is openly released by ONT; this slice inherits that status. Slicing/subsampling is a non-creative transformation. | + +## Anchor sanity counts (used by smoke + integration tests) + +| Region | `samtools view -c -F 1536` | +|---|---| +| `chr2:25246500-25246501` | **5** | +| `chr7:148884000-148884001` | **9** | + +These counts are the contract: any change to the fixture (regeneration with +different params, etc.) must preserve these exact integers, or update the +constants in `tests/smoke/test_slice_count.py` and the integration `scenarios.sh`. + +## Regenerate + +```bash +bash tests/fixtures/build_fixtures.sh +``` + +Requires `samtools` (via PATH or `$SAMTOOLS_SIF`) and a local copy of the ONT +COLO829 release pointed to by `$COLO829BL_BAM`. Public source: ENA project +PRJEB57425. + +## Why these regions + +The two sites are coding mutations in well-known cancer driver genes +(DNMT3A R882, EZH2 Y646) at coordinates the demos already use. 
#!/usr/bin/env bash
# build_fixtures.sh — regenerate tests/fixtures/tiny_colo829.hg38.bam from
# the publicly released ONT COLO829BL reads.
#
# The output BAM is committed to the repo (it's small public data — see
# fixtures/README.md). Regenerate only when you need to expand the slice
# regions, change subsample rate, or update for a new basecaller version.
# If the output counts change, also update tests/smoke/test_slice_count.py
# anchor constants and any integration scenarios.sh expected values.
#
# Environment:
#   COLO829BL_BAM     (required) path to the source BAM
#   SAMTOOLS_SIF      (optional) container image; when set and present,
#                     samtools runs via `apptainer exec`
#   IGV_REPORTS_BIND  (optional) colon-separated directories to --bind into
#                     the container; entries that are not directories are
#                     skipped
set -euo pipefail

FIX_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Resolve samtools: container image first, then PATH.
if [[ -n "${SAMTOOLS_SIF:-}" && -f "${SAMTOOLS_SIF}" ]]; then
  BINDS=()
  if [[ -n "${IGV_REPORTS_BIND:-}" ]]; then
    IFS=':' read -ra BP <<< "${IGV_REPORTS_BIND}"
    for p in "${BP[@]}"; do
      if [[ -d "$p" ]]; then
        BINDS+=(--bind "$p")
      fi
    done
  fi
  # ${BINDS[@]+...} expands to nothing when the array is empty. A bare
  # "${BINDS[@]}" is an "unbound variable" error under `set -u` on bash < 4.4.
  SAM=(apptainer exec --cleanenv ${BINDS[@]+"${BINDS[@]}"} "${SAMTOOLS_SIF}" samtools)
elif command -v samtools >/dev/null 2>&1; then
  SAM=(samtools)
else
  echo "ERROR: no samtools available — install (pip/conda) or set SAMTOOLS_SIF" >&2
  exit 1
fi

# Source BAM — must be supplied via env. Public source: ENA project PRJEB57425
# (ONT COLO829 release).
SRC="${COLO829BL_BAM:-}"
if [[ -z "${SRC}" || ! -f "${SRC}" ]]; then
  echo "ERROR: source BAM not provided." >&2
  echo "  Set COLO829BL_BAM=<path-to-COLO829BL-ONT-BAM> and re-run." >&2
  echo "  Public source: ENA project PRJEB57425." >&2
  exit 1
fi

OUT="${FIX_DIR}/tiny_colo829.hg38.bam"

echo "[build_fixtures] source: ${SRC}"
echo "[build_fixtures] output: ${OUT}"
echo "[build_fixtures] regions: chr2:25245000-25248000 (DNMT3A), chr7:148882000-148886000 (EZH2)"
echo "[build_fixtures] subsample: 0.2, seed 42"

# Slice the two regions, applying the -F 1536 flag filter and a seeded 20% subsample.
"${SAM[@]}" view -bh -F 1536 --subsample 0.2 --subsample-seed 42 \
  "${SRC}" \
  chr2:25245000-25248000 chr7:148882000-148886000 \
  -o "${OUT}"
"${SAM[@]}" index "${OUT}"

echo "[build_fixtures] sizes:"
ls -lh "${OUT}" "${OUT}.bai"

echo "[build_fixtures] anchor counts (must remain stable across regens):"
chr2_n=$("${SAM[@]}" view -c -F 1536 "${OUT}" chr2:25246500-25246501)
chr7_n=$("${SAM[@]}" view -c -F 1536 "${OUT}" chr7:148884000-148884001)
echo "  chr2:25246500-25246501 = ${chr2_n}"
echo "  chr7:148884000-148884001 = ${chr7_n}"

# A changed count is reported but does not fail the script: the new BAM may be
# intentional, in which case the test constants must be updated alongside it.
if [[ "${chr2_n}" != "5" || "${chr7_n}" != "9" ]]; then
  # The whole warning, including the separator line, goes to stderr.
  echo >&2
  echo "WARNING: anchor counts have changed from the committed fixture's contract" >&2
  echo "  (chr2=5, chr7=9). Update tests/smoke/test_slice_count.py and any" >&2
  echo "  integration scenarios.sh expected values, then commit both the new" >&2
  echo "  BAM and the updated test constants together." >&2
fi
Cleans up generated `reports/`, samplesheet, sites BED, anchors TSVs, and + `logs/` on exit (set `KEEP_REPORTS=1` to leave them). + +## Run + +```bash +bash tests/integration/anchor_verify/scenarios.sh +``` + +Or as part of the full test suite: + +```bash +bash tests/run_all.sh # all layers +bash tests/run_all.sh --integration-only +``` + +Runtime: **~6-8 min cold** (the cohort build dominates); **~15 s** when the +cohort is cached. Set `REBUILD=1` to force a rebuild of the HTMLs; otherwise +existing HTMLs in `reports/` are reused so verifier iteration is seconds. + +Disk: ~10 MB temp under `reports/`, auto-cleaned via `trap`. + +## Why these scenarios + +The four scenarios cover every status the verifier emits: + +- **PASS** (scenario 0): observed within tolerance of expected, or within + `min`/`max` bounds. +- **FAIL — tolerance** (A): observed read count differs from expected beyond + the per-row tolerance (default 5%). Catches the silent sample-swap case + where the wrong source BAM was wired into the build pipeline — same track + name, different read counts. +- **FAIL — bound** (B): `min`/`max` columns let you assert "this integration + site should have ≥20 reads supporting it" — a stronger claim than + tolerance, useful for known-positive sites. +- **FAIL — broken decode** (C): the HTML's session entry can't be gunzipped + or its inner BAM data URL can't be base64-decoded. Catches arbitrary HTML + tampering or `create_report` version drift that breaks the embedding format. +- **SKIP** (D): an anchor row references a `(sample, region)` pair that the + HTML doesn't render. Dropped silently because anchor TSVs are intentionally + re-usable across runs — a region that exists in one cohort's anchors but + not in another cohort's HTMLs is benign, not a build failure. 
+ +## BAM paths (parameterized) + +Requires BAMs supplied via +env vars when running elsewhere: + +```bash +IGV_REPORTS_TEST_BAM_1=/path/to/sample1.bam \ +IGV_REPORTS_TEST_BAM_2=/path/to/sample2.bam \ + bash tests/integration/anchor_verify/scenarios.sh +``` + +The verifier doesn't care which BAMs, only that they're different so +scenarios A-C have the contrast they need. If a default doesn't exist and +no env override is set, the script exits **77** (POSIX skipped-test +convention) and `run_all.sh` reports it as a skip, not a failure. + +## Why this is `integration`, not `smoke` or `unit` + +This test depends on real BAMs and on `create_report` actually running, so +it can't fit in `tests/smoke/` (which uses only the committed COLO829 slice +fixture and runs in seconds) or `tests/unit/` (parser-only, no I/O). + +For the parser-level regression checks that gave rise to this verifier, +see [tests/unit/test_verify_anchors.py](../../unit/test_verify_anchors.py). +For the samtools/decode round-trip, see +[tests/smoke/test_slice_count.py](../../smoke/test_slice_count.py). diff --git a/igv-reports/tests/integration/anchor_verify/scenarios.sh b/igv-reports/tests/integration/anchor_verify/scenarios.sh new file mode 100755 index 0000000..fcb95b9 --- /dev/null +++ b/igv-reports/tests/integration/anchor_verify/scenarios.sh @@ -0,0 +1,191 @@ +#!/usr/bin/env bash +# scenarios.sh — end-to-end integration test for scripts/verify_anchors.py. +# +# Builds a 2-sample cohort, freezes BAM-read-count anchors from the source +# BAMs, verifies the clean cohort, then runs four corruption scenarios and +# asserts each triggers the expected PASS / FAIL / SKIP outcomes. +# +# Runtime: ~6-8 min cold (cohort build dominates); ~15 s when cohort is cached. +# Disk: ~10 MB under ./reports/ (auto-cleaned on success unless KEEP_REPORTS=1). +# +# BAM source — two different indexed BAMs (any organism, any size). 
cleanup() {
  # Remove every artifact this script generates (reports, samplesheet, sites
  # BED, anchor TSVs, logs). With KEEP_REPORTS set to a non-empty value,
  # nothing is removed and a note is printed instead.
  if [[ -z "${KEEP_REPORTS:-}" ]]; then
    local -a artifacts=(
      "${OUTDIR}"
      "${SHEET}"
      "${SITES}"
      "${ANCHORS_TSV}"
      "${EX_DIR}/anchors.corrupted.tsv"
      "${EX_DIR}/anchors.min.tsv"
      "${EX_DIR}/anchors.subset.tsv"
      "${EX_DIR}/logs"
    )
    rm -rf "${artifacts[@]}"
  else
    echo "(KEEP_REPORTS set — leaving artifacts in ${OUTDIR} and ${EX_DIR}/anchors* for inspection)"
  fi
}
assert_status() {
  # Usage: assert_status <sample> <region> <expected_status> <verify_tsv>
  #
  # Looks up the first row of <verify_tsv> whose column 1 equals <sample> and
  # whose column 3 equals <region>, then compares that row's column 4 with
  # <expected_status>. Prints an OK line and returns 0 on a match; prints a
  # FAIL ASSERTION line and returns 1 otherwise (a missing row is shown as
  # "<missing>").
  local sample="$1"
  local region="$2"
  local expected="$3"
  local tsv="$4"
  local actual
  actual=$(awk -F'\t' -v s="$sample" -v r="$region" '$1==s && $3==r {print $4; exit}' "$tsv")
  if [[ "$actual" == "$expected" ]]; then
    echo " OK sample=$sample region=$region status=$actual"
    return 0
  fi
  echo " FAIL ASSERTION: sample=$sample region=$region expected=$expected actual=${actual:-<missing>}"
  return 1
}
Scenario 0: clean cohort, all PASS ------------------------------------- +echo "=== scenario 0: clean — all anchors expected PASS ===" +python "${ANCHORS}" verify-cohort \ + --samplesheet "${SHEET}" \ + --reports-dir "${OUTDIR}" \ + --genome hg38 \ + --anchors "${ANCHORS_TSV}" \ + --out "${OUTDIR}/scenario0.tsv" \ + --fail-on-fail >/dev/null +echo " baseline: 4/4 PASS (verify-cohort exited 0)" +echo + +# --- 5. Scenario A: tolerance violation ---------------------------------------- +echo "=== scenario A: corrupt expected count outside tolerance — FAIL on diff_ratio ===" +awk -F'\t' 'BEGIN{OFS="\t"} /^#/{print; next} NR==2 {$7=9999; print; next} {print}' "${ANCHORS_TSV}" > "${EX_DIR}/anchors.corrupted.tsv" +python "${ANCHORS}" verify-cohort \ + --samplesheet "${SHEET}" \ + --reports-dir "${OUTDIR}" \ + --genome hg38 \ + --anchors "${EX_DIR}/anchors.corrupted.tsv" \ + --out "${OUTDIR}/A.tsv" >/dev/null || true +assert_status "sample_1" "chr2:25246500-25246501" "FAIL" "${OUTDIR}/A.tsv" +assert_status "sample_1" "chr7:148884000-148884001" "PASS" "${OUTDIR}/A.tsv" +echo + +# --- 6. Scenario B: min/max bound violation ------------------------------------ +echo "=== scenario B: anchor min=1000 (real count ~56) — FAIL on min ===" +awk -F'\t' 'BEGIN{OFS="\t"} /^#/{print; next} NR==2 {$9=1000; print; next} {print}' "${ANCHORS_TSV}" > "${EX_DIR}/anchors.min.tsv" +python "${ANCHORS}" verify-cohort \ + --samplesheet "${SHEET}" \ + --reports-dir "${OUTDIR}" \ + --genome hg38 \ + --anchors "${EX_DIR}/anchors.min.tsv" \ + --out "${OUTDIR}/B.tsv" >/dev/null || true +assert_status "sample_1" "chr2:25246500-25246501" "FAIL" "${OUTDIR}/B.tsv" +echo + +# --- 7. Scenario C: corrupt data URL inside HTML — FAIL on decode -------------- +echo "=== scenario C: mangle a session's base64 payload — FAIL on session decode ===" +cp "${OUTDIR}/sample_1.hg38.html" "${OUTDIR}/sample_1.hg38.html.bak" +# Replace one base64 chunk inside a session data URL. 
# --- 8. Scenario D: anchor missing for a (sample, region) — SKIP not FAIL ------
echo "=== scenario D: drop sample_1's chr2 anchor — that region SKIPs, others PASS ==="
# Filter the anchors file: keep comment lines, drop sample_1's chr2 row.
# NOTE(review): this assumes column 4 of the anchors TSV is the bare chromosome
# name — confirm against the header written by `verify_anchors.py generate`.
awk -F'\t' 'BEGIN{OFS="\t"} /^#/{print; next} !($1=="sample_1" && $4=="chr2"){print}' "${ANCHORS_TSV}" > "${EX_DIR}/anchors.subset.tsv"
python "${ANCHORS}" verify-cohort \
  --samplesheet "${SHEET}" \
  --reports-dir "${OUTDIR}" \
  --genome hg38 \
  --anchors "${EX_DIR}/anchors.subset.tsv" \
  --out "${OUTDIR}/D.tsv" \
  --fail-on-fail >/dev/null
# The dropped anchor shouldn't appear at all (nothing to verify). Remaining anchors PASS.
# Column 3 of the verify TSV holds the full "chrom:start-end" region (the same
# column assert_status matches on), so compare against the full region string;
# comparing against the bare "chr2" can never match and would make this
# assertion pass unconditionally.
n_rows=$(awk -F'\t' 'NR>1 && $1=="sample_1" && $3=="chr2:25246500-25246501"' "${OUTDIR}/D.tsv" | wc -l)
if [[ "${n_rows}" -ne 0 ]]; then
  echo " FAIL ASSERTION: sample_1/chr2 should NOT appear (dropped anchor) but got ${n_rows} rows"
  exit 1
fi
echo " OK sample_1/chr2 anchor dropped — no row emitted"
assert_status "sample_1" "chr7:148884000-148884001" "PASS" "${OUTDIR}/D.tsv"
assert_status "sample_2" "chr2:25246500-25246501" "PASS" "${OUTDIR}/D.tsv"
echo

echo "=== all 4 scenarios PASSED — verify_anchors.py behaves as expected ==="
+ +## Run + +```bash +bash tests/integration/cohort_verify/scenarios.sh +``` + +Or as part of the full test suite: + +```bash +bash tests/run_all.sh # all layers +bash tests/run_all.sh --integration-only +``` + +## BAM paths (parameterized) + +There are no built-in default BAM paths; the three BAMs must be supplied via +env vars: + +```bash +IGV_REPORTS_TEST_BAM_1=/path/to/sample1.bam \ +IGV_REPORTS_TEST_BAM_2=/path/to/sample2.bam \ +IGV_REPORTS_TEST_BAM_3=/path/to/sample3.bam \ + bash tests/integration/cohort_verify/scenarios.sh +``` + +If any of these variables is unset or does not point to an existing file, the script exits +**77** (the Automake test-harness convention for a skipped test) and `run_all.sh` reports it as a +skip, not a failure. + +Runtime: ~60-90 s on a warm node (3-sample cohort build at 1-bp point-variant +sites + 4 reverify cycles). Per-sample HTML ends up ~3-5 MB. Cold-cache +network reads of the underlying ONT BAMs can extend this to 2-3 min on +first invocation. + +Disk: ~15 MB temporary under `reports/`, auto-cleaned via `trap`. + +The sites BED uses 1-bp point-variant style coordinates (not 13 kb promoter +windows like the methylation example) so BAM slicing stays fast — we're +testing the verifier, not the renderer. Adapt for other workflows if you +want to exercise wider windows. + +## How to provide BAMs + +Set `IGV_REPORTS_TEST_BAM_{1,2,3}` to paths of three indexed BAMs you have +access to. The verifier doesn't care which BAMs — it only requires that the +three rows in the samplesheet declare *different* BAMs (so scenario B's +contamination check has signal). Without those env vars, the test exits 77 +(skipped). + +## Why this is `integration`, not `smoke` or `unit` + +This test depends on real BAMs and on `create_report` actually running, so +it can't fit in `tests/smoke/` (which uses only the committed COLO829 slice +fixture and runs in seconds) or `tests/unit/` (parser-only, no I/O). 
+ +For the parser-level regression checks that gave rise to this verifier, +see [tests/unit/test_verify_report.py](../../unit/test_verify_report.py). diff --git a/igv-reports/tests/integration/cohort_verify/scenarios.sh b/igv-reports/tests/integration/cohort_verify/scenarios.sh new file mode 100755 index 0000000..63ff7d2 --- /dev/null +++ b/igv-reports/tests/integration/cohort_verify/scenarios.sh @@ -0,0 +1,172 @@ +#!/usr/bin/env bash +# scenarios.sh — end-to-end integration test for scripts/verify_cohort.py. +# +# Builds a 3-sample cohort, snapshots a clean verify pass, then runs four +# corruption scenarios and asserts each triggers the expected check FAILs. +# Exits nonzero if any assertion misses. +# +# Runtime: ~6-8 min cold (cohort build dominates); ~30 s when cohort is cached. +# Disk: ~15 MB under ./reports/ (auto-cleaned on success unless KEEP_REPORTS=1). +# +# BAM source — three different indexed BAMs (any organism, any size). There are +# no built-in defaults; supply them via env vars: +# IGV_REPORTS_TEST_BAM_1, _2, _3 +# The test SKIPs (exit 77) when any of the three is unset or is not an existing file. +set -euo pipefail + +EX_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SKILL_DIR="$(cd "${EX_DIR}/../../.." && pwd)" +BUILD="${SKILL_DIR}/scripts/build_igvreports.py" +VERIFY="${SKILL_DIR}/scripts/verify_cohort.py" + +# BAM sources — must be supplied via env vars. No defaults: integration tests +# require three indexed BAMs from a public ONT release (e.g. COLO829 from EBI). +BAM_S1="${IGV_REPORTS_TEST_BAM_1:-}" +BAM_S2="${IGV_REPORTS_TEST_BAM_2:-}" +BAM_S3="${IGV_REPORTS_TEST_BAM_3:-}" + +for bam in "${BAM_S1}" "${BAM_S2}" "${BAM_S3}"; do + if [[ -z "${bam}" || ! -f "${bam}" ]]; then + echo "SKIP: integration test needs three indexed BAMs." >&2 + echo " Set IGV_REPORTS_TEST_BAM_{1,2,3} to paths of three .bam files (each with sibling .bai)." 
>&2 + exit 77 # POSIX skipped-test convention + fi +done + +SHEET="${EX_DIR}/samplesheet.hg38.tsv" +SITES="${EX_DIR}/sites.hg38.bed" +OUTDIR="${EX_DIR}/reports" + +cleanup() { + if [[ -n "${KEEP_REPORTS:-}" ]]; then + echo "(KEEP_REPORTS set — leaving artifacts in ${OUTDIR} for inspection)" + return + fi + rm -rf "${OUTDIR}" "${SHEET}" "${SITES}" "${EX_DIR}/logs" +} +# Only cleanup on success — failures leave artifacts so they can be debugged. +trap 'rc=$?; if [[ $rc -eq 0 ]]; then cleanup; else echo "(scenarios.sh exited $rc — leaving artifacts in ${OUTDIR} for debug)"; fi' EXIT + +# Requires `create_report` (pip install igv-reports) and `samtools` on PATH. +# If you use a conda env, activate it before invoking this script. + +# --- 1. Generate fresh inputs -------------------------------------------------- +# Point-variant style sites: 1-bp wide each, --flanking 300 = ~600 bp windows. +# Keeps BAM slicing fast (seconds) even with 100+ GB ONT BAMs and full +# annotation tracks. We're testing the verifier, not the renderer; tiny +# windows are sufficient. +cat >"${SITES}" <<EOF +#chrom start end name +chr2 25246500 25246501 DNMT3A_SNV +chr7 148884000 148884001 EZH2_SNV +EOF + +printf 'sample\tbam_tumor\tsites_bed\n' >"${SHEET}" +printf 'sample_1\t%s\t%s\n' "${BAM_S1}" "${SITES}" >>"${SHEET}" +printf 'sample_2\t%s\t%s\n' "${BAM_S2}" "${SITES}" >>"${SHEET}" +printf 'sample_3\t%s\t%s\n' "${BAM_S3}" "${SITES}" >>"${SHEET}" + +# --- 2. Build cohort (3 HTMLs + index.html) ----------------------------------- +# Skip rebuild if a complete cohort is already on disk (set REBUILD=1 to force). +# Lets you iterate on the verifier in seconds instead of waiting ~12 min to +# regenerate HTMLs that haven't changed. 
# Undo leftovers from an aborted run before deciding whether to reuse the cohort.
# Each corruption scenario parks the pristine file as "<name>.bak" and restores
# it afterwards. If a previous run died in between (set -e on a failed
# assertion), the EXIT trap keeps ${OUTDIR} for debugging, so both the corrupted
# file and its backup are still on disk. Reusing that cohort as-is would make
# the baseline verify below fail on every subsequent run until REBUILD=1.
for stale_bak in "${OUTDIR}"/*.bak; do
  [[ -e "${stale_bak}" ]] || continue   # glob matched nothing (or OUTDIR is absent)
  echo "(restoring ${stale_bak%.bak} from a backup left by an aborted run)"
  mv -f "${stale_bak}" "${stale_bak%.bak}"
done

# Interpreter: honour IGV_REPORTS_PY (the knob tests/run_all.sh and the
# end_to_end scenario read); fall back to the plain `python` this script has
# always invoked, so existing setups behave exactly as before.
COHORT_PY="${IGV_REPORTS_PY:-python}"

if [[ -z "${REBUILD:-}" \
      && -f "${OUTDIR}/sample_1.hg38.html" \
      && -f "${OUTDIR}/sample_2.hg38.html" \
      && -f "${OUTDIR}/sample_3.hg38.html" \
      && -f "${OUTDIR}/index.html" ]]; then
  echo "=== reusing existing cohort in ${OUTDIR} (set REBUILD=1 to force rebuild) ==="
else
  echo "=== building cohort ==="
  "${COHORT_PY}" "${BUILD}" \
    --samplesheet "${SHEET}" \
    --genome hg38 \
    --flanking 300 \
    --type mutation \
    --info-columns name \
    --output-dir "${OUTDIR}" \
    --no-apptainer \
    --no-verify  # don't auto-verify during build — we exercise the verifier explicitly below
fi
echo

assert_status() {
  # assert_status <sample> <check> <expected_status> <verify_tsv>
  #
  # Look up the first row of <verify_tsv> (tab-separated) whose column 1 equals
  # <sample> and column 2 equals <check>, and compare its column 3 with
  # <expected_status>.
  #
  # Returns 0 and prints an "OK" line on a match; otherwise prints a
  # "FAIL ASSERTION" line and returns 1. A missing row (or unreadable TSV) is
  # reported as actual=<missing>. Because the script runs under `set -e`, a
  # non-zero return from a bare call aborts the whole run.
  local sample="$1" check="$2" expected="$3" tsv="$4"
  local actual
  # `exit` after the first hit: only the first matching row is considered.
  actual=$(awk -F'\t' -v s="$sample" -v c="$check" '$1==s && $2==c {print $3; exit}' "$tsv")
  if [[ "$actual" != "$expected" ]]; then
    echo " FAIL ASSERTION: sample=$sample check=$check expected=$expected actual=${actual:-<missing>}"
    return 1
  fi
  echo " OK sample=$sample check=$check status=$actual"
}

# --- 3. Baseline verify (all PASS) --------------------------------------------
# --fail-on-fail plus `set -e` makes any FAIL on the untouched cohort abort the
# script here, before the corruption scenarios run.
echo "=== scenario 0: baseline (all PASS expected) ==="
"${COHORT_PY}" "${VERIFY}" \
  --samplesheet "${SHEET}" \
  --reports-dir "${OUTDIR}" \
  --genome hg38 \
  --out "${OUTDIR}/baseline.tsv" \
  --fail-on-fail >/dev/null
echo " baseline: all PASS (verify exited 0)"
echo

# --- 4.
# Scenario A: missing HTML ----------------------------------------------

reverify_cohort() {
  # reverify_cohort <out_tsv>
  #
  # Re-run verify_cohort.py over the (deliberately corrupted) cohort in
  # ${OUTDIR} and write the per-check results to <out_tsv>. The verifier's exit
  # status is deliberately ignored; callers assert on the TSV contents instead.
  local out_tsv="$1"
  # The cohort directory can be reused across runs, so a TSV from an earlier
  # run may still be here. Remove it first: if the verifier crashed without
  # writing anything, the assertions must see "<missing>", not stale results.
  rm -f "${out_tsv}"
  # Honour IGV_REPORTS_PY (the interpreter knob tests/run_all.sh reads); fall
  # back to plain `python`, which is what this script has always invoked.
  "${IGV_REPORTS_PY:-python}" "${VERIFY}" \
    --samplesheet "${SHEET}" \
    --reports-dir "${OUTDIR}" \
    --genome hg38 \
    --out "${out_tsv}" >/dev/null || true
}

# In every scenario the pristine file is restored BEFORE the assertions run.
# The assertions only read the TSV, and under `set -e` a failed assertion exits
# immediately — restoring first means a failure never leaves a corrupted cohort
# behind for the next (cohort-reusing) run to trip over.

echo "=== scenario A: delete sample_3's HTML — C1 cohort_html_coverage should FAIL ==="
mv "${OUTDIR}/sample_3.hg38.html" "${OUTDIR}/sample_3.hg38.html.bak"
reverify_cohort "${OUTDIR}/A.tsv"
mv "${OUTDIR}/sample_3.hg38.html.bak" "${OUTDIR}/sample_3.hg38.html"
assert_status "*" "cohort_html_coverage" "FAIL" "${OUTDIR}/A.tsv"
assert_status "sample_3" "html_exists" "FAIL" "${OUTDIR}/A.tsv"
echo

# --- 5. Scenario B: sample swap (sample_1.html now contains sample_2 data) ---
echo "=== scenario B: swap sample_1<-sample_2 — sample_tracks_match + id_embedded + contamination should FAIL on sample_1 ==="
cp "${OUTDIR}/sample_1.hg38.html" "${OUTDIR}/sample_1.hg38.html.bak"
cp "${OUTDIR}/sample_2.hg38.html" "${OUTDIR}/sample_1.hg38.html"
reverify_cohort "${OUTDIR}/B.tsv"
mv "${OUTDIR}/sample_1.hg38.html.bak" "${OUTDIR}/sample_1.hg38.html"
assert_status "sample_1" "sample_tracks_match" "FAIL" "${OUTDIR}/B.tsv"
assert_status "sample_1" "no_cross_sample_contamination" "FAIL" "${OUTDIR}/B.tsv"
assert_status "sample_1" "sample_id_embedded" "FAIL" "${OUTDIR}/B.tsv"
echo

# --- 6. Scenario C: corrupt index.html ----------------------------------------
echo "=== scenario C: drop one <li> from index.html — C5 index_consistency should FAIL ==="
cp "${OUTDIR}/index.html" "${OUTDIR}/index.html.bak"
# Filter with awk rather than `sed -i`: GNU sed accepts a bare -i but BSD/macOS
# sed requires a suffix argument, so `sed -i '<script>'` is not portable. awk
# also exits 0 even if every line is dropped. Same pattern as before: delete
# each line that links to sample_2's report.
awk '!/href="sample_2.hg38.html"/' "${OUTDIR}/index.html.bak" >"${OUTDIR}/index.html"
reverify_cohort "${OUTDIR}/C.tsv"
mv "${OUTDIR}/index.html.bak" "${OUTDIR}/index.html"
assert_status "*" "index_consistency" "FAIL" "${OUTDIR}/C.tsv"
echo

# --- 7.
Scenario D: tiny HTML (truncation) ------------------------------------ +echo "=== scenario D: truncate sample_2.html to 1 KB — html_min_size + parse failures expected ===" +cp "${OUTDIR}/sample_2.hg38.html" "${OUTDIR}/sample_2.hg38.html.bak" +head -c 1024 "${OUTDIR}/sample_2.hg38.html.bak" > "${OUTDIR}/sample_2.hg38.html" +python "${VERIFY}" \ + --samplesheet "${SHEET}" \ + --reports-dir "${OUTDIR}" \ + --genome hg38 \ + --min-size-mb 1.0 \ + --out "${OUTDIR}/D.tsv" >/dev/null || true +assert_status "sample_2" "html_min_size" "FAIL" "${OUTDIR}/D.tsv" +assert_status "sample_2" "region_count" "FAIL" "${OUTDIR}/D.tsv" +mv "${OUTDIR}/sample_2.hg38.html.bak" "${OUTDIR}/sample_2.hg38.html" +echo + +echo "=== all 4 scenarios PASSED — verify_cohort.py behaves as expected ===" diff --git a/igv-reports/tests/integration/end_to_end/README.md b/igv-reports/tests/integration/end_to_end/README.md new file mode 100644 index 0000000..85fad18 --- /dev/null +++ b/igv-reports/tests/integration/end_to_end/README.md @@ -0,0 +1,62 @@ +# tests/integration/end_to_end + +End-to-end smoke test against the **committed** `tests/fixtures/tiny_colo829.hg38.bam` +fixture (457 KB). Unlike the other integration scenarios (`anchor_verify`, +`cohort_verify`), this one needs no shared-storage access and runs in +~30 s — so it ships in CI. + +## What it exercises (not via mocks) + +1. **`build_igvreports.py --bam ...`** actually invokes `create_report` + against the fixture BAM with a synthesized minimal FASTA. Produces a + real ~2 MB HTML. +2. **`verify_report.py`** parses the HTML's `tableJson` + `sessionDictionary`, + confirms region count + track presence. +3. **`verify_anchors.py generate`** counts reads in the source BAM at the + three sites; asserts the counts match the frozen contract documented in + `tests/fixtures/README.md` (`chr2:25246500-25246501 = 5`, + `chr7:148884000-148884001 = 9`). +4. 
**`verify_anchors.py verify`** decodes the embedded BAM slices from the + freshly-built HTML and confirms the same counts — closes the loop on the + create_report ↔ source-BAM round trip. +5. **`--also-png` (optional)** runs the same pipeline with the PNG sidecar + path. SKIPs cleanly when `igver` isn't installed or fails (the + documented silent-failure mode in `rules/igv.md`). + +## What it catches that unit tests don't + +- `create_report` flag rename / removal on upstream version bumps +- HTML structural changes from upstream (e.g. session-dict layout drift) +- Driver regressions on the **non-mock** code path +- Off-MSKCC portability bugs — the test runs against the committed fixture + with no `shared-storage` dependency, so CI exercises the same code paths + external users would hit + +## Runtime + +| Step | Duration | +|---|---| +| `create_report` (3 regions × 1 BAM × 300 bp flanking) | ~2 s | +| structural `verify_report.py` | <1 s | +| `verify_anchors.py generate` + `verify` | ~5 s | +| `--also-png` (if igver available) | ~5 s | +| **total** | **~14 s** | + +## Prereqs + +- `create_report` on PATH (`pip install -U 'igv-reports>=1.16.0'`) +- `samtools` on PATH (provided by the smoke layer prereqs) +- `python3` on PATH + +If `create_report` is missing the test exits 77 (skipped) rather than +failing — same convention as the other integration scenarios. + +## Knobs + +- `KEEP_REPORTS=1` — leave the `out/` directory in place after a successful + run for manual inspection. +- `IGV_REPORTS_PY=/path/to/python` — pin the python interpreter (when unset, + this script uses the first `python3` on PATH). +- `IGVER_CMD='apptainer exec /path/to/igver.sif igver'` — provide a working + `igver` invocation so the `--also-png` step (item 5 above; section 8 in `scenarios.sh`) actually exercises the PNG + pipeline rather than SKIPping. 
diff --git a/igv-reports/tests/integration/end_to_end/scenarios.sh b/igv-reports/tests/integration/end_to_end/scenarios.sh new file mode 100755 index 0000000..ad83100 --- /dev/null +++ b/igv-reports/tests/integration/end_to_end/scenarios.sh @@ -0,0 +1,248 @@ +#!/usr/bin/env bash +# scenarios.sh — end-to-end smoke test using the COMMITTED tiny_colo829 fixture. +# +# Author: Samuel Ahuno +# Purpose: +# The other integration scenarios (anchor_verify, cohort_verify) require +# 167 GB lab BAMs and take 6-8 min. This one uses the 457 KB +# tests/fixtures/tiny_colo829.hg38.bam fixture so the full pipeline runs +# in ~30 s on any machine with `create_report` and `samtools` on PATH. +# +# What it exercises end-to-end (not via mocks): +# 1. build_igvreports.py invokes create_report against the fixture +# 2. The resulting HTML is parseable by verify_report.py (structural) +# 3. verify_anchors.py generate → frozen counts (chr2=5, chr7=9 per +# tests/fixtures/README.md) +# 4. verify_anchors.py verify → PASS on the freshly built HTML +# 5. If `igver` is on PATH: --also-png produces non-empty per-region +# PNGs and the manifest. Otherwise that step is SKIPped (logged). +# +# Catches: create_report flag drift, HTML-format upstream changes, driver +# regressions on the non-mock path, off-MSKCC portability bugs (the +# fixture is committed; no shared-storage required). +# +# Runtime: ~30 s. Disk: ~185 MB under ./out/, almost all of it the synthesized all-N FASTA (30 Mb chr2 + 150 Mb chr7); auto-cleaned on success. +set -euo pipefail + +EX_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SKILL_DIR="$(cd "${EX_DIR}/../../.." && pwd)" +BUILD="${SKILL_DIR}/scripts/build_igvreports.py" +ANCHORS="${SKILL_DIR}/scripts/verify_anchors.py" +VR="${SKILL_DIR}/scripts/verify_report.py" +FIXTURE="${SKILL_DIR}/tests/fixtures/tiny_colo829.hg38.bam" + +# Prerequisite: create_report must be on PATH (provided by `pip install igv-reports`). +if ! command -v create_report >/dev/null 2>&1; then + echo "SKIP: end-to-end test needs create_report on PATH." 
>&2 + echo " Install with: pip install -U 'igv-reports>=1.16.0'" >&2 + exit 77 +fi + +# Prerequisite: the committed fixture must be readable. +if [[ ! -f "${FIXTURE}" ]] || [[ ! -f "${FIXTURE}.bai" ]]; then + echo "ERROR: fixture missing or unindexed: ${FIXTURE}" >&2 + echo " Regenerate with: bash tests/fixtures/build_fixtures.sh" >&2 + exit 1 +fi + +# Pick the python that can import pyyaml + the same scripts/. Tests/run_all.sh +# already does this dance; we mirror it. +PY="${IGV_REPORTS_PY:-}" +if [[ -z "${PY}" ]]; then + if command -v python3 >/dev/null 2>&1; then + PY=$(command -v python3) + else + echo "ERROR: no python3 on PATH" >&2; exit 2 + fi +fi + +OUTDIR="${EX_DIR}/out" +cleanup() { + if [[ -n "${KEEP_REPORTS:-}" ]]; then + echo "(KEEP_REPORTS set — leaving artifacts in ${OUTDIR})" + return + fi + rm -rf "${OUTDIR}" +} +trap 'rc=$?; if [[ $rc -eq 0 ]]; then cleanup; else echo "(scenarios.sh exited $rc — leaving ${OUTDIR} for debug)"; fi' EXIT + +rm -rf "${OUTDIR}" +mkdir -p "${OUTDIR}" + +# --- 1. Inputs -------------------------------------------------------------- +# Three sites, all within the fixture's two slice regions: +# * chr2:25246500-25246501 (DNMT3A R882 SNV, frozen anchor count = 5) +# * chr7:148884000-148884001 (EZH2 Y646 SNV, frozen anchor count = 9) +# * chr2:25247500-25247501 (second DNMT3A locus, count not frozen) +# Frozen counts are the contract per tests/fixtures/README.md. +SITES="${OUTDIR}/sites.hg38.bed" +cat >"${SITES}" <<EOF +#chrom start end name +chr2 25246500 25246501 DNMT3A_R882 +chr7 148884000 148884001 EZH2_Y646 +chr2 25247500 25247501 DNMT3A_2nd +EOF + +# Reference FASTA: in CI we won't have hg38 locally. Skip the --fasta +# resolution and use --no-default-tracks; create_report will then need a +# --fasta path. We supply a synthesized FASTA covering both regions so +# create_report can compute its own slice without internet access. 
FASTA="${OUTDIR}/tiny.hg38.fa"
# Synthesize an all-N FASTA with the two contigs the sites BED refers to.
# The output path is passed as argv[1] and the program comes from a quoted
# here-doc, so the shell never splices the path into Python source (a quote or
# backslash in ${OUTDIR} can no longer break or alter the program).
# Size note: 30 Mb + 150 Mb of 'N' at 60 bases per line is ~183 MB on disk —
# by far the largest artifact under ${OUTDIR}.
${PY} - "${FASTA}" <<'PYEOF'
# Minimal multi-contig FASTA covering the fixture's coverage windows.
# Only the size matters for create_report's region slicing — bases don't
# need to be biologically real; the BAM's reads carry the actual signal.
import sys

LINE_WIDTH = 60          # bases per FASTA line
CHUNK_LINES = 100_000    # lines per write() call (~6 MB) instead of one write per line

contigs = [
    ('chr2', 30_000_000),
    ('chr7', 150_000_000),
]
full_line = 'N' * LINE_WIDTH + '\n'
chunk = full_line * CHUNK_LINES

with open(sys.argv[1], 'w') as fh:
    for name, length in contigs:
        fh.write(f'>{name}\n')
        n_full_lines, remainder = divmod(length, LINE_WIDTH)
        for _ in range(n_full_lines // CHUNK_LINES):
            fh.write(chunk)
        fh.write(full_line * (n_full_lines % CHUNK_LINES))
        if remainder:
            # Final short line when the contig length is not a multiple of LINE_WIDTH.
            fh.write('N' * remainder + '\n')
PYEOF
samtools faidx "${FASTA}"

# --- 2. Build HTML (the actual end-to-end step) -----------------------------
# `set -o pipefail` is in effect, so a failing driver still fails the script
# even though its output is piped through `tail`.
echo "=== build: invoke create_report against fixture BAM ==="
HTML="${OUTDIR}/sample.hg38.html"
${PY} "${BUILD}" \
  --sites "${SITES}" \
  --bam "${FIXTURE}" \
  --genome hg38 \
  --fasta "${FASTA}" \
  --no-default-tracks \
  --flanking 300 \
  --type mutation \
  --info-columns name \
  --output "${HTML}" \
  --no-apptainer \
  --no-verify 2>&1 | tail -8
echo

# --- 3. Assertion: HTML exists, plausible size ------------------------------
if [[ ! -f "${HTML}" ]]; then
  echo "FAIL: HTML not produced at ${HTML}"; exit 1
fi
# `wc -c` is POSIX; `stat -c %s` is GNU-only and fails on BSD/macOS stat.
# The $(( )) wrapper strips the leading padding some wc implementations emit.
size=$(( $(wc -c <"${HTML}") ))
if [[ "${size}" -lt 50000 ]]; then
  echo "FAIL: HTML suspiciously small (${size} bytes) — expected >= 50 KB"
  exit 1
fi
echo " OK HTML: ${HTML} (${size} bytes)"
echo

# --- 4. Structural verify ---------------------------------------------------
# --fail-on-fail + `set -e`: any structural FAIL aborts the script here.
echo "=== verify_report.py: structural check ==="
${PY} "${VR}" \
  --html "${HTML}" \
  --sites "${SITES}" \
  --tracks "${FIXTURE}" \
  --min-size-mb 0.05 \
  --out "${OUTDIR}/verify_report.tsv" \
  --fail-on-fail >/dev/null
echo " OK structural verify PASS"
echo

# --- 5.
# Generate frozen anchors ---------------------------------------------
echo "=== verify_anchors.py generate: BAM read counts ==="
# One-row samplesheet (header + a single sample named "sample") pointing at the
# fixture BAM and the sites BED.
SHEET="${OUTDIR}/samplesheet.tsv"
{
  printf 'sample\tbam_tumor\tsites_bed\n'
  printf 'sample\t%s\t%s\n' "${FIXTURE}" "${SITES}"
} >"${SHEET}"

ANCHORS_TSV="${OUTDIR}/anchors.hg38.tsv"
${PY} "${ANCHORS}" generate --samplesheet "${SHEET}" --sites "${SITES}" --out "${ANCHORS_TSV}" 2>&1 | tail -6
echo

# --- 6. Assertion: frozen anchor counts match contract ----------------------
# tests/fixtures/README.md holds the contract. A mismatch here means the
# fixture, the BAM read filter, or the samtools in this environment changed.
# Columns read from the anchors TSV: $4 = chrom, $5 = start, $7 = read count.
expected_chr2=5
expected_chr7=9
actual_chr2=$(awk -F'\t' '$4=="chr2" && $5==25246500 {print $7}' "${ANCHORS_TSV}")
actual_chr7=$(awk -F'\t' '$4=="chr7" && $5==148884000 {print $7}' "${ANCHORS_TSV}")
[[ "${actual_chr2}" == "${expected_chr2}" ]] || {
  echo "FAIL: chr2:25246500-25246501 expected=${expected_chr2} got=${actual_chr2}"
  exit 1
}
[[ "${actual_chr7}" == "${expected_chr7}" ]] || {
  echo "FAIL: chr7:148884000-148884001 expected=${expected_chr7} got=${actual_chr7}"
  exit 1
}
echo " OK anchor contract: chr2=5 chr7=9 (matches tests/fixtures/README.md)"
echo

# --- 7. verify_anchors against the just-built HTML --------------------------
# With --fail-on-fail and `set -e`, a FAIL from the verifier stops the script.
echo "=== verify_anchors.py verify: HTML slice round-trip ==="
${PY} "${ANCHORS}" verify --html "${HTML}" --anchors "${ANCHORS_TSV}" \
  --out "${OUTDIR}/verify_anchors.tsv" --fail-on-fail >/dev/null
echo " OK anchor verify PASS (HTML slice counts match source BAM counts)"
echo

# --- 8. Optional: --also-png exercises the full HTML+PNG pipeline -----------
# Skip semantics: this step is best-effort and never causes the test to FAIL.
+# `igver` may be on PATH as a `pip install igver` egg-link shim WITHOUT the +# underlying IGV Java binary — exits 0 but produces no PNGs (the documented +# silent-failure mode in rules/igv.md). Our --also-png driver catches that +# via the inline existence check and raises SystemExit. Here we treat any +# such failure as SKIP rather than propagate it, since a non-working igver +# install isn't a regression in this skill's code. +if command -v igver >/dev/null 2>&1 || [[ -n "${IGVER_CMD:-}" ]]; then + echo "=== --also-png: HTML + per-region PNGs (igver available) ===" + HTML_PNG="${OUTDIR}/png_sample.hg38.html" + if ${PY} "${BUILD}" \ + --sites "${SITES}" \ + --bam "${FIXTURE}" \ + --genome hg38 \ + --fasta "${FASTA}" \ + --no-default-tracks \ + --flanking 300 \ + --type mutation \ + --info-columns name \ + --output "${HTML_PNG}" \ + --no-apptainer \ + --no-verify \ + --also-png \ + --png-dpi 100 >"${OUTDIR}/also_png.log" 2>&1; then + # --also-png returned 0 — assert the manifest + PNGs are real. + MANIFEST="${OUTDIR}/png_png_sample.hg38/manifest.tsv" + if [[ ! -f "${MANIFEST}" ]]; then + echo "FAIL: --also-png exited 0 but no manifest at ${MANIFEST}" + exit 1 + fi + n_regions=$(awk -F'\t' 'NR>1 && !/^#/' "${MANIFEST}" | wc -l) + if [[ "${n_regions}" -ne 3 ]]; then + echo "FAIL: manifest has ${n_regions} regions, expected 3" + exit 1 + fi + png_one=$(awk -F'\t' 'NR==2 {print $9}' "${MANIFEST}") + if [[ ! -s "${png_one}" ]]; then + echo "FAIL: PNG missing or empty: ${png_one}" + exit 1 + fi + png_size=$(stat -c %s "${png_one}") + echo " OK manifest: ${n_regions} regions; spot-check ${png_one##*/} = ${png_size} bytes" + else + # Driver caught the silent-failure mode; surface the diagnostic but + # don't fail the test — broken igver install is environment-level. + echo " SKIP (igver invocation failed — likely missing IGV Java binary or wrong PATH)" + echo " see ${OUTDIR}/also_png.log for the driver's diagnostic." 
+ if grep -q "silent exit-0 failure\|Failed to generate all PNG files" "${OUTDIR}/also_png.log" 2>/dev/null; then + echo " (confirmed: this is the documented igver silent-failure mode)" + fi + fi +else + echo "=== --also-png: SKIP (igver not on PATH; set \$IGVER_CMD or install via apptainer SIF) ===" +fi +echo + +echo "=== end-to-end PASS — full pipeline (create_report → verify → optional igver) ===" diff --git a/igv-reports/tests/run_all.sh b/igv-reports/tests/run_all.sh new file mode 100755 index 0000000..1356656 --- /dev/null +++ b/igv-reports/tests/run_all.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# tests/run_all.sh — orchestrate the three test layers in order. +# +# Author: Samuel Ahuno +# Purpose: +# 1. unit (~1 s) — pure-Python parser tests; pytest. +# 2. smoke (~3 s) — samtools subprocess + slice-decode round-trip +# against the committed fixture; pytest. +# 3. integration — full cohort build + verify-cohort + verify-anchors +# end-to-end; bash scenarios.sh under each demo. +# Skipped (exit 77) when the IGV_REPORTS_TEST_BAM_* +# env vars are unset AND the MSKCC default paths +# don't exist. +# +# Usage: +# bash tests/run_all.sh # all three layers +# bash tests/run_all.sh --unit-only # layer 1 only — instant feedback +# bash tests/run_all.sh --no-integration # layers 1 + 2 (fast everywhere) +# bash tests/run_all.sh --integration-only # layer 3 only — for the slow lane +# +# Exit code: +# 0 — every requested layer passed (or was legitimately skipped). +# 1+ — at least one layer failed; output preserved for debugging. +set -euo pipefail + +TESTS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SKILL_DIR="$(cd "${TESTS_DIR}/.." 
&& pwd)" + +RUN_UNIT=1 +RUN_SMOKE=1 +RUN_INTEGRATION=1 + +for arg in "$@"; do + case "$arg" in + --unit-only) RUN_SMOKE=0; RUN_INTEGRATION=0 ;; + --no-integration) RUN_INTEGRATION=0 ;; + --integration-only) RUN_UNIT=0; RUN_SMOKE=0 ;; + -h|--help) + sed -n '3,20p' "$0" + exit 0 + ;; + *) + echo "ERROR: unknown flag: $arg" >&2 + echo " Use --help to see options." >&2 + exit 2 + ;; + esac +done + +# Pick a Python with pytest. Prefer the snakemake conda env (where all +# project tooling lives); fall back to PATH `python3`. +PY="${IGV_REPORTS_PY:-}" +if [[ -z "${PY}" ]]; then + if [[ -x /home/ahunos/miniforge3/envs/snakemake/bin/python ]]; then + PY=/home/ahunos/miniforge3/envs/snakemake/bin/python + elif command -v python3 >/dev/null 2>&1; then + PY=$(command -v python3) + else + echo "ERROR: no python3 available. Set IGV_REPORTS_PY=<path-to-python>" >&2 + exit 2 + fi +fi + +FAILS=0 +SKIPS=0 + +run_layer() { + local name="$1"; shift + local desc="$1"; shift + echo "=== ${name}: ${desc} ===" + if "$@"; then + echo " ${name} PASS" + else + local rc=$? + if [[ $rc -eq 77 ]]; then + echo " ${name} SKIP (exit 77 — see message above)" + SKIPS=$((SKIPS + 1)) + else + echo " ${name} FAIL (exit ${rc})" + FAILS=$((FAILS + 1)) + fi + fi + echo +} + +# --- Layer 1: unit --------------------------------------------------------- +if [[ $RUN_UNIT -eq 1 ]]; then + run_layer "unit" "pure-Python parsers" \ + "${PY}" -m pytest "${TESTS_DIR}/unit/" -q +fi + +# --- Layer 2: smoke -------------------------------------------------------- +if [[ $RUN_SMOKE -eq 1 ]]; then + run_layer "smoke" "samtools + slice-decode round-trip" \ + "${PY}" -m pytest "${TESTS_DIR}/smoke/" -q +fi + +# --- Layer 3: integration -------------------------------------------------- +# Each scenarios.sh exits 77 if its required BAMs aren't available; we treat +# that as a skip rather than a failure so the suite is portable. 
+if [[ $RUN_INTEGRATION -eq 1 ]]; then + # end_to_end: uses the committed tiny_colo829 fixture (~30 s, runs in CI). + run_layer "integration / end_to_end" "full pipeline against committed fixture" \ + bash "${TESTS_DIR}/integration/end_to_end/scenarios.sh" + run_layer "integration / cohort_verify" "cohort structural verifier scenarios" \ + bash "${TESTS_DIR}/integration/cohort_verify/scenarios.sh" + run_layer "integration / anchor_verify" "anchor content verifier scenarios" \ + bash "${TESTS_DIR}/integration/anchor_verify/scenarios.sh" +fi + +echo "=== summary ===" +echo " failures: ${FAILS}" +echo " skips: ${SKIPS}" + +if [[ $FAILS -gt 0 ]]; then + exit 1 +fi +exit 0 diff --git a/igv-reports/tests/smoke/test_slice_count.py b/igv-reports/tests/smoke/test_slice_count.py new file mode 100644 index 0000000..9c72586 --- /dev/null +++ b/igv-reports/tests/smoke/test_slice_count.py @@ -0,0 +1,154 @@ +"""Smoke tests for verify_anchors.py — exercises the samtools subprocess +path and the end-to-end slice-decode-and-count flow against the committed +COLO829 BAM fixture. + +Author: Samuel Ahuno +Purpose: + The unit tests cover the parsers with synthetic inputs. These smoke + tests confirm the verifier's samtools shellouts actually work end-to-end: + + 1. `samtools_count` returns the right integer for a real BAM region. + 2. `samtools_index` produces a usable index on a fresh BAM. + 3. The full decode round-trip (read fixture BAM → base64 wrap into a + fake data: URL → decode_track_slice writes it back out → re-index + → re-count) preserves the original count exactly. + + Skipped (not failed) if samtools is unavailable — so this works in a + CI sandbox without the SIF or PATH samtools. 
+ +Run: + cd claude/skills/igv-reports + pytest tests/smoke/ -v +""" + +from __future__ import annotations + +import base64 +import shutil +import subprocess +import sys +from pathlib import Path + +import pytest + +SCRIPTS = Path(__file__).resolve().parents[2] / "scripts" +sys.path.insert(0, str(SCRIPTS)) +import verify_anchors as va # noqa: E402 + +FIXTURE = Path(__file__).resolve().parents[1] / "fixtures" / "tiny_colo829.hg38.bam" + +# Anchor sanity counts — must match fixtures/README.md and build_fixtures.sh. +ANCHOR_CHR2 = ("chr2:25246500-25246501", 5) +ANCHOR_CHR7 = ("chr7:148884000-148884001", 9) + + +def _samtools_cmd() -> list[str] | None: + """Resolve samtools the same way verify_anchors does, but return None + instead of raising when nothing is available. Lets us SKIP gracefully.""" + try: + return va.resolve_samtools(None) + except SystemExit: + # No SIF and no PATH samtools — environment can't run smoke tests. + return None + + +@pytest.fixture(scope="module") +def samtools_cmd(): + cmd = _samtools_cmd() + if cmd is None: + pytest.skip("no samtools available (set SAMTOOLS_SIF or install samtools)") + if not FIXTURE.exists(): + pytest.skip( + f"fixture missing: {FIXTURE} — regenerate with " + "bash tests/fixtures/build_fixtures.sh" + ) + return cmd + + +# --------------------------------------------------------------------------- +# samtools_count + samtools_index against the committed fixture +# --------------------------------------------------------------------------- + +@pytest.mark.parametrize("region,expected", [ANCHOR_CHR2, ANCHOR_CHR7]) +def test_samtools_count_matches_fixture_anchor(samtools_cmd, region, expected): + observed = va.samtools_count(samtools_cmd, FIXTURE, region) + assert observed == expected, ( + f"fixture anchor drift: {region} should be {expected}, got {observed}. " + "Either the committed BAM was regenerated with different params " + "(see tests/fixtures/build_fixtures.sh) or the count filter changed." 
+ ) + + +def test_samtools_index_creates_usable_index(samtools_cmd, tmp_path): + """Copy the fixture to tmp_path WITHOUT its .bai, then have verify_anchors + re-index it. After indexing, samtools_count must succeed.""" + bam_copy = tmp_path / "no_bai.bam" + shutil.copy(FIXTURE, bam_copy) + # confirm no index exists yet + assert not (tmp_path / "no_bai.bam.bai").exists() + va.samtools_index(samtools_cmd, bam_copy) + assert (tmp_path / "no_bai.bam.bai").exists() + # count works now + assert va.samtools_count(samtools_cmd, bam_copy, ANCHOR_CHR2[0]) == ANCHOR_CHR2[1] + + +def test_samtools_count_missing_bam_raises(samtools_cmd, tmp_path): + """A missing BAM path should produce a clear RuntimeError, not crash silently. + + Note: samtools tolerates malformed region strings (treats them as unknown + references and returns 0 with a stderr warning + exit 0). The only + reliable error trigger is a missing/unreadable BAM file.""" + missing = tmp_path / "does_not_exist.bam" + with pytest.raises(RuntimeError, match="samtools view -c failed"): + va.samtools_count(samtools_cmd, missing, "chr1:1-100") + + +# --------------------------------------------------------------------------- +# Slice decode round-trip (the central correctness claim of verify_anchors) +# --------------------------------------------------------------------------- + +def test_full_decode_roundtrip(samtools_cmd, tmp_path): + """End-to-end: emulate what igv-reports does to embed a BAM slice in an + HTML data: URL, then have verify_anchors decode it back out and confirm + the read count is preserved. + + This is the critical correctness claim: if the verifier's slice decode + silently corrupts the BAM bytes, every anchor verify would silently + pass when it shouldn't. Catching that here means we trust the + integration tests further down.""" + # 1. 
Read the committed BAM raw, wrap it in a data: URL exactly the + # way igv_reports/datauri.py does (mediatype application/gzip + # because BAM is BGZF gzip — see decision in datauri.get_data_uri). + raw = FIXTURE.read_bytes() + data_url = "data:application/gzip;base64," + base64.b64encode(raw).decode() + + # 2. Decode it via the production code path. + decoded = tmp_path / "decoded.bam" + va.decode_track_slice(data_url, decoded) + + # 3. Bytes must match exactly. + assert decoded.read_bytes() == raw + + # 4. samtools should treat the decoded file as a real BAM — index it + # and count the same anchors. + va.samtools_index(samtools_cmd, decoded) + for region, expected in (ANCHOR_CHR2, ANCHOR_CHR7): + assert va.samtools_count(samtools_cmd, decoded, region) == expected + + +def test_resolve_samtools_explicit_sif_missing(tmp_path): + """Passing a non-existent SIF path explicitly must fail loudly, not fall + back silently to PATH samtools.""" + fake_sif = tmp_path / "does_not_exist.sif" + with pytest.raises(SystemExit, match="samtools SIF not found"): + va.resolve_samtools(fake_sif) + + +def test_resolve_samtools_env_var(tmp_path, monkeypatch): + """$SAMTOOLS_SIF env var honored when no --samtools-sif passed.""" + # Use a real-ish path that doesn't exist to verify it's the *path* the + # env-resolution picks up, not some unrelated SIF. + fake = tmp_path / "env_sif.sif" + monkeypatch.setenv("SAMTOOLS_SIF", str(fake)) + with pytest.raises(SystemExit, match=str(fake)): + va.resolve_samtools(None) diff --git a/igv-reports/tests/unit/test_build_pngs.py b/igv-reports/tests/unit/test_build_pngs.py new file mode 100644 index 0000000..54a913c --- /dev/null +++ b/igv-reports/tests/unit/test_build_pngs.py @@ -0,0 +1,338 @@ +"""Unit tests for the --also-png plumbing in build_igvreports.py. 
+ +Author: Samuel Ahuno +Purpose: + Exercises the helpers that bridge the HTML build to igver: sites BED + parsing + UID assignment, flanked regions BED writer, input.txt writer, + igver-cmd resolution, and manifest writing. + + We don't actually invoke igver here — the manifest writer reconstructs + filenames from the same convention igver uses (validated against + igver's _parse_bed_file source: `<chrom>-<start>-<end>.<name>.<ext>`). + Cross-artifact consistency depends on this filename contract; if igver + ever changes it, this test plus verify_cohort will catch the drift. + +Run: + pytest tests/unit/test_build_pngs.py -v +""" + +from __future__ import annotations + +import logging +import os +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +SCRIPTS = Path(__file__).resolve().parents[2] / "scripts" +sys.path.insert(0, str(SCRIPTS)) + +import build_igvreports as b # noqa: E402 + + +def _write_bed(path: Path, rows: list[tuple]) -> None: + """Helper — `rows` is a list of (chrom, start, end[, name][, ...]).""" + with path.open("w") as fh: + for r in rows: + fh.write("\t".join(str(x) for x in r) + "\n") + + +# ----- _read_sites_bed_rows ----- + + +def test_read_sites_bed_assigns_uids_when_name_missing(tmp_path): + bed = tmp_path / "sites.hg38.bed" + _write_bed(bed, [("chr1", 100, 200), ("chr2", 300, 400)]) + rows = b._read_sites_bed_rows(bed) + assert len(rows) == 2 + # Auto-UIDs are zero-padded to 3 digits so directory listings sort right + # and `region_010` doesn't sort before `region_2`. 
+ assert rows[0]["name"] == "region_001" + assert rows[1]["name"] == "region_002" + assert rows[0]["bed_row_idx"] == 1 + assert rows[1]["bed_row_idx"] == 2 + + +def test_read_sites_bed_preserves_existing_names(tmp_path): + bed = tmp_path / "sites.hg38.bed" + _write_bed(bed, [("chr2", 100, 200, "DNMT3A_full_gene"), ("chr7", 300, 400, "TP53")]) + rows = b._read_sites_bed_rows(bed) + assert rows[0]["name"] == "DNMT3A_full_gene" + assert rows[1]["name"] == "TP53" + + +def test_read_sites_bed_skips_comment_and_track_lines(tmp_path): + bed = tmp_path / "sites.hg38.bed" + bed.write_text( + "#chrom\tstart\tend\tname\n" + "track name=foo\n" + "browser dense\n" + "chr1\t100\t200\treal_row\n" + ) + rows = b._read_sites_bed_rows(bed) + assert len(rows) == 1 + assert rows[0]["name"] == "real_row" + + +def test_read_sites_bed_handles_mixed_named_and_unnamed(tmp_path): + # If some rows have names and others don't, unnamed ones still get + # auto-UIDs based on file position so manifests stay deterministic. + bed = tmp_path / "sites.hg38.bed" + _write_bed(bed, [ + ("chr1", 100, 200, "named_first"), + ("chr2", 300, 400), + ("chr3", 500, 600, "named_third"), + ]) + rows = b._read_sites_bed_rows(bed) + assert [r["name"] for r in rows] == ["named_first", "region_002", "named_third"] + + +# ----- _write_igver_regions_bed ----- + + +def test_write_igver_regions_bed_applies_flanking(tmp_path): + rows = [ + {"chrom": "chr1", "start": 100, "end": 200, "name": "A", "bed_row_idx": 1}, + {"chrom": "chr2", "start": 50, "end": 150, "name": "B", "bed_row_idx": 2}, + ] + out = tmp_path / "igver_regions.bed" + b._write_igver_regions_bed(rows, flanking=300, out=out) + lines = out.read_text().splitlines() + # Row 1: 100-300=−200, clamped to 0; end 200+300=500. + assert lines[0] == "chr1\t0\t500\tA" + # Row 2: 50−300=−250, clamped to 0; end 150+300=450. 
+ assert lines[1] == "chr2\t0\t450\tB" + + +def test_write_igver_regions_bed_zero_flanking_passes_rows_verbatim(tmp_path): + rows = [{"chrom": "chrX", "start": 1000, "end": 2000, "name": "promoter", "bed_row_idx": 1}] + out = tmp_path / "igver_regions.bed" + b._write_igver_regions_bed(rows, flanking=0, out=out) + assert out.read_text().strip() == "chrX\t1000\t2000\tpromoter" + + +# ----- _write_igver_input_list ----- + + +def test_write_igver_input_list_one_path_per_line(tmp_path): + tracks = ["/path/to/tumor.bam", "/path/to/normal.bam", "/path/to/calls.vcf"] + out = tmp_path / "igver_input.txt" + b._write_igver_input_list(tracks, out) + assert out.read_text().splitlines() == tracks + + +# ----- _resolve_igver_cmd ----- + + +def test_resolve_igver_cmd_explicit_override_wins(): + override = "apptainer exec /path/to/igver.sif igver" + assert b._resolve_igver_cmd(override) == override.split() + + +def test_resolve_igver_cmd_env_var_falls_back(monkeypatch): + monkeypatch.setenv("IGVER_CMD", "/usr/local/bin/igver --debug") + # which() must not find igver for this branch to fire; mock it to None. 
+ with patch.object(b.shutil, "which", return_value=None), \ + patch.object(b.Path, "exists", return_value=False): + assert b._resolve_igver_cmd(None) == ["/usr/local/bin/igver", "--debug"] + + +def test_resolve_igver_cmd_path_lookup(monkeypatch): + monkeypatch.delenv("IGVER_CMD", raising=False) + with patch.object(b.shutil, "which", return_value="/usr/bin/igver"): + assert b._resolve_igver_cmd(None) == ["/usr/bin/igver"] + + +def test_resolve_igver_cmd_raises_when_not_found(monkeypatch): + monkeypatch.delenv("IGVER_CMD", raising=False) + with patch.object(b.shutil, "which", return_value=None), \ + patch.object(b.Path, "exists", return_value=False): + with pytest.raises(SystemExit, match="igver not found"): + b._resolve_igver_cmd(None) + + +# ----- build_pngs_with_igver — mocked subprocess ----- + + +def _fake_igver_run(cmd, **kwargs): + """Stand-in for subprocess.run that mimics a successful igver invocation: + parses the regions BED out of `-r`, parses the output dir out of `-o`, + and writes a non-empty fake PNG at each expected filename — same + `<chr>-<start>-<end>.<uid>.<ext>` convention real igver uses.""" + import subprocess + out_dir = Path(cmd[cmd.index("-o") + 1]) + out_dir.mkdir(parents=True, exist_ok=True) + regions_bed = Path(cmd[cmd.index("-r") + 1]) + fmt = cmd[cmd.index("-f") + 1] if "-f" in cmd else "png" + ext = "svg" if fmt in ("svg", "pdf") else fmt + for line in regions_bed.read_text().splitlines(): + if not line or line.startswith("#"): + continue + chrom, start, end, name = line.split("\t")[:4] + (out_dir / f"{chrom}-{start}-{end}.{name}.{ext}").write_bytes(b"PNG\x00" * 4096) + return subprocess.CompletedProcess(args=cmd, returncode=0, stdout="", stderr="") + + +def test_build_pngs_with_igver_writes_manifest_and_inputs(tmp_path, monkeypatch): + # Set up a synthetic sites BED + tracks list + mock igver that actually + # writes the expected output files (the inline existence check rejects + # an igver run that produces zero PNGs). 
+ bed = tmp_path / "sites.hg38.bed" + _write_bed(bed, [ + ("chr1", 100, 200, "alpha"), + ("chr2", 300, 400, "beta"), + ]) + tracks = ["/data/sample.bam", "/data/calls.vcf"] + html_path = tmp_path / "sample.hg38.html" + html_path.write_text("<html/>") + out_dir = tmp_path / "png_sample.hg38" + log = logging.getLogger("test") + + monkeypatch.setenv("IGVER_CMD", "/usr/bin/true") + with patch.object(b.shutil, "which", return_value="/usr/bin/true"), \ + patch.object(b.subprocess, "run", side_effect=_fake_igver_run): + manifest = b.build_pngs_with_igver( + sites=bed, + tracks=tracks, + genome="hg38", + flanking=300, + out_dir=out_dir, + log=log, + html_path=html_path, + igver_cmd=None, + dpi=300, + display_mode="collapse", + ) + + # 1. Intermediate files exist with the expected content. + regions_bed = out_dir / "igver_regions.bed" + input_txt = out_dir / "igver_input.txt" + assert regions_bed.exists() + assert input_txt.exists() + assert regions_bed.read_text() == "chr1\t0\t500\talpha\nchr2\t0\t700\tbeta\n" + assert input_txt.read_text() == "/data/sample.bam\n/data/calls.vcf\n" + + # 2. Manifest has one row per region with the right schema and the + # expected PNG-filename convention (validated against igver source). 
+ lines = manifest.read_text().splitlines() + assert lines[0].startswith("#bed_row_idx\tuid\tchrom\t") + data_rows = lines[1:] + assert len(data_rows) == 2 + + cols0 = data_rows[0].split("\t") + assert cols0[0] == "1" + assert cols0[1] == "alpha" + assert cols0[2] == "chr1" + assert cols0[3] == "100" # start_orig + assert cols0[4] == "200" # end_orig + assert cols0[5] == "0" # start_flanked (clamped) + assert cols0[6] == "500" # end_flanked + assert cols0[7] == "chr1:0-500" + assert cols0[8].endswith("/png/chr1-0-500.alpha.png"), cols0[8] + assert cols0[9].endswith("/sample.hg38.html"), cols0[9] + assert cols0[10] == "1" # html_table_row matches bed_row_idx + + +def test_build_pngs_with_igver_detects_silent_exit_0_failure(tmp_path, monkeypatch): + # The motivating bug: igver via `pip install` egg-link prints + # `[ERROR] Failed to generate all PNG files after 2 iterations.` then + # exits 0 with an empty output dir. proc.returncode != 0 misses it. + # Inline check must catch this regardless of exit code. + bed = tmp_path / "sites.hg38.bed" + _write_bed(bed, [("chr1", 100, 200, "alpha"), ("chr2", 300, 400, "beta")]) + html_path = tmp_path / "sample.hg38.html"; html_path.write_text("<html/>") + log = logging.getLogger("test") + + # /usr/bin/true returns 0 but creates no files — the exact failure mode. + monkeypatch.setenv("IGVER_CMD", "/usr/bin/true") + with patch.object(b.shutil, "which", return_value="/usr/bin/true"): + with pytest.raises(SystemExit, match="silent exit-0 failure"): + b.build_pngs_with_igver( + sites=bed, tracks=["/data/sample.bam"], genome="hg38", + flanking=0, out_dir=tmp_path / "out", log=log, html_path=html_path, + ) + + +def test_build_pngs_with_igver_detects_partial_silent_failure(tmp_path, monkeypatch): + # Mid-batch failure: 1 of 2 PNGs produced, 1 missing, exit 0. Inline + # check must fail because the manifest would otherwise reference a + # non-existent PNG. 
+ bed = tmp_path / "sites.hg38.bed" + _write_bed(bed, [("chr1", 100, 200, "alpha"), ("chr2", 300, 400, "beta")]) + html_path = tmp_path / "sample.hg38.html"; html_path.write_text("<html/>") + log = logging.getLogger("test") + + def partial_run(cmd, **kwargs): + # Write only the first region's PNG, skip the second. + import subprocess + out_dir = Path(cmd[cmd.index("-o") + 1]) + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "chr1-100-200.alpha.png").write_bytes(b"PNG\x00" * 4096) + return subprocess.CompletedProcess(args=cmd, returncode=0, stdout="", stderr="") + + monkeypatch.setenv("IGVER_CMD", "/usr/bin/true") + with patch.object(b.shutil, "which", return_value="/usr/bin/true"), \ + patch.object(b.subprocess, "run", side_effect=partial_run): + with pytest.raises(SystemExit, match="silent exit-0 failure"): + b.build_pngs_with_igver( + sites=bed, tracks=["/data/sample.bam"], genome="hg38", + flanking=0, out_dir=tmp_path / "out", log=log, html_path=html_path, + ) + + +def test_build_pngs_with_igver_detects_zero_byte_png(tmp_path, monkeypatch): + # Disk-full / truncated-write: PNG exists but is empty. Inline check + # must fail because the file is on disk but unusable. 
+ bed = tmp_path / "sites.hg38.bed" + _write_bed(bed, [("chr1", 100, 200, "alpha")]) + html_path = tmp_path / "sample.hg38.html"; html_path.write_text("<html/>") + log = logging.getLogger("test") + + def zero_byte_run(cmd, **kwargs): + import subprocess + out_dir = Path(cmd[cmd.index("-o") + 1]) + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "chr1-100-200.alpha.png").write_bytes(b"") # zero-byte + return subprocess.CompletedProcess(args=cmd, returncode=0, stdout="", stderr="") + + monkeypatch.setenv("IGVER_CMD", "/usr/bin/true") + with patch.object(b.shutil, "which", return_value="/usr/bin/true"), \ + patch.object(b.subprocess, "run", side_effect=zero_byte_run): + with pytest.raises(SystemExit, match="silent exit-0 failure"): + b.build_pngs_with_igver( + sites=bed, tracks=["/data/sample.bam"], genome="hg38", + flanking=0, out_dir=tmp_path / "out", log=log, html_path=html_path, + ) + + +def test_build_pngs_with_igver_propagates_igver_failure(tmp_path, monkeypatch): + # If igver itself returns non-zero, the driver must SystemExit so the + # caller (and verify_cohort) sees the build as failed — silent success + # would let an empty PNG dir slip into a "verified" cohort. + bed = tmp_path / "sites.hg38.bed" + _write_bed(bed, [("chr1", 100, 200, "alpha")]) + html_path = tmp_path / "sample.hg38.html"; html_path.write_text("<html/>") + log = logging.getLogger("test") + + # /usr/bin/false always exits non-zero — perfect stand-in for a failing igver. 
+ monkeypatch.setenv("IGVER_CMD", "/usr/bin/false") + with patch.object(b.shutil, "which", return_value="/usr/bin/false"): + with pytest.raises(SystemExit) as exc: + b.build_pngs_with_igver( + sites=bed, tracks=["/data/sample.bam"], genome="hg38", flanking=0, + out_dir=tmp_path / "out", log=log, html_path=html_path, + ) + assert exc.value.code != 0 + + +def test_build_pngs_with_igver_errors_on_empty_bed(tmp_path): + bed = tmp_path / "sites.hg38.bed" + bed.write_text("# header only\n") + log = logging.getLogger("test") + with pytest.raises(SystemExit, match="no data rows"): + b.build_pngs_with_igver( + sites=bed, tracks=["/data/sample.bam"], genome="hg38", flanking=0, + out_dir=tmp_path / "out", log=log, html_path=tmp_path / "x.html", + ) diff --git a/igv-reports/tests/unit/test_generate_tracks_json.py b/igv-reports/tests/unit/test_generate_tracks_json.py new file mode 100644 index 0000000..bf562ee --- /dev/null +++ b/igv-reports/tests/unit/test_generate_tracks_json.py @@ -0,0 +1,226 @@ +"""Unit tests for generate_tracks_json.py — annotation-default resolver. + +Author: Samuel Ahuno +Purpose: + Exercises the `default:` shortcut path added in the methylation-pathway + polish round. Without these tests a future Claude session could easily + break the resolver by adding a 6th key without updating the lookup. + +Covers: + * Happy path: each known default key resolves against a synthetic cfg. + * hg38 gencode-sibling preference (.gff3.gz over .gtf.gz when present). + * indexURL: included when .tbi exists; omitted otherwise. + * Unknown default key -> SystemExit with valid-keys hint. + * Missing genome in cfg -> SystemExit. + * Missing path on disk -> SystemExit. + * build_annotation_tracks() routes `default:` entries through the resolver + and preserves backwards compat for explicit `url:` entries. + * `default:` entry without top-level `genome:` -> SystemExit. 
+ +Run: + cd igv-reports && pytest tests/unit/test_generate_tracks_json.py -v +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +SCRIPTS = Path(__file__).resolve().parents[2] / "scripts" +sys.path.insert(0, str(SCRIPTS)) + +import generate_tracks_json as g # noqa: E402 + + +def _fake_cfg(genome: str, paths: dict[str, str]) -> dict: + """Build a minimal databases YAML mirror keyed by genome. + + `paths` maps YAML-keys (CpGIslands, gtf, repMaskerBed, EPDnewCoding, + EPDnewNonCoding) to filesystem paths.""" + return {"reference_genomes": {"local": {genome: paths}}} + + +def _touch(path: Path) -> Path: + """Create an empty file at `path`, parents auto-created.""" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(b"") + return path + + +# ----- happy path: each known default key resolves ----- + + +def test_resolve_cgi(tmp_path): + cgi = _touch(tmp_path / "hg38_CpGIslands.bed") + cfg = _fake_cfg("hg38", {"CpGIslands": str(cgi)}) + t = g.resolve_annotation_default("cgi", "hg38", cfg) + assert t["url"] == str(cgi) + assert t["display_name"] == "CpG islands" + assert t["format"] == "bed" + assert t["color"] == "rgb(0,158,115)" + assert "indexURL" not in t # no .tbi alongside + + +def test_resolve_repmasker(tmp_path): + rmsk = _touch(tmp_path / "rmsk.bed.gz") + _touch(tmp_path / "rmsk.bed.gz.tbi") + cfg = _fake_cfg("hg38", {"repMaskerBed": str(rmsk)}) + t = g.resolve_annotation_default("repmasker", "hg38", cfg) + assert t["url"] == str(rmsk) + assert t["indexURL"] == str(rmsk) + ".tbi" + assert t["displayMode"] == "COLLAPSED" + + +def test_resolve_gencode_hg38_prefers_gff3_sibling(tmp_path): + # When the YAML's `gtf` points at a .gtf.gz, but a sibling + # gencode.v47.annotation.gff3.gz + .tbi exists in the same dir, + # the resolver should switch to the bgzip+tabix .gff3.gz file. 
+ gtf = _touch(tmp_path / "gencode.v47.annotation.gtf.gz") + sibling = _touch(tmp_path / "gencode.v47.annotation.gff3.gz") + _touch(tmp_path / "gencode.v47.annotation.gff3.gz.tbi") + cfg = _fake_cfg("hg38", {"gtf": str(gtf)}) + t = g.resolve_annotation_default("gencode", "hg38", cfg) + assert t["url"] == str(sibling), "expected hg38 gencode to prefer .gff3.gz sibling" + assert t["indexURL"] == str(sibling) + ".tbi" + + +def test_resolve_gencode_mm10_uses_gtf(tmp_path): + # The sibling-preference logic only fires for hg38. For mm10 the + # resolver should use the YAML-named gtf path verbatim. + gtf = _touch(tmp_path / "gencode.vM25.annotation.gtf.gz") + cfg = _fake_cfg("mm10", {"gtf": str(gtf)}) + t = g.resolve_annotation_default("gencode", "mm10", cfg) + assert t["url"] == str(gtf) + + +def test_resolve_epdnew_coding_and_noncoding(tmp_path): + coding = _touch(tmp_path / "Hs_EPDnew.hg38.bed.gz") + noncoding = _touch(tmp_path / "HsNC_EPDnew.hg38.bed.gz") + cfg = _fake_cfg("hg38", { + "EPDnewCoding": str(coding), + "EPDnewNonCoding": str(noncoding), + }) + tc = g.resolve_annotation_default("epdnew_coding", "hg38", cfg) + tn = g.resolve_annotation_default("epdnew_noncoding", "hg38", cfg) + assert tc["url"] == str(coding) + assert tn["url"] == str(noncoding) + # Distinct Okabe-Ito colors so coding vs non-coding read separately. + assert tc["color"] != tn["color"] + + +# ----- error paths ----- + + +def test_unknown_default_key_lists_valid_keys(): + with pytest.raises(SystemExit) as exc: + g.resolve_annotation_default("DOES_NOT_EXIST", "hg38", _fake_cfg("hg38", {})) + # Error should enumerate the valid keys so the user can fix the typo + # without having to read the source. 
+ msg = str(exc.value) + for key in ("cgi", "gencode", "repmasker", "epdnew_coding", "epdnew_noncoding"): + assert key in msg + + +def test_missing_genome_in_cfg(tmp_path): + cgi = _touch(tmp_path / "hg38_CpGIslands.bed") + cfg = _fake_cfg("hg38", {"CpGIslands": str(cgi)}) + with pytest.raises(SystemExit, match="no entry for genome 'GRCh37'"): + g.resolve_annotation_default("cgi", "GRCh37", cfg) + + +def test_missing_yaml_key_for_genome(tmp_path): + # mm39 famously has no repMaskerBed configured — `default: repmasker` + # must fail with a clear error rather than silently emitting no track. + cfg = _fake_cfg("mm39", {"CpGIslands": "/tmp/fake_cgi"}) + with pytest.raises(SystemExit, match="repMaskerBed"): + g.resolve_annotation_default("repmasker", "mm39", cfg) + + +def test_missing_path_on_disk(tmp_path): + cfg = _fake_cfg("hg38", {"CpGIslands": str(tmp_path / "nonexistent.bed")}) + with pytest.raises(SystemExit, match="resolved path missing on disk"): + g.resolve_annotation_default("cgi", "hg38", cfg) + + +# ----- build_annotation_tracks() integration ----- + + +def test_build_annotation_tracks_shortcut(tmp_path): + cgi = _touch(tmp_path / "hg38_CpGIslands.bed") + cfg = _fake_cfg("hg38", {"CpGIslands": str(cgi)}) + spec = {"genome": "hg38", "annotation": [{"default": "cgi"}]} + out = g.build_annotation_tracks(spec, tmp_path, cfg) + assert len(out) == 1 + assert out[0]["name"] == "CpG islands" + assert out[0]["url"] == str(cgi) + assert out[0]["type"] == "annotation" + assert out[0]["color"] == "rgb(0,158,115)" + + +def test_build_annotation_tracks_shortcut_with_overrides(tmp_path): + # The user can override the canned display name + color while still + # using `default:` for path resolution. 
+ cgi = _touch(tmp_path / "hg38_CpGIslands.bed") + cfg = _fake_cfg("hg38", {"CpGIslands": str(cgi)}) + spec = { + "genome": "hg38", + "annotation": [{ + "default": "cgi", + "name": "My CpG view", + "color": "rgb(0,0,0)", + "displayMode": "COLLAPSED", + }], + } + out = g.build_annotation_tracks(spec, tmp_path, cfg) + assert out[0]["name"] == "My CpG view" + assert out[0]["color"] == "rgb(0,0,0)" + assert out[0]["displayMode"] == "COLLAPSED" + # url still resolved by the shortcut + assert out[0]["url"] == str(cgi) + + +def test_build_annotation_tracks_explicit_path_unchanged(tmp_path): + # Backwards-compat: an explicit `url:` entry must not need a cfg and + # must produce the same shape as before this round of changes. + explicit = _touch(tmp_path / "my_custom.bed") + spec = { + "annotation": [{ + "name": "My custom track", + "url": str(explicit), + "format": "bed", + "color": "rgb(1,2,3)", + }], + } + out = g.build_annotation_tracks(spec, tmp_path, {}) + assert out[0] == { + "name": "My custom track", + "url": str(explicit), + "format": "bed", + "type": "annotation", + "displayMode": "EXPANDED", + "color": "rgb(1,2,3)", + } + + +def test_build_annotation_tracks_mixed(tmp_path): + # Explicit + shortcut entries can coexist; order is preserved. 
+ cgi = _touch(tmp_path / "hg38_CpGIslands.bed") + explicit = _touch(tmp_path / "custom.bed") + cfg = _fake_cfg("hg38", {"CpGIslands": str(cgi)}) + spec = { + "genome": "hg38", + "annotation": [ + {"name": "Custom first", "url": str(explicit)}, + {"default": "cgi"}, + ], + } + out = g.build_annotation_tracks(spec, tmp_path, cfg) + assert [t["name"] for t in out] == ["Custom first", "CpG islands"] + + +def test_shortcut_without_top_level_genome(tmp_path): + spec = {"annotation": [{"default": "cgi"}]} # missing `genome:` + with pytest.raises(SystemExit, match="top-level `genome:`"): + g.build_annotation_tracks(spec, tmp_path, {}) diff --git a/igv-reports/tests/unit/test_verify_anchors.py b/igv-reports/tests/unit/test_verify_anchors.py new file mode 100644 index 0000000..fa7b514 --- /dev/null +++ b/igv-reports/tests/unit/test_verify_anchors.py @@ -0,0 +1,534 @@ +"""Unit tests for verify_anchors.py — parser layer only. + +Author: Samuel Ahuno +Purpose: + Fast pytest suite covering the pure-Python parsing/decision logic in + verify_anchors.py. No subprocess, no real BAM, no shared-storage + dependency. Runs in ~1 s on any machine with pytest. + + These tests catch the parser regressions that bit during the original + iteration: status-taxonomy conflation between SKIP and FAIL, mis-tabbed + TSV rows being silently mis-parsed, decide_status confusing tolerance + with notes when columns are out of order. + +Run: + cd igv-reports + pytest tests/unit/ -v +""" + +from __future__ import annotations + +import base64 +import gzip +import json +import sys +from pathlib import Path + +import pytest + +# Make scripts/ importable without installing the skill as a package. 
+SCRIPTS = Path(__file__).resolve().parents[2] / "scripts" +sys.path.insert(0, str(SCRIPTS)) +import verify_anchors as va # noqa: E402 + + +# --------------------------------------------------------------------------- +# load_anchors +# --------------------------------------------------------------------------- + +def _write_tsv(tmp_path: Path, body: str) -> Path: + p = tmp_path / "anchors.tsv" + p.write_text(body) + return p + + +def test_load_anchors_full_row(tmp_path): + """All 10 columns populated, including notes.""" + p = _write_tsv(tmp_path, ( + "#sample\ttrack_name\tchrom\tstart\tend\texpected\ttolerance\tmin\tmax\tnotes\n" + "s1\ttumor\tchr2\t25246500\t25246501\t56\t0.05\t\t\tDNMT3A\n" + )) + rows = va.load_anchors(p) + assert len(rows) == 1 + r = rows[0] + assert (r.sample, r.track_name, r.chrom, r.start, r.end) == ("s1", "tumor", "chr2", 25246500, 25246501) + assert r.expected == 56 + assert r.tolerance == "0.05" + assert r.min_count == "" + assert r.max_count == "" + assert r.notes == "DNMT3A" + + +def test_load_anchors_min_max_row(tmp_path): + p = _write_tsv(tmp_path, ( + "#sample\ttrack_name\tchrom\tstart\tend\texpected\ttolerance\tmin\tmax\tnotes\n" + "s1\ttumor\tchrX\t100\t200\t50\t\t20\t100\thigh-conf\n" + )) + rows = va.load_anchors(p) + assert rows[0].min_count == "20" + assert rows[0].max_count == "100" + + +def test_load_anchors_missing_header_errors(tmp_path): + """Data row before any header must abort with a clear error.""" + p = _write_tsv(tmp_path, "s1\ttumor\tchr1\t0\t100\t10\t\t\t\t\n") + with pytest.raises(SystemExit, match="data row before header"): + va.load_anchors(p) + + +def test_load_anchors_bad_tolerance_fails_fast(tmp_path): + """Mis-tabbed row where notes value falls into tolerance must fail at + load time with a hint, not crash later inside decide_status.""" + p = _write_tsv(tmp_path, ( + "#sample\ttrack_name\tchrom\tstart\tend\texpected\ttolerance\tmin\tmax\tnotes\n" + 
"s1\ttumor\tchr2\t100\t200\t10\tNOT_A_NUMBER\t\t\tDNMT3A\n" + )) + with pytest.raises(SystemExit) as excinfo: + va.load_anchors(p) + msg = str(excinfo.value) + assert "malformed anchor row" in msg + assert "awk" in msg # hint about -F'\t' + + +def test_load_anchors_bad_min_fails_fast(tmp_path): + p = _write_tsv(tmp_path, ( + "#sample\ttrack_name\tchrom\tstart\tend\texpected\ttolerance\tmin\tmax\tnotes\n" + "s1\ttumor\tchr2\t100\t200\t10\t\tNAH\t\t\n" + )) + with pytest.raises(SystemExit, match="malformed anchor row"): + va.load_anchors(p) + + +def test_load_anchors_missing_file(tmp_path): + with pytest.raises(SystemExit, match="anchors TSV not found"): + va.load_anchors(tmp_path / "does_not_exist.tsv") + + +def test_load_anchors_skips_blank_lines(tmp_path): + p = _write_tsv(tmp_path, ( + "#sample\ttrack_name\tchrom\tstart\tend\texpected\ttolerance\tmin\tmax\tnotes\n" + "\n" + "s1\ttumor\tchr1\t0\t100\t10\t\t\t\t\n" + "\n" + )) + rows = va.load_anchors(p) + assert len(rows) == 1 + + +# --------------------------------------------------------------------------- +# decide_status +# --------------------------------------------------------------------------- + +def _anchor(expected=10, tolerance="", min_count="", max_count=""): + return va.AnchorRow( + sample="s", track_name="t", chrom="chr1", start=0, end=100, + expected=expected, tolerance=tolerance, + min_count=min_count, max_count=max_count, + ) + + +def test_decide_status_pass_within_default_tolerance(): + a = _anchor(expected=100) + status, _ = va.decide_status(a, observed=104, default_tol=0.05) + assert status == "PASS" + + +def test_decide_status_fail_outside_default_tolerance(): + a = _anchor(expected=100) + status, details = va.decide_status(a, observed=110, default_tol=0.05) + assert status == "FAIL" + assert "diff_ratio" in details + + +def test_decide_status_per_row_tolerance_overrides_default(): + """Row tolerance 0.20 should pass observed=115 even though default 0.05 wouldn't.""" + a = 
_anchor(expected=100, tolerance="0.20") + status, _ = va.decide_status(a, observed=115, default_tol=0.05) + assert status == "PASS" + + +def test_decide_status_min_bound_pass(): + a = _anchor(expected=50, min_count="20") + status, details = va.decide_status(a, observed=50, default_tol=0.05) + assert status == "PASS" + assert "min=20 OK" in details + + +def test_decide_status_min_bound_fail(): + a = _anchor(expected=50, min_count="100") + status, details = va.decide_status(a, observed=50, default_tol=0.05) + assert status == "FAIL" + assert "min=100 FAIL" in details + + +def test_decide_status_min_max_combined(): + a = _anchor(min_count="20", max_count="80") + status, _ = va.decide_status(a, observed=50, default_tol=0.05) + assert status == "PASS" + status, _ = va.decide_status(a, observed=10, default_tol=0.05) + assert status == "FAIL" + status, _ = va.decide_status(a, observed=100, default_tol=0.05) + assert status == "FAIL" + + +def test_decide_status_bounds_override_tolerance(): + """When min/max present, tolerance is ignored.""" + # observed within tolerance of expected, but violates min + a = _anchor(expected=50, tolerance="0.50", min_count="100") + status, _ = va.decide_status(a, observed=52, default_tol=0.05) + assert status == "FAIL" # min wins over tolerance + + +def test_decide_status_zero_expected_exact(): + a = _anchor(expected=0) + status, _ = va.decide_status(a, observed=0, default_tol=0.05) + assert status == "PASS" + status, _ = va.decide_status(a, observed=1, default_tol=0.05) + assert status == "FAIL" + + +# --------------------------------------------------------------------------- +# decode_track_slice +# --------------------------------------------------------------------------- + +def test_decode_track_slice_roundtrip(tmp_path): + """data: URL → bytes round-trip preserves the payload.""" + payload = b"BAM\x01some bytes here" + url = "data:application/gzip;base64," + base64.b64encode(payload).decode() + dest = tmp_path / "out.bin" + 
va.decode_track_slice(url, dest) + assert dest.read_bytes() == payload + + +def test_decode_track_slice_other_mediatype_accepted(tmp_path): + """We don't validate the mediatype — payload bytes are what matter.""" + payload = b"\x1f\x8b\x08compressed body" + url = "data:application/octet-stream;base64," + base64.b64encode(payload).decode() + dest = tmp_path / "out.bin" + va.decode_track_slice(url, dest) + assert dest.read_bytes() == payload + + +def test_decode_track_slice_not_a_data_url_raises(tmp_path): + with pytest.raises(ValueError, match="not a data: base64 URL"): + va.decode_track_slice("http://example.com/blob.bam", tmp_path / "out.bin") + + +# --------------------------------------------------------------------------- +# find_track +# --------------------------------------------------------------------------- + +def test_find_track_hit(): + session = {"tracks": [ + {"name": "ann.bed"}, + {"name": "sample.sorted", "url": "data:..."}, + ]} + t = va.find_track(session, "sample.sorted") + assert t is not None and t["url"] == "data:..." 
+ + +def test_find_track_miss(): + session = {"tracks": [{"name": "other"}]} + assert va.find_track(session, "missing") is None + + +def test_find_track_empty(): + assert va.find_track({}, "x") is None + assert va.find_track({"tracks": []}, "x") is None + + +# --------------------------------------------------------------------------- +# locate_session_entry — status taxonomy split (was the v1 regression) +# --------------------------------------------------------------------------- + +def _make_table_json(rows): + return {"headers": ["Chrom", "Start", "End", "Name"], "rows": rows} + + +def _make_session_dict(entries): + """Build a sessionDictionary mapping str(idx) -> a gzipped+b64 data URL + that decodes to the given entry dict.""" + out = {} + for idx, entry in entries.items(): + raw = gzip.compress(json.dumps(entry).encode()) + out[str(idx)] = "data:application/gzip;base64," + base64.b64encode(raw).decode() + return out + + +def test_locate_session_entry_ok(): + tj = _make_table_json([["chr2", 25246501, 25246501, "x"]]) + sd = _make_session_dict({0: {"tracks": [{"name": "t"}]}}) + outcome, sess, det = va.locate_session_entry(sd, tj, "chr2", 25246500, 25246501) + assert outcome == "ok" + assert sess == {"tracks": [{"name": "t"}]} + assert det == "" + + +def test_locate_session_entry_absent_returns_skip_signal(): + """Anchor for a region that's not in the HTML — caller should SKIP.""" + tj = _make_table_json([["chr2", 25246501, 25246501, "x"]]) + sd = _make_session_dict({0: {"tracks": []}}) + outcome, _, det = va.locate_session_entry(sd, tj, "chr2", 99999999, 99999999) + assert outcome == "absent" + assert "no tableJson row matched" in det + + +def test_locate_session_entry_broken_missing_session(): + """Row in tableJson but no corresponding sessionDictionary entry — FAIL.""" + tj = _make_table_json([["chr2", 25246501, 25246501, "x"]]) + sd = {} # no entries at all + outcome, _, det = va.locate_session_entry(sd, tj, "chr2", 25246500, 25246501) + assert outcome == 
"broken" + assert "no entry for row index" in det + + +def test_locate_session_entry_broken_undecodable(): + """Row + session entry present but the session blob can't be gunzipped — FAIL.""" + tj = _make_table_json([["chr2", 25246501, 25246501, "x"]]) + sd = {"0": "data:application/gzip;base64,NOT_VALID_BASE64"} + outcome, _, det = va.locate_session_entry(sd, tj, "chr2", 25246500, 25246501) + assert outcome == "broken" + assert "failed to gunzip/decode" in det + + +def test_locate_session_entry_broken_bad_headers(): + """tableJson missing the Chrom/Start/End columns we need.""" + tj = {"headers": ["foo", "bar"], "rows": [["x", "y"]]} + sd = {} + outcome, _, det = va.locate_session_entry(sd, tj, "chr2", 100, 200) + assert outcome == "broken" + assert "missing expected column" in det + + +# --------------------------------------------------------------------------- +# sample_bam_paths — samplesheet column handling +# --------------------------------------------------------------------------- + +def test_sample_bam_paths_tumor_only(): + row = {"sample": "s1", "bam_tumor": "/x/tumor.sorted.bam"} + out = va.sample_bam_paths(row) + assert out == [("tumor.sorted", Path("/x/tumor.sorted.bam"))] + + +def test_sample_bam_paths_tumor_and_normal(): + row = {"sample": "s1", "bam_tumor": "/x/t.bam", "bam_normal": "/x/n.bam"} + out = va.sample_bam_paths(row) + names = [n for n, _ in out] + assert names == ["t", "n"] + + +def test_sample_bam_paths_extras_filtered_to_bam_cram(): + row = { + "sample": "s1", + "bam_tumor": "/x/t.bam", + "extra_tracks": "/y/extra.bam,/y/annot.bed,/y/other.cram", + } + out = va.sample_bam_paths(row) + names = [n for n, _ in out] + # bam_tumor + the .bam + the .cram from extras; .bed should be filtered out + assert names == ["t", "extra", "other"] + + +def test_sample_bam_paths_blank_row(): + row = {"sample": "s1"} + assert va.sample_bam_paths(row) == [] + + +# --------------------------------------------------------------------------- +# write_anchors 
round-trip +# --------------------------------------------------------------------------- + +def test_write_load_round_trip(tmp_path): + anchors_in = [ + va.AnchorRow(sample="s1", track_name="t1", chrom="chr1", + start=0, end=100, expected=42, notes="hi"), + va.AnchorRow(sample="s2", track_name="t2", chrom="chr2", + start=200, end=300, expected=7, min_count="3", max_count="20"), + ] + out = tmp_path / "anchors.tsv" + va.write_anchors(anchors_in, out) + rows = va.load_anchors(out) + assert len(rows) == 2 + assert rows[0].notes == "hi" + assert rows[1].min_count == "3" + assert rows[1].max_count == "20" + + +# --------------------------------------------------------------------------- +# bedGraph / wig anchors (methylation-aware path added 2026-05-19) +# --------------------------------------------------------------------------- + +def _write_bedgraph(path: Path, rows: list[tuple]) -> Path: + """Write a 4-col bedGraph (chrom/start/end/value), no header.""" + path.write_text("".join(f"{r[0]}\t{r[1]}\t{r[2]}\t{r[3]}\n" for r in rows)) + return path + + +def test_is_wig_data_line(): + assert va._is_wig_data_line("chr1\t100\t101\t0.5") is True + assert va._is_wig_data_line("track name=meth") is False + assert va._is_wig_data_line("browser dense") is False + assert va._is_wig_data_line("fixedStep chrom=chr1 start=1 step=1") is False + assert va._is_wig_data_line("variableStep chrom=chr1") is False + assert va._is_wig_data_line("# comment") is False + assert va._is_wig_data_line("") is False + assert va._is_wig_data_line(" ") is False + + +def test_bedgraph_count_source_plain_text_in_region(tmp_path): + # 3 of 4 rows overlap [100, 200); the 4th is on a different chrom. + bg = _write_bedgraph(tmp_path / "sample.hg38.bedgraph", [ + ("chr1", 100, 101, 0.5), + ("chr1", 150, 151, 0.8), + ("chr1", 199, 200, 0.3), # r_end > q_start? 200 > 100 yes; r_start < q_end? 
199 < 200 yes + ("chr2", 100, 101, 0.9), # different chrom + ]) + assert va.bedgraph_count_source(bg, "chr1", 100, 200) == 3 + + +def test_bedgraph_count_source_excludes_out_of_region(tmp_path): + # Rows must overlap [start, end). Boundary cases. + bg = _write_bedgraph(tmp_path / "sample.hg38.bedgraph", [ + ("chr1", 50, 100, 0.1), # r_end == q_start -> doesn't overlap (half-open) + ("chr1", 100, 150, 0.2), # r_start == q_start -> overlaps + ("chr1", 195, 200, 0.3), # r_start < q_end == 200 -> overlaps + ("chr1", 200, 250, 0.4), # r_start == q_end -> doesn't overlap (half-open) + ("chr1", 1000, 1001, 0.5), # way out + ]) + assert va.bedgraph_count_source(bg, "chr1", 100, 200) == 2 + + +def test_bedgraph_count_source_skips_headers_and_comments(tmp_path): + bg = tmp_path / "sample.hg38.bedgraph" + bg.write_text( + "#header comment\n" + "track name=test\n" + "browser dense\n" + "chr1\t100\t101\t0.5\n" + "chr1\t150\t151\t0.6\n" + ) + assert va.bedgraph_count_source(bg, "chr1", 0, 1000) == 2 + + +def test_bedgraph_count_source_handles_gzipped_input(tmp_path): + # Plain-gzip (not bgzip+tabix). Linear-scan path. + import gzip + bg = tmp_path / "sample.hg38.bedgraph.gz" + with gzip.open(bg, "wt") as fh: + fh.write("chr3\t100\t101\t0.5\n") + fh.write("chr3\t150\t151\t0.6\n") + fh.write("chr3\t999\t1000\t0.7\n") + assert va.bedgraph_count_source(bg, "chr3", 100, 200) == 2 + assert va.bedgraph_count_source(bg, "chr3", 0, 10000) == 3 + assert va.bedgraph_count_source(bg, "chr4", 0, 10000) == 0 + + +def test_bedgraph_count_source_missing_file_raises(tmp_path): + with pytest.raises(FileNotFoundError, match="bedGraph track not found"): + va.bedgraph_count_source(tmp_path / "does_not_exist.bg", "chr1", 0, 100) + + +def test_bedgraph_count_slice_decodes_gzipped_payload(): + # Mimics how igv_reports/datauri.py encodes a wig/bedGraph slice: + # gzip(text) base64-encoded. verify_anchors only sees the gzipped + # bytes after base64 decoding, so we test the bytes-in entry point. 
+ text = ( + "track name=meth\n" + "chr1\t100\t101\t0.5\n" + "chr1\t150\t151\t0.6\n" + "chr1\t200\t201\t0.7\n" + ) + assert va.bedgraph_count_slice(gzip.compress(text.encode())) == 3 + + +def test_bedgraph_count_slice_falls_back_to_uncompressed(): + # Some create_report versions write small wig slices uncompressed — + # the fallback path must accept raw text bytes. + text = "chr1\t100\t101\t0.5\nchr1\t200\t201\t0.6\n" + assert va.bedgraph_count_slice(text.encode()) == 2 + + +def test_bedgraph_count_slice_zero_when_empty(): + # No data rows in the slice = silent empty-methylation-slice failure. + # Caller (verify_one_html) compares to expected via decide_status. + assert va.bedgraph_count_slice(gzip.compress(b"track name=meth\n")) == 0 + assert va.bedgraph_count_slice(b"") == 0 + + +# --------------------------------------------------------------------------- +# Anchor schema: track_type column with backwards compat +# --------------------------------------------------------------------------- + +def test_load_anchors_legacy_no_track_type_defaults_to_bam(tmp_path): + # Pre-2026-05-19 anchor files lack the track_type column. Loader must + # accept them and default each row to track_type='bam'. 
+ p = _write_tsv(tmp_path, ( + "#sample\ttrack_name\tchrom\tstart\tend\texpected\ttolerance\tmin\tmax\tnotes\n" + "s1\ttumor\tchr2\t100\t200\t42\t\t\t\t\n" + )) + rows = va.load_anchors(p) + assert rows[0].track_type == "bam" + + +def test_load_anchors_with_track_type_bedgraph(tmp_path): + p = _write_tsv(tmp_path, ( + "#sample\ttrack_name\ttrack_type\tchrom\tstart\tend\texpected\ttolerance\tmin\tmax\tnotes\n" + "s1\tmeth_track\tbedgraph\tchr2\t100\t200\t8\t\t\t\tDNMT3A_CpGs\n" + )) + rows = va.load_anchors(p) + assert rows[0].track_type == "bedgraph" + assert rows[0].expected == 8 + assert rows[0].notes == "DNMT3A_CpGs" + + +def test_load_anchors_rejects_unknown_track_type(tmp_path): + p = _write_tsv(tmp_path, ( + "#sample\ttrack_name\ttrack_type\tchrom\tstart\tend\texpected\ttolerance\tmin\tmax\tnotes\n" + "s1\tt1\tcraaam\tchr1\t0\t100\t5\t\t\t\t\n" + )) + with pytest.raises(SystemExit, match="unknown track_type 'craaam'"): + va.load_anchors(p) + + +def test_write_load_round_trip_preserves_track_type(tmp_path): + anchors_in = [ + va.AnchorRow(sample="s1", track_name="tumor", track_type="bam", + chrom="chr1", start=0, end=100, expected=42), + va.AnchorRow(sample="s1", track_name="tumor.5mC", track_type="bedgraph", + chrom="chr1", start=0, end=100, expected=12), + ] + out = tmp_path / "anchors.tsv" + va.write_anchors(anchors_in, out) + rows = va.load_anchors(out) + assert [r.track_type for r in rows] == ["bam", "bedgraph"] + + +# --------------------------------------------------------------------------- +# sample_bedgraph_paths: samplesheet → (track_name, bedgraph_path) iteration +# --------------------------------------------------------------------------- + +def test_sample_bedgraph_paths_picks_bedgraph_from_extras(): + row = {"sample": "s1", "extra_tracks": "/data/x.5mC.bedgraph,/data/x.5hmC.bg"} + pairs = va.sample_bedgraph_paths(row) + assert pairs == [("x.5mC", Path("/data/x.5mC.bedgraph")), + ("x.5hmC", Path("/data/x.5hmC.bg"))] + + +def 
test_sample_bedgraph_paths_strips_gz_suffix_from_track_name(): + # Path.stem of foo.bedgraph.gz is "foo.bedgraph"; igv-reports renders + # it as just "foo", so we strip one more level. + row = {"sample": "s1", "extra_tracks": "/data/foo.bedgraph.gz"} + pairs = va.sample_bedgraph_paths(row) + assert pairs[0][0] == "foo" + + +def test_sample_bedgraph_paths_skips_non_bedgraph_extras(): + # bam/vcf in extra_tracks are NOT bedgraphs — sample_bam_paths handles them. + row = {"sample": "s1", "extra_tracks": "/data/x.5mC.bedgraph,/data/y.bam,/data/z.vcf"} + pairs = va.sample_bedgraph_paths(row) + assert pairs == [("x.5mC", Path("/data/x.5mC.bedgraph"))] + + +def test_sample_bedgraph_paths_empty_when_no_extras(): + assert va.sample_bedgraph_paths({"sample": "s1"}) == [] + assert va.sample_bedgraph_paths({"sample": "s1", "extra_tracks": ""}) == [] diff --git a/igv-reports/tests/unit/test_verify_cohort_png.py b/igv-reports/tests/unit/test_verify_cohort_png.py new file mode 100644 index 0000000..2ac1bc1 --- /dev/null +++ b/igv-reports/tests/unit/test_verify_cohort_png.py @@ -0,0 +1,190 @@ +"""Unit tests for the PNG-side checks in verify_cohort.py. + +Author: Samuel Ahuno +Purpose: + When build_igvreports.py runs with --also-png, verify_cohort.py picks up + the manifest TSV and runs three additional checks. These tests synthesize + a valid manifest + matching PNG files in tmp_path, then mutate one thing + at a time to confirm each check fires on the right defect. 
+ +Run: + pytest tests/unit/test_verify_cohort_png.py -v +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +SCRIPTS = Path(__file__).resolve().parents[2] / "scripts" +sys.path.insert(0, str(SCRIPTS)) + +import verify_cohort as vc # noqa: E402 + + +def _write_sites_bed(path: Path, rows: list[tuple]) -> None: + with path.open("w") as fh: + for r in rows: + fh.write("\t".join(str(x) for x in r) + "\n") + + +def _write_manifest(path: Path, entries: list[dict]) -> None: + """Write a manifest TSV matching the schema build_pngs_with_igver emits.""" + header = ("#bed_row_idx\tuid\tchrom\tstart_orig\tend_orig\t" + "start_flanked\tend_flanked\tregion\tpng_path\thtml_path\thtml_table_row\n") + with path.open("w") as fh: + fh.write(header) + for e in entries: + fh.write( + f"{e['bed_row_idx']}\t{e['uid']}\t{e['chrom']}\t" + f"{e['start_orig']}\t{e['end_orig']}\t" + f"{e['start_flanked']}\t{e['end_flanked']}\t" + f"{e['region']}\t{e['png_path']}\t{e['html_path']}\t" + f"{e['html_table_row']}\n" + ) + + +def _make_png(path: Path, size_bytes: int = 50_000) -> None: + """Create a fake PNG file of the requested size (default 50 KB, above + the 10 KB threshold).""" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (size_bytes - 8)) + + +@pytest.fixture +def cohort(tmp_path): + """Two-region cohort with a valid manifest + matching PNGs.""" + bed = tmp_path / "sites.hg38.bed" + _write_sites_bed(bed, [ + ("chr1", 100, 200, "alpha"), + ("chr2", 300, 400, "beta"), + ]) + html = tmp_path / "sample.hg38.html" + html.write_text("<html/>") + + png_dir = tmp_path / "png_sample.hg38" / "png" + png1 = png_dir / "chr1-0-500.alpha.png" + png2 = png_dir / "chr2-0-700.beta.png" + _make_png(png1) + _make_png(png2) + + manifest = tmp_path / "png_sample.hg38" / "manifest.tsv" + _write_manifest(manifest, [ + {"bed_row_idx": 1, "uid": "alpha", "chrom": "chr1", + "start_orig": 100, "end_orig": 200, + 
"start_flanked": 0, "end_flanked": 500, + "region": "chr1:0-500", "png_path": str(png1.resolve()), + "html_path": str(html.resolve()), "html_table_row": 1}, + {"bed_row_idx": 2, "uid": "beta", "chrom": "chr2", + "start_orig": 300, "end_orig": 400, + "start_flanked": 0, "end_flanked": 700, + "region": "chr2:0-700", "png_path": str(png2.resolve()), + "html_path": str(html.resolve()), "html_table_row": 2}, + ]) + return {"bed": bed, "html": html, "manifest": manifest, "png_dir": png_dir, + "png1": png1, "png2": png2, "tmp": tmp_path} + + +# ----- find_png_manifest ----- + + +def test_find_png_manifest_returns_path_when_present(cohort, tmp_path): + # Manifest lives at <reports_dir>/png_<sample>.<genome>/manifest.tsv + # so we point reports_dir at tmp_path and check `sample` for genome `hg38`. + found = vc.find_png_manifest(tmp_path, "sample", "hg38") + assert found == cohort["manifest"] + + +def test_find_png_manifest_returns_none_when_absent(tmp_path): + assert vc.find_png_manifest(tmp_path, "sample", "hg38") is None + + +# ----- P1: png count matches BED ----- + + +def test_p1_pass(cohort): + c = vc.check_png_count_matches_bed("sample", cohort["manifest"], cohort["bed"]) + assert c.status == "PASS" + assert c.observed == "2" + assert c.expected == "2" + + +def test_p1_fail_when_manifest_short(cohort): + # Truncate the BED so it has 3 rows but manifest only has 2. 
+ _write_sites_bed(cohort["bed"], [ + ("chr1", 100, 200, "alpha"), + ("chr2", 300, 400, "beta"), + ("chr3", 500, 600, "gamma"), + ]) + c = vc.check_png_count_matches_bed("sample", cohort["manifest"], cohort["bed"]) + assert c.status == "FAIL" + assert c.observed == "2" + assert c.expected == "3" + + +# ----- P2: pngs exist and non-empty ----- + + +def test_p2_pass(cohort): + c = vc.check_pngs_exist_and_nonempty("sample", cohort["manifest"]) + assert c.status == "PASS" + + +def test_p2_fail_on_missing_png(cohort): + cohort["png1"].unlink() + c = vc.check_pngs_exist_and_nonempty("sample", cohort["manifest"]) + assert c.status == "FAIL" + assert "missing" in c.details + + +def test_p2_fail_on_tiny_png(cohort): + # Re-write png1 as a 2 KB file — below the 10 KB threshold. + cohort["png1"].write_bytes(b"\x00" * 2048) + c = vc.check_pngs_exist_and_nonempty("sample", cohort["manifest"]) + assert c.status == "FAIL" + assert "below threshold" in c.details + + +def test_p2_threshold_can_be_lowered(cohort): + # The lab's smallest legitimate igver PNG can be ~5 KB on a no-data + # region. Users should be able to opt down without rewriting the check. + cohort["png1"].write_bytes(b"\x00" * 6144) + c = vc.check_pngs_exist_and_nonempty("sample", cohort["manifest"], min_size_kb=5.0) + assert c.status == "PASS" + + +# ----- P3: html-row alignment ----- + + +def test_p3_pass(cohort): + c = vc.check_png_html_row_alignment("sample", cohort["manifest"], cohort["html"]) + assert c.status == "PASS" + + +def test_p3_fail_when_html_path_diverges(cohort, tmp_path): + # Pass a different HTML path than the manifest references — should fail. 
+ other_html = tmp_path / "other.hg38.html" + other_html.write_text("<html/>") + c = vc.check_png_html_row_alignment("sample", cohort["manifest"], other_html) + assert c.status == "FAIL" + assert "different HTML" in c.details + + +def test_p3_fail_when_row_indices_not_contiguous(cohort, tmp_path): + # Rewrite the manifest with non-contiguous html_table_row indices. + png1, png2 = cohort["png1"], cohort["png2"] + _write_manifest(cohort["manifest"], [ + {"bed_row_idx": 1, "uid": "alpha", "chrom": "chr1", + "start_orig": 100, "end_orig": 200, "start_flanked": 0, "end_flanked": 500, + "region": "chr1:0-500", "png_path": str(png1.resolve()), + "html_path": str(cohort["html"].resolve()), "html_table_row": 1}, + {"bed_row_idx": 2, "uid": "beta", "chrom": "chr2", + "start_orig": 300, "end_orig": 400, "start_flanked": 0, "end_flanked": 700, + "region": "chr2:0-700", "png_path": str(png2.resolve()), + "html_path": str(cohort["html"].resolve()), "html_table_row": 5}, # gap + ]) + c = vc.check_png_html_row_alignment("sample", cohort["manifest"], cohort["html"]) + assert c.status == "FAIL" + assert "contiguous" in c.details diff --git a/igv-reports/tests/unit/test_verify_report.py b/igv-reports/tests/unit/test_verify_report.py new file mode 100644 index 0000000..6ce4e43 --- /dev/null +++ b/igv-reports/tests/unit/test_verify_report.py @@ -0,0 +1,297 @@ +"""Unit tests for verify_report.py — pure-Python parser helpers. + +Author: Samuel Ahuno +Purpose: + Covers the HTML-extraction helpers and individual checks in + verify_report.py without needing a real create_report HTML on disk: + synthesized fixtures in tmp_path exercise every parser branch. 
+ +Run (from the repository root): + cd igv-reports + pytest tests/unit/ -v +""" + +from __future__ import annotations + +import base64 +import gzip +import json +import sys +from pathlib import Path + +import pytest + +SCRIPTS = Path(__file__).resolve().parents[2] / "scripts" +sys.path.insert(0, str(SCRIPTS)) +import verify_report as vr # noqa: E402 + + +# --------------------------------------------------------------------------- +# load_sites_bed +# --------------------------------------------------------------------------- + +def test_load_sites_bed_basic(tmp_path): + p = tmp_path / "sites.bed" + p.write_text( + "#chrom\tstart\tend\tname\n" + "chr2\t25246500\t25246501\tDNMT3A\n" + "chr7\t148884000\t148884001\tEZH2\n" + ) + rows = vr.load_sites_bed(p) + assert len(rows) == 2 + assert rows[0] == {"chrom": "chr2", "start": 25246500, "end": 25246501, "name": "DNMT3A"} + assert rows[1]["name"] == "EZH2" + + +def test_load_sites_bed_skips_track_line(tmp_path): + p = tmp_path / "sites.bed" + p.write_text( + 'track name=foo description="bar"\n' + "chr1\t100\t200\n" + ) + rows = vr.load_sites_bed(p) + assert len(rows) == 1 + assert rows[0]["name"] is None # 3-col bed; no name + + +def test_load_sites_bed_blank_lines_ok(tmp_path): + p = tmp_path / "sites.bed" + p.write_text( + "#header\n" + "\n" + "chr1\t100\t200\tx\n" + "\n" + ) + assert len(vr.load_sites_bed(p)) == 1 + + +def test_load_sites_bed_too_few_cols(tmp_path): + p = tmp_path / "sites.bed" + p.write_text("chr1\t100\n") + with pytest.raises(SystemExit, match="<3 columns"): + vr.load_sites_bed(p) + + +def test_load_sites_bed_non_numeric(tmp_path): + p = tmp_path / "sites.bed" + p.write_text("chr1\tNOPE\t200\n") + with pytest.raises(SystemExit, match="non-numeric"): + vr.load_sites_bed(p) + + +# --------------------------------------------------------------------------- +# parse_table_json + parse_session_dictionary (regex extraction) +# --------------------------------------------------------------------------- + +def 
_fake_html(table_json: dict, session_dict: dict) -> str: + """Build a minimal HTML whose JS literals match what create_report emits.""" + return ( + "<html><body><script>\n" + f"var tableJson = {json.dumps(table_json)};\n" + f"var sessionDictionary = {json.dumps(session_dict)};\n" + "</script></body></html>\n" + ) + + +def test_parse_table_json_extracts_dict(): + tj = {"headers": ["Chrom"], "rows": [["chr1"]]} + html = _fake_html(tj, {}) + out = vr.parse_table_json(html) + assert out == tj + + +def test_parse_session_dictionary_extracts_dict(): + sd = {"0": "data:application/gzip;base64,xxx"} + html = _fake_html({"headers": [], "rows": []}, sd) + out = vr.parse_session_dictionary(html) + assert out == sd + + +def test_parse_table_json_missing_returns_none(): + assert vr.parse_table_json("<html>nothing here</html>") is None + + +def test_parse_balanced_blob_handles_braces_in_strings(): + """The brace-balancing scanner must not be tricked by '{' inside string literals.""" + html = "tableJson = {\"name\": \"value with { brace }\", \"n\": 1};" + out = vr.parse_table_json(html) + assert out["name"] == "value with { brace }" + assert out["n"] == 1 + + +def test_parse_balanced_blob_handles_escaped_quotes(): + """Backslash-escaped quotes must not flip the in_str state prematurely.""" + html = 'tableJson = {"name": "has \\" quote", "n": 2};' + out = vr.parse_table_json(html) + assert out["name"] == 'has " quote' + assert out["n"] == 2 + + +# --------------------------------------------------------------------------- +# decode_session_entry +# --------------------------------------------------------------------------- + +def _make_data_url(payload: dict) -> str: + raw = gzip.compress(json.dumps(payload).encode()) + return "data:application/gzip;base64," + base64.b64encode(raw).decode() + + +def test_decode_session_entry_roundtrip(): + payload = {"tracks": [{"name": "t", "url": "data:..."}]} + url = _make_data_url(payload) + assert vr.decode_session_entry(url) == payload + + 
+def test_decode_session_entry_bad_prefix_returns_none(): + assert vr.decode_session_entry("http://example.com/file.bam") is None + + +def test_decode_session_entry_bad_base64_returns_none(): + """Non-fatal — corrupted entries return None so caller can SKIP gracefully.""" + assert vr.decode_session_entry("data:application/gzip;base64,!!!notb64!!!") is None + + +# --------------------------------------------------------------------------- +# expected_track_labels (covers the Path.stem rule) +# --------------------------------------------------------------------------- + +def test_expected_track_labels_from_paths(): + """Positional --tracks mode: igv-reports auto-names by Path.stem (strips + ONE final suffix). Verified against create_report 1.16.2 in the script.""" + labs = vr.expected_track_labels( + ["/x/sample.5mC.bedgraph", "/y/gencode.v47.annotation.gff3.gz", "/z/x.bam"], + track_config=None, + ) + assert labs == ["sample.5mC", "gencode.v47.annotation.gff3", "x"] + + +def test_expected_track_labels_from_track_config(tmp_path): + """--track-config mode: use the `name` field from the JSON, not the path.""" + cfg = tmp_path / "tracks.json" + cfg.write_text(json.dumps([ + {"name": "tumor", "url": "/x/tumor.bam"}, + {"name": "normal", "url": "/x/normal.bam"}, + {"url": "/x/no-name-track.bam"}, # entries without `name` are silently dropped + ])) + labs = vr.expected_track_labels([], track_config=cfg) + assert labs == ["tumor", "normal"] + + +def test_expected_track_labels_empty(): + assert vr.expected_track_labels([], None) == [] + assert vr.expected_track_labels(None, None) == [] + + +# --------------------------------------------------------------------------- +# Individual checks — drive them with synthetic inputs +# --------------------------------------------------------------------------- + +def test_check_html_exists_pass(tmp_path): + p = tmp_path / "r.html"; p.write_text("x") + c = vr.check_html_exists(p) + assert c.status == "PASS" + + +def 
test_check_html_exists_fail(tmp_path): + c = vr.check_html_exists(tmp_path / "missing.html") + assert c.status == "FAIL" + + +def test_check_html_min_size_pass_fail(tmp_path): + p = tmp_path / "r.html" + p.write_bytes(b"x" * (2 * 1024 * 1024)) # 2 MB + assert vr.check_html_min_size(p, 1.0).status == "PASS" + assert vr.check_html_min_size(p, 3.0).status == "FAIL" + + +def test_check_region_count_pass(): + bed = [{"chrom": "chr1", "start": 0, "end": 100, "name": "x"}] + tj = {"headers": ["Chrom"], "rows": [["chr1"]]} + assert vr.check_region_count(bed, tj).status == "PASS" + + +def test_check_region_count_fail_when_html_short(): + bed = [{"chrom": "chr1", "start": 0, "end": 100, "name": None}] * 3 + tj = {"headers": ["Chrom"], "rows": [["chr1"]]} + c = vr.check_region_count(bed, tj) + assert c.status == "FAIL" + assert c.observed == "1" + assert c.expected == "3" + + +def test_check_region_count_missing_table_json_fails(): + assert vr.check_region_count([{"chrom": "x", "start": 0, "end": 1, "name": None}], None).status == "FAIL" + + +def test_check_region_coords_match_with_name(): + """HTML stores 1-based start, BED is 0-based half-open.""" + bed = [{"chrom": "chr2", "start": 25246500, "end": 25246501, "name": "DNMT3A"}] + tj = { + "headers": ["Chrom", "Start", "End", "Name"], + "rows": [["chr2", 25246501, 25246501, "DNMT3A"]], + } + assert vr.check_region_coords(bed, tj).status == "PASS" + + +def test_check_region_coords_name_mismatch_fails(): + bed = [{"chrom": "chr2", "start": 100, "end": 200, "name": "EXPECTED"}] + tj = { + "headers": ["Chrom", "Start", "End", "Name"], + "rows": [["chr2", 101, 200, "DIFFERENT"]], + } + c = vr.check_region_coords(bed, tj) + assert c.status == "FAIL" + assert "name mismatch" in c.details + + +def test_check_region_coords_off_by_one_aware(): + """BED 0-based start 100 must match HTML 1-based start 101.""" + bed = [{"chrom": "chr1", "start": 100, "end": 200, "name": None}] + tj = { + "headers": ["Chrom", "Start", "End"], + "rows": 
[["chr1", 101, 200]], + } + assert vr.check_region_coords(bed, tj).status == "PASS" + + +def test_check_region_sessions_pass(): + tj = {"headers": ["Chrom"], "rows": [["chr1"], ["chr2"]]} + sd = {"0": "data:...", "1": "data:..."} + assert vr.check_region_sessions(tj, sd).status == "PASS" + + +def test_check_region_sessions_missing_key(): + tj = {"headers": ["Chrom"], "rows": [["chr1"], ["chr2"]]} + sd = {"0": "data:..."} # missing "1" + c = vr.check_region_sessions(tj, sd) + assert c.status == "FAIL" + + +def test_check_tracks_present_pass(): + """Decode the first session entry and confirm all expected labels in tracks[].name.""" + sd = {"0": _make_data_url({"tracks": [{"name": "tumor"}, {"name": "cpg"}, {"name": "rmsk"}]})} + c = vr.check_tracks_present(sd, ["tumor", "cpg"]) + assert c.status == "PASS" + + +def test_check_tracks_present_missing_track(): + sd = {"0": _make_data_url({"tracks": [{"name": "tumor"}]})} + c = vr.check_tracks_present(sd, ["tumor", "missing_track"]) + assert c.status == "FAIL" + assert "missing_track" in c.details + + +def test_check_tracks_present_skip_when_no_labels(): + sd = {"0": _make_data_url({"tracks": []})} + assert vr.check_tracks_present(sd, []).status == "SKIP" + + +def test_check_tracks_present_empty_session_fails(): + c = vr.check_tracks_present({}, ["x"]) + assert c.status == "FAIL" + + +def test_check_tracks_present_undecodable_session_fails(): + c = vr.check_tracks_present({"0": "data:application/gzip;base64,!!!"}, ["x"]) + assert c.status == "FAIL" + assert "gunzip" in c.details or "decode" in c.details