diff --git a/subworkflows/nf-core/bam_impute_quilt2/main.nf b/subworkflows/nf-core/bam_impute_quilt2/main.nf new file mode 100644 index 000000000000..78a1aad1f672 --- /dev/null +++ b/subworkflows/nf-core/bam_impute_quilt2/main.nf @@ -0,0 +1,76 @@ +include { QUILT_QUILT2 } from '../../../modules/nf-core/quilt/quilt2/main' +include { GLIMPSE2_LIGATE } from '../../../modules/nf-core/glimpse2/ligate/main' +include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main' + +workflow BAM_IMPUTE_QUILT2 { + take: + ch_input // channel (mandatory): [ meta, bam/cram, bai/crai, bampaths, bamnames ] + ch_reference_panel // channel (mandatory): [ meta, reference_vcf, reference_index ] + ch_chunks // channel (optional): [ meta, chr, start, end ] + ch_map // channel (optional): [ meta, genetic_map ] + ch_fasta // channel (optional): [ meta, fasta, fai ] + n_gen // integer: Number of generations since founding or mixing + buffer // integer: Buffer of region to perform imputation over + + main: + + + ch_parameters = ch_reference_panel + .combine(ch_map, by: 0) + .combine(ch_chunks, by: 0) + + ch_parameters.ifEmpty { + error("ERROR: join operation resulted in an empty channel. Please provide a valid ch_chunks and ch_map channel as input.") + } + + ch_bam_params = ch_input + .combine(ch_parameters) + .map { meta_input, bam, bai, bampath, bamname, meta_panel, reference_vcf, reference_index, genetic_map, chr, start, end -> + def regionout = "${chr}" + if (start != [] && end != []) { + regionout = "${chr}:${start}-${end}" + } + + [ + meta_panel + meta_input + [regionout: regionout], + bam, + bai, + bampath, + bamname, + reference_vcf, + reference_index, + [], + [], + [], + chr, + start, + end, + n_gen, + buffer, + genetic_map, + ] + } + + QUILT_QUILT2(ch_bam_params, ch_fasta) + + ligate_input = QUILT_QUILT2.out.vcf + .join(QUILT_QUILT2.out.tbi) + .map { meta, vcf, index -> + def keys_to_keep = meta.keySet() - ['regionout'] + [meta.subMap(keys_to_keep), vcf, index] + } + .groupTuple() + + GLIMPSE2_LIGATE(ligate_input) + + BCFTOOLS_INDEX(GLIMPSE2_LIGATE.out.merged_variants) + + ch_vcf_index = GLIMPSE2_LIGATE.out.merged_variants.join( + BCFTOOLS_INDEX.out.tbi.mix(BCFTOOLS_INDEX.out.csi), + failOnMismatch: true, + failOnDuplicate: true, + ) + + emit: + vcf_index = ch_vcf_index // channel: [ meta, vcf, tbi/csi ] +} diff --git a/subworkflows/nf-core/bam_impute_quilt2/meta.yml b/subworkflows/nf-core/bam_impute_quilt2/meta.yml new file mode 100644 index 000000000000..90b060587921 --- /dev/null +++ b/subworkflows/nf-core/bam_impute_quilt2/meta.yml @@ -0,0 +1,128 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "bam_impute_quilt2" +description: | + Impute low-coverage BAM or CRAM inputs with QUILT2 and ligate chunked outputs per chromosome. 'regionout' key will be used to store temporarily the region and therefore shouldn't be used in the meta maps. +keywords: + - bam + - cram + - imputation + - quilt + - quilt2 + - vcf +components: + - quilt/quilt2 + - glimpse2/ligate + - bcftools/index +input: + - ch_input: + description: | + Channel with target sequencing data and optional rename files. + structure: + - meta: + type: map + description: Metadata map for the input batch. + - bam_or_cram: + type: file + description: Input BAM or CRAM file, or a list of BAM or CRAM files. + pattern: "*.{bam,cram}" + - index: + type: file + description: Index for the BAM or CRAM input. + pattern: "*.{bai,crai}" + - bampaths: + type: file + description: | + Optional text file listing BAM or CRAM paths to impute. + One file per line. + pattern: "*.{txt,tsv}" + - bamnames: + type: file + description: | + Optional text file listing sample names in the same order as `bampaths`. + One file per line. + pattern: "*.{txt,tsv}" + - ch_reference_panel: + description: | + Channel with phased reference panel VCFs per chromosome. + structure: + - meta: + type: map + description: | + Metadata map that will be combined with the input metadata. + Must include `id` for the panel name and `chr` for the chromosome. + - vcf: + type: file + description: Reference panel VCF or BCF. + pattern: "*.{vcf,vcf.gz,bcf}" + - index: + type: file + description: Index for the reference panel VCF or BCF. + pattern: "*.{tbi,csi}" + - ch_chunks: + description: | + Channel with optional imputation chunks per chromosome. + structure: + - meta: + type: map + description: Metadata map with the panel id and chromosome. + - chr: + type: string + description: Chromosome name. + - start: + type: integer + description: Start position of the chunk. + - end: + type: integer + description: End position of the chunk. + - ch_map: + description: | + Channel with optional genetic maps per chromosome. + structure: + - meta: + type: map + description: Metadata map with the panel id and chromosome. + - map: + type: file + description: Genetic map used by QUILT2. + pattern: "*.{txt,map}{,gz}" + - ch_fasta: + description: | + Channel with optional reference FASTA data, required for CRAM inputs. + structure: + - meta: + type: map + description: Metadata map for the reference genome. + - fasta: + type: file + description: Reference genome FASTA. + pattern: "*.{fa,fasta}" + - fai: + type: file + description: FASTA index file. + pattern: "*.fai" + - n_gen: + type: integer + description: Number of generations since founding or mixing. + - buffer: + type: integer + description: Buffer of region to perform imputation over. +output: + - vcf_index: + description: | + Imputed and indexed VCFs after ligating chunked outputs per chromosome. + structure: + - meta: + type: map + description: Metadata map combined from the input batch and panel metadata. + - vcf: + type: file + description: Imputed VCF file. + pattern: "*.{vcf,vcf.gz,bcf,bcf.gz}" + - index: + type: file + description: Index for the imputed VCF or BCF file. + pattern: "*.{tbi,csi}" +authors: + - "@atrigila" +maintainers: + - "@atrigila" diff --git a/subworkflows/nf-core/bam_impute_quilt2/tests/main.nf.test b/subworkflows/nf-core/bam_impute_quilt2/tests/main.nf.test new file mode 100644 index 000000000000..3354cf046bce --- /dev/null +++ b/subworkflows/nf-core/bam_impute_quilt2/tests/main.nf.test @@ -0,0 +1,206 @@ +nextflow_workflow { + + name "Test Subworkflow BAM_IMPUTE_QUILT2" + script "../main.nf" + config "./nextflow.config" + + workflow "BAM_IMPUTE_QUILT2" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/bam_impute_quilt2" + + tag "quilt" + tag "quilt/quilt2" + tag "bcftools" + tag "bcftools/index" + tag "glimpse2" + tag "glimpse2/ligate" + + test("Impute with quilt2 one individual, one region, map and fasta") { + when { + params { + quilt_args = "--save_prepared_reference=TRUE --seed=1" + } + workflow { + """ + bampath = channel.of("NA12878.chr21_22.1X.bam").collectFile(name: "bampath.txt", newLine: true) + input[0] = channel.of([ + [id: "allid"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.1X.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.1X.bam.bai", checkIfExists: true) + ]) + .combine(bampath) + .combine(channel.of([[]])) + input[1] = channel.of([ + [id: "1000GP", chr: "chr22"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz.csi", checkIfExists: true) + ]) + input[2] = channel.of( + [[id: "1000GP", chr: "chr22"], "chr22", "16570000", "16610000"] + ) + input[3] = channel.of([ + [id: "1000GP", chr: "chr22"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genetic_map/genome.GRCh38.chr22.stitch.map", checkIfExists: true) + ]) + input[4] = channel.of([ + [id: "GRCh38"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genome.fasta", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genome.fasta.fai", checkIfExists: true) + ]).collect() + input[5] = 100 + input[6] = 10000 + """ + } + } + then { + assert workflow.success + assert snapshot( + workflow.out.vcf_index.collect { meta, vcf, index -> [ + path(vcf).getFileName().toString(), + path(index).getFileName().toString(), + path(vcf).vcf.summary, + path(vcf).vcf.header.getGenotypeSamples().sort(), + path(vcf).vcf.variantsMD5 + ]}, + ).match() + } + } + + test("Impute with quilt2 one individual, one region, map, fasta and bamnames") { + when { + params { + quilt_args = "--seed=1" + } + workflow { + """ + bampath = channel.of("NA12878.chr21_22.1X.bam").collectFile(name: "bampath.txt", newLine: true) + bamname = channel.of("MySample1").collectFile(name: "bamname.txt", newLine: true) + input[0] = channel.of([ + [id: "allid"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.1X.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.1X.bam.bai", checkIfExists: true) + ]) + .combine(bampath) + .combine(bamname) + input[1] = channel.of([ + [id: "1000GP", chr: "chr22"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz.csi", checkIfExists: true) + ]) + input[2] = channel.of( + [[id: "1000GP", chr: "chr22"], "chr22", "16570000", "16610000"] + ) + input[3] = channel.of([ + [id: "1000GP", chr: "chr22"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genetic_map/genome.GRCh38.chr22.stitch.map", checkIfExists: true) + ]) + input[4] = channel.of([ + [id: "GRCh38"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genome.fasta", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genome.fasta.fai", checkIfExists: true) + ]).collect() + input[5] = 100 + input[6] = 10000 + """ + } + } + then { + assert workflow.success + assert snapshot( + workflow.out.vcf_index.collect { meta, vcf, index -> [ + path(vcf).getFileName().toString(), + path(index).getFileName().toString(), + path(vcf).vcf.summary, + path(vcf).vcf.header.getGenotypeSamples().sort(), + path(vcf).vcf.variantsMD5 + ]}, + ).match() + } + } + + test("homo_sapiens - empty channels - stub") { + options "-stub" + when { + params { + quilt_args = "--seed=1" + } + workflow { + """ + bampath = channel.of("NA12878.chr21_22.1X.bam").collectFile(name: "bampath.txt", newLine: true) + bamname = channel.of("MySample1").collectFile(name: "bamname.txt", newLine: true) + ch_samples = channel.of([ + [id: "allid"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.1X.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.1X.bam.bai", checkIfExists: true) + ]) + input[0] = ch_samples.combine(bampath).combine(bamname) + input[1] = channel.of( + [[id: "1000GP", chr: "chr22"], [], []], + [[id: "1000GP", chr: "chr21"], [], []] + ) + input[2] = channel.of( + [[id: "1000GP", chr: "chr22"], "chr22", 16570065, 16592216], + [[id: "1000GP", chr: "chr22"], "chr22", 16592229, 16609999], + [[id: "1000GP", chr: "chr21"], "chr21", 16570065, 16592216], + [[id: "1000GP", chr: "chr21"], "chr21", 16592229, 16609999] + ) + input[3] = channel.of( + [[id: "1000GP", chr: "chr22"], []], + [[id: "1000GP", chr: "chr21"], []] + ) + input[4] = channel.of([[id: "GRCh38"], [], []]).collect() + input[5] = 100 + input[6] = 10000 + """ + } + } + then { + assert workflow.success + assert snapshot(sanitizeOutput(workflow.out)).match() + } + } + + test("homo_sapiens - error empty join - stub") { + options "-stub" + when { + params { + quilt_args = "--seed=1" + } + workflow { + """ + bampath = channel.of("NA12878.chr21_22.1X.bam").collectFile(name: "bampath.txt", newLine: true) + bamname = channel.of("MySample1").collectFile(name: "bamname.txt", newLine: true) + ch_samples = channel.of([ + [id: "allid"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.1X.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.1X.bam.bai", checkIfExists: true) + ]) + input[0] = ch_samples.combine(bampath).combine(bamname) + input[1] = channel.of( + [[id: "otherpanel", chr: "chr22"], [], []], + [[id: "1000GP", chr: "chr21"], [], []] + ) + input[2] = channel.of( + [[id: "1000GP", chr: "chr22"], "chr22", 16570065, 16592216], + [[id: "1000GP", chr: "chr22"], "chr22", 16592229, 16609999], + [[id: "1000GP", chr: "chr21"], "chr21", 16570065, 16592216], + [[id: "1000GP", chr: "chr21"], "chr21", 16592229, 16609999] + ) + input[3] = channel.of( + [[id: "1000GP", chr: "chr22"], []], + [[id: "otherpanel", chr: "chr21"], []] + ) + input[4] = channel.of([[id: "GRCh38"], [], []]).collect() + input[5] = 100 + input[6] = 10000 + """ + } + } + then { + assert workflow.failed + assert workflow.errorMessage.contains("ERROR: join operation resulted in an empty channel. Please provide a valid ch_chunks and ch_map channel as input.") + } + } +} diff --git a/subworkflows/nf-core/bam_impute_quilt2/tests/main.nf.test.snap b/subworkflows/nf-core/bam_impute_quilt2/tests/main.nf.test.snap new file mode 100644 index 000000000000..d53730c9e804 --- /dev/null +++ b/subworkflows/nf-core/bam_impute_quilt2/tests/main.nf.test.snap @@ -0,0 +1,71 @@ +{ + "homo_sapiens - empty channels - stub": { + "content": [ + { + "vcf_index": [ + [ + { + "id": "allid", + "chr": "chr21" + }, + "allid_chr21.ligate.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "allid_chr21.ligate.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "allid", + "chr": "chr22" + }, + "allid_chr22.ligate.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "allid_chr22.ligate.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "timestamp": "2026-05-09T16:40:01.487040417", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + }, + "Impute with quilt2 one individual, one region, map, fasta and bamnames": { + "content": [ + [ + [ + "allid_chr22.ligate.vcf.gz", + "allid_chr22.ligate.vcf.gz.tbi", + "VcfFile [chromosomes=[chr22], sampleCount=1, variantCount=903, phased=true, phasedAutodetect=false]", + [ + "MySample1" + ], + "de865f8128d121d8ae4b5e4c3aa662e2" + ] + ] + ], + "timestamp": "2026-05-09T16:39:50.933749879", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + }, + "Impute with quilt2 one individual, one region, map and fasta": { + "content": [ + [ + [ + "allid_chr22.ligate.vcf.gz", + "allid_chr22.ligate.vcf.gz.tbi", + "VcfFile [chromosomes=[chr22], sampleCount=1, variantCount=903, phased=true, phasedAutodetect=false]", + [ + "NA12878" + ], + "de865f8128d121d8ae4b5e4c3aa662e2" + ] + ] + ], + "timestamp": "2026-05-09T16:39:30.814000823", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + } +} diff --git a/subworkflows/nf-core/bam_impute_quilt2/tests/nextflow.config b/subworkflows/nf-core/bam_impute_quilt2/tests/nextflow.config new file mode 100644 index 000000000000..c9280911b4a0 --- /dev/null +++ b/subworkflows/nf-core/bam_impute_quilt2/tests/nextflow.config @@ -0,0 +1,15 @@ +process { + withName: QUILT_QUILT2 { + cpus = 1 + ext.args = { "${params.quilt_args}" } + ext.prefix = { "${meta.id}_${meta.chr}_${meta.regionout}.quilt2" } + } + + withName: GLIMPSE2_LIGATE { + ext.prefix = { "${meta.id}_${meta.chr}.ligate" } + } + + withName: BCFTOOLS_INDEX { + ext.args = "--tbi" + } +}