Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion beagle/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.91.8"
__version__ = "1.91.9"
1 change: 1 addition & 0 deletions runner/operator/access/heme/chip_var/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .chip_var_operator import HemeChipVarOperator
207 changes: 207 additions & 0 deletions runner/operator/access/heme/chip_var/chip_var_operator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
import os
import json
import uuid
import logging
from pathlib import Path
from jinja2 import Template
from beagle import settings
from runner.operator.operator import Operator
from runner.models import RunStatus, Port, Run
from runner.run.objects.run_creator_object import RunCreator
from file_system.models import File, FileGroup, FileType


# Module-level logger, named after this module per standard convention.
logger = logging.getLogger(__name__)

# Absolute directory of this file; used to locate the bundled jinja2 input template.
WORKDIR = os.path.dirname(os.path.abspath(__file__))


# Sample-metadata keys copied into the per-sample JSON that create_sample_json
# writes for the pipeline; keys absent from a run's output_metadata are filled
# with None. Mixes literal LIMS field names with project-configured key names.
meta_fields = [
    "igoId",
    settings.CMO_SAMPLE_TAG_METADATA_KEY,
    settings.CMO_SAMPLE_NAME_METADATA_KEY,
    settings.CMO_SAMPLE_CLASS_METADATA_KEY,
    settings.PATIENT_ID_METADATA_KEY,
    "investigatorSampleId",
    settings.ONCOTREE_METADATA_KEY,
    "tumorOrNormal",
    "tissueLocation",
    settings.SAMPLE_CLASS_METADATA_KEY,
    "sampleOrigin",
    "preservation",
    "collectionYear",
    "sex",
    "species",
    "tubeId",
    "cfDNA2dBarcode",
    "baitSet",
    "qcReports",
    "barcodeId",
    "barcodeIndex",
    settings.LIBRARY_ID_METADATA_KEY,
    "libraryVolume",
    "libraryConcentrationNgul",
    "dnaInputNg",
    "captureConcentrationNm",
    "captureInputNg",
    "captureName",
]


class HemeChipVarOperator(Operator):
    """
    Operator for the Heme chipvar workflow:

    https://github.com/msk-access/chip-var/blob/develop/chip-var.cwl

    This Operator will search for Nucleo Bam files based on an IGO Request ID
    """

    def get_jobs(self):
        """
        Build one RunCreator per Nucleo sample found for this request.

        Returns:
            list[RunCreator]: one chip-var job per (inputs, output_metadata) pair.
        """
        sample_inputs = self.get_nucleo_outputs()

        return [
            RunCreator(
                **{
                    "name": "Heme ChipVar: %s, %i of %i" % (self.request_id, i + 1, len(sample_inputs)),
                    "app": self.get_pipeline_id(),
                    "inputs": job,
                    "output_metadata": output_metadata,
                    "tags": {settings.REQUEST_ID_METADATA_KEY: self.request_id, "cmoSampleId": job["sample_name"]},
                }
            )
            for i, (job, output_metadata) in enumerate(sample_inputs)
        ]

    def get_nucleo_outputs(self):
        """
        Locate the Nucleo runs to chain from and build chip-var inputs for each.

        Returns:
            list[tuple]: (sample_inputs_dict, output_metadata) per Nucleo run.

        Raises:
            Exception: when no matching completed Nucleo runs exist for the request.
        """
        # Was the pipeline triggered from a chaining operator (run_ids set) or
        # executed via the API with an IGO request id?
        if not self.request_id:
            most_recent_runs_for_request = Run.objects.filter(pk__in=self.run_ids)
            self.request_id = most_recent_runs_for_request[0].tags["igoRequestId"]
        else:
            # Use the most recent Nucleo run that completed successfully, then
            # expand to all runs in its operator_run.
            latest_completed_run = (
                Run.objects.filter(
                    app__name="heme nucleo",
                    tags__igoRequestId=self.request_id,
                    status=RunStatus.COMPLETED,
                    operator_run__status=RunStatus.COMPLETED,
                )
                .order_by("-created_date")
                .first()
            )
            # BUGFIX: .first() returns None when nothing matches; the original
            # chained .operator_run off it and raised AttributeError instead of
            # the intended, descriptive exception below.
            if latest_completed_run is None:
                raise Exception("No matching Nucleo runs found for request {}".format(self.request_id))
            most_recent_runs_for_request = latest_completed_run.operator_run.runs.all()
        if not len(most_recent_runs_for_request):
            raise Exception("No matching Nucleo runs found for request {}".format(self.request_id))
        inputs = []
        for r in most_recent_runs_for_request:
            inp = self.construct_sample_inputs(r)
            output_metadata = r.output_metadata
            inputs.append((inp, output_metadata))
        return inputs

    def parse_nucleo_output_ports(self, run, port_name):
        """
        Return a CWL File object for the bam on the named output port of `run`,
        with the bai (when present) attached as a secondaryFile.

        Raises:
            Exception: when the port does not hold exactly 1 bam or a (bam, bai) pair.
        """
        bam_bai = Port.objects.get(name=port_name, run=run.pk)
        # Evaluate the queryset once instead of re-querying the DB for each check
        port_files = list(bam_bai.files.all())
        if len(port_files) not in (1, 2):
            raise Exception("Port {} for run {} should have just 1 bam or 1 (bam/bai) pair".format(port_name, run.id))

        bams = [b for b in port_files if b.file_name.endswith(".bam")]
        bais = [b for b in port_files if b.file_name.endswith(".bai")]
        bam = self.create_cwl_file_object(bams[0].path)
        if bais:
            bam["secondaryFiles"] = [self.create_cwl_file_object(bais[0].path)]

        return bam

    def construct_sample_inputs(self, run):
        """
        Render the bundled jinja2 template into the chip-var inputs dict for a
        single Nucleo run.

        Returns:
            dict: parsed JSON inputs ready to hand to RunCreator.
        """
        with open(os.path.join(WORKDIR, "input_template.json.jinja2")) as file:
            template = Template(file.read())

        nucleo_output_port_names = [
            "fgbio_filter_consensus_reads_duplex_bam",
        ]
        qc_input_port_names = [
            "input_bam_case",
        ]
        bams = {}
        for o, i in zip(nucleo_output_port_names, qc_input_port_names):
            # We are running a multi-sample workflow on just one sample,
            # so we create single-element lists here
            bam = self.parse_nucleo_output_ports(run, o)
            # json.dumps so the dict is valid JSON once rendered into the template
            bams[i] = json.dumps(bam)
        sample_name = run.output_metadata[settings.CMO_SAMPLE_NAME_METADATA_KEY]
        # Per-sample output file names expected by the chip-var CWL inputs
        concat_output_name = sample_name + "_concat.vcf.gz"
        vardict_output_vcf_name = sample_name + "_vardict.vcf"
        snpsift_countOpName = sample_name + "_snpsift_cosmic.vcf"
        snpsift_prevalOpName = sample_name + "_snpsift_preval.vcf"
        opOncoKbMafName = sample_name + "_oncoKB.maf"
        output_mappability_filename = sample_name + "_mappability.maf"
        output_complexity_filename = sample_name + "_complexity.maf"
        output_vcf2mafName = sample_name + "_vcf2maf.maf"
        output_panmyeloid_maf = sample_name + "_panmyeloid.maf"
        output_47kchpd_maf = sample_name + "_47kchpd.maf"
        output_hotspot_maf = sample_name + "_hotspot.maf"
        output_filter_maf = sample_name + "_filter.maf"
        output_tag_maf = sample_name + "_tag.maf"
        samples_json_content = self.create_sample_json(run)

        input_file = template.render(
            sample_name=json.dumps(sample_name),
            concat_output_name=json.dumps(concat_output_name),
            vardict_output_vcf_name=json.dumps(vardict_output_vcf_name),
            snpsift_countOpName=json.dumps(snpsift_countOpName),
            snpsift_prevalOpName=json.dumps(snpsift_prevalOpName),
            opOncoKbMafName=json.dumps(opOncoKbMafName),
            output_mappability_filename=json.dumps(output_mappability_filename),
            output_complexity_filename=json.dumps(output_complexity_filename),
            output_vcf2mafName=json.dumps(output_vcf2mafName),
            samples_json_content=json.dumps(samples_json_content),
            output_maf_name_panmyeloid=json.dumps(output_panmyeloid_maf),
            output_47kchpd_maf_name=json.dumps(output_47kchpd_maf),
            output_hotspot_maf_name=json.dumps(output_hotspot_maf),
            output_maf_name_filter=json.dumps(output_filter_maf),
            output_maf_name_tag=json.dumps(output_tag_maf),
            **bams,
        )
        sample_input = json.loads(input_file)
        return sample_input

    @staticmethod
    def create_cwl_file_object(file_path):
        """Wrap an absolute path as a CWL File object on the juno filesystem."""
        return {"class": "File", "location": "juno://" + file_path}

    def create_sample_json(self, run):
        """
        Write a single-sample metadata JSON to shared tmp storage, register it
        as a File in the "temp" file group, and return it as a CWL File object.
        """
        # BUGFIX: shallow-copy so we don't mutate the Run's in-memory output_metadata
        j = dict(run.output_metadata)
        # todo: cmoSampleName in output_metadata for Nucleo appears to be the igo ID?
        j[settings.CMO_SAMPLE_TAG_METADATA_KEY] = run.output_metadata[settings.CMO_SAMPLE_NAME_METADATA_KEY]

        for f in meta_fields:
            # Use None for missing fields
            if f not in j:
                j[f] = None
        for f in j:
            # MultiQC cannot handle cells with ","
            if isinstance(j[f], str) and "," in j[f]:
                j[f] = j[f].replace(",", ";")
        # Use some double quotes to make JSON compatible
        j["qcReports"] = []
        out = json.dumps([j])

        tmpdir = os.path.join(settings.BEAGLE_SHARED_TMPDIR, str(uuid.uuid4()))
        Path(tmpdir).mkdir(parents=True, exist_ok=True)
        output = os.path.join(tmpdir, "samples_json.json")

        with open(output, "w+") as fh:
            fh.write(out)

        # World-readable/writable so downstream pipeline processes can access it
        os.chmod(output, 0o777)

        fname = os.path.basename(output)
        temp_file_group = FileGroup.objects.get(slug="temp")
        file_type = FileType.objects.get(name="unknown")

        f = File(file_name=fname, path=output, file_type=file_type, file_group=temp_file_group)
        f.save()

        return self.create_cwl_file_object(f.path)
74 changes: 74 additions & 0 deletions runner/operator/access/heme/chip_var/input_template.json.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{
"input_bam_case": {{ input_bam_case }},
"output_vcf2mafName": {{ output_vcf2mafName }},
"output_mappability_filename": {{ output_mappability_filename }},
"output_complexity_filename": {{ output_complexity_filename }},
"concat_output_name": {{ concat_output_name }},
"sample_name": {{ sample_name }},
"vardict_output_vcf_name": {{ vardict_output_vcf_name }},
"snpsift_countOpName": {{ snpsift_countOpName }},
"snpsift_prevalOpName": {{ snpsift_prevalOpName }},
"opOncoKbMafName": {{ opOncoKbMafName }},
"output_maf_name_panmyeloid": {{ output_maf_name_panmyeloid }},
"output_47kchpd_maf_name": {{ output_47kchpd_maf_name }},
"output_hotspot_maf_name": {{ output_hotspot_maf_name }},
"output_maf_name_filter": {{ output_maf_name_filter }},
"output_maf_name_tag": {{ output_maf_name_tag }},
"bedfile": {
"class": "File",
"location": "juno:///juno/work/access/production/resources/tools/resources/access-heme/region_of_interest/versions/v1.0/MSK-Heme-v1_0_canonicaltargets_MEN1.bed"
},
"input_cosmicCountDB_vcf": {
"class": "File",
"location": "juno:///work/cch/production/resources/cosmic/current/v96/CosmicCodingMuts.vcf.gz",
"secondaryFiles": [
{ "class": "File", "location": "juno:///work/cch/production/resources/cosmic/current/v96/CosmicCodingMuts.vcf.gz.tbi" }
]
},
"input_cosmicprevalenceDB_vcf": {
"class": "File",
"location": "juno:///work/cch/production/resources/cosmic/current/v96/CosmicCodingMuts_GRCh37_processed.vcf.gz",
"secondaryFiles": [
{ "class": "File", "location": "juno:///work/cch/production/resources/cosmic/current/v96/CosmicCodingMuts_GRCh37_processed.vcf.gz.tbi" }
]
},
"reference_fasta": {
"class": "File",
"location": "juno:///work/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta",
"secondaryFiles": [
{"class": "File", "location": "juno:///work/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta.fai"}
]
},
"oncoKbApiToken": {
"class": "File",
"location": "juno:///work/cch/production/resources/oncoKB/current/apiToken.txt"
},
"input_mappability_bed": {
"class": "File",
"location": "juno:///work/cch/production/resources/beds/current/wgEncodeDacMapabilityConsensusExcludable_4cols.bed"
},
"input_complexity_bed": {
"class": "File",
"location": "juno:///work/cch/production/resources/beds/current/rmsk_mod.bed"
},
"input_47kchpd_tsv_file": {
"class": "File",
"location": "juno:///work/cch/production/resources/ch_47k_impact/versions/v1/chpd47k_prevalence.tsv"
},
"input_panmeloid_tsv_file": {
"class": "File",
"location": "juno:///work/cch/production/resources/pan_myeloid/versions/v1/pan_myeloid_variant_count.tsv"
},
"input_hotspot_tsv_file": {
"class": "File",
"location": "juno:///work/cch/production/resources/ch_47k_impact/versions/v1/hotspots_47kchpd.tsv"
},
"retain_info": "CNT,TUMOR_TYPE",
"vardict_allele_frequency_threshold": 0,
"oncoKbAnnotateHotspots": true,
"column_name_mappability": "mappability",
"column_name_complexity": "complexity",
"output_column_name_panmyeloid": "panmyeloid",
"output_column_name_47kchpd": "47kchpd",
"output_column_name_hotpsot": "hotspot"
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
"library": "{{ library_id }}",
"platform-model": "novaseq",
"platform-unit": "{{ barcode_id }}",

"UBG_abra2_targets": {
"class": "File",
"location": "juno:///juno/work/access/production/resources/msk-access/v2.0/regions_of_interest/current/MSK-ACCESS-v2_targetsAllwFP_500buffer.bed"
},
"BC_picard_fixmate_information_output_file_name": "{{ sample_id }}_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX.bam",
"UBG_abra2_output_bams": "{{ sample_id }}_uncollapsed_IR.bam",
"UBG_bwa_mem_output": "{{ sample_id }}_uncollapsed.sam",
Expand Down
10 changes: 5 additions & 5 deletions runner/operator/access/v1_0_0/snps_and_indels/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
ACCESS_DEFAULT_NORMAL_ID = "DONOR22-TP"
ACCESS_DEFAULT_NORMAL_FILENAME = "DONOR22-TP_cl_aln_srt_MD_IR_FX_BR__aln_srt_IR_FX-duplex.bam"
NORMAL_SAMPLE_SEARCH = "-N0"
TUMOR_SAMPLE_SEARCH = "-L0"
TUMOR_SAMPLE_SEARCH = "(-L0|-T0)"
DUPLEX_BAM_SEARCH = "__aln_srt_IR_FX-duplex.bam"
SIMPLEX_BAM_SEARCH = "__aln_srt_IR_FX-simplex.bam"
UNFILTERED_BAM_SEARCH = "__aln_srt_IR_FX.bam"
Expand Down Expand Up @@ -110,7 +110,7 @@ def make_pairs(d, s):

self.fillout_duplex_tumors = (
File.objects.filter(
file_name__contains=TUMOR_SAMPLE_SEARCH,
file_name__regex=TUMOR_SAMPLE_SEARCH,
file_name__endswith=DUPLEX_BAM_SEARCH,
port__run__tags__igoRequestId=self.request_id,
)
Expand All @@ -120,7 +120,7 @@ def make_pairs(d, s):

self.fillout_simplex_tumors = (
File.objects.filter(
file_name__contains=TUMOR_SAMPLE_SEARCH,
file_name__regex=TUMOR_SAMPLE_SEARCH,
file_name__endswith=SIMPLEX_BAM_SEARCH,
port__run__tags__igoRequestId=self.request_id,
)
Expand Down Expand Up @@ -337,8 +337,8 @@ def get_geno_samples(self, tumor_sample_id, tumor_duplex_bam, tumor_simplex_bam,
# Add patient matched Tumors samples
patient_id = "-".join(tumor_sample_id.split("-")[0:2])
matched_tumor_search = patient_id + TUMOR_SAMPLE_SEARCH
duplex_matched_q = Q(file_name__endswith=DUPLEX_BAM_SEARCH) & Q(file_name__startswith=matched_tumor_search)
simplex_matched_q = Q(file_name__endswith=SIMPLEX_BAM_SEARCH) & Q(file_name__startswith=matched_tumor_search)
duplex_matched_q = Q(file_name__endswith=DUPLEX_BAM_SEARCH) & Q(file_name__regex=matched_tumor_search)
simplex_matched_q = Q(file_name__endswith=SIMPLEX_BAM_SEARCH) & Q(file_name__regex=matched_tumor_search)

duplex_matched_samples = (
File.objects.filter(duplex_matched_q)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
ACCESS_CURATED_BAMS_FILE_GROUP_SLUG = "accessv2_curated_normals"
ACCESS_DEFAULT_NORMAL_ID = "Donor19F21c2206-TP01_ACCESSv2-VAL-20230004R"
NORMAL_SAMPLE_SEARCH = "-N0"
TUMOR_SAMPLE_SEARCH = "-L0"
TUMOR_SAMPLE_SEARCH = "(-L0|-T0)"
DUPLEX_BAM_SEARCH = "__aln_srt_IR_FX-duplex.bam"
SIMPLEX_BAM_SEARCH = "__aln_srt_IR_FX-simplex.bam"
UNFILTERED_BAM_SEARCH = "__aln_srt_IR_FX.bam"
Expand Down Expand Up @@ -413,8 +413,8 @@ def get_geno_samples(tumor_sample_id, matched_normal_id, fillout_simplex_tumors,
# Add patient matched Tumors samples
patient_id = "-".join(tumor_sample_id.split("-")[0:2])
matched_tumor_search = patient_id + TUMOR_SAMPLE_SEARCH
duplex_matched_q = Q(file_name__endswith=DUPLEX_BAM_SEARCH) & Q(file_name__startswith=matched_tumor_search)
simplex_matched_q = Q(file_name__endswith=SIMPLEX_BAM_SEARCH) & Q(file_name__startswith=matched_tumor_search)
duplex_matched_q = Q(file_name__endswith=DUPLEX_BAM_SEARCH) & Q(file_name__regex=matched_tumor_search)
simplex_matched_q = Q(file_name__endswith=SIMPLEX_BAM_SEARCH) & Q(file_name__regex=matched_tumor_search)

duplex_matched_samples = (
File.objects.filter(duplex_matched_q)
Expand Down
Loading