From 1a4cc7f6a350cf79f638f51c9eec40f917ff1918 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Tue, 23 Jun 2026 21:07:30 -0700 Subject: [PATCH] add back arguments and remove separate ensemble_twas cell --- .../mnm_methods/mnm_regression.ipynb | 153 ++++++------------ .../script/pecotmr_integration/fine_mapping.R | 56 ++++++- .../qtl_dataset_construct.R | 32 +++- .../script/pecotmr_integration/twas_weights.R | 88 ++++++++-- 4 files changed, 214 insertions(+), 115 deletions(-) diff --git a/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb b/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb index 29394d6e..70bb2808 100644 --- a/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb +++ b/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb @@ -660,8 +660,16 @@ "# Optional genotype-derived covariates applied uniformly across contexts;\n", "# per-context covariates come from the manifest cov_path column.\n", "parameter: genotype_covariates = path('.')\n", + "# Construct-time variant/sample QC (stored on the QtlDataset; cutoff 0 = off).\n", "parameter: maf_cutoff = 0.0\n", "parameter: xvar_cutoff = 0.0\n", + "parameter: mac_cutoff = 0.0\n", + "parameter: imiss_cutoff = 0.0\n", + "# Drop indel variants (default keeps them, matching QtlDataset()).\n", + "parameter: drop_indel = False\n", + "# Optional whitespace-delimited ID files to restrict samples / variants (\".\" = none).\n", + "parameter: keep_samples = \".\"\n", + "parameter: keep_variants = \".\"\n", "output: f\"{cwd:a}/qtl_dataset/{study}.qtl_dataset.rds\"\n", "task: trunk_workers = 1, trunk_size = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f\"{step_name}_{_output:bn}\"\n", "bash: expand = '${ }', stderr = f\"{_output}.stderr\", stdout = f\"{_output}.stdout\", container = container\n", @@ -673,6 +681,11 @@ " ${'--transpose-covariates' if transpose_covariates else ''} \\\n", " --maf-cutoff ${maf_cutoff} \\\n", " --xvar-cutoff ${xvar_cutoff} \\\n", + " --mac-cutoff ${mac_cutoff} \\\n", + " --imiss-cutoff ${imiss_cutoff} \\\n", + " ${'--drop-indel' if drop_indel else ''} \\\n", + " ${('--keep-samples ' + str(keep_samples)) if keep_samples != '.' else ''} \\\n", + " ${('--keep-variants ' + str(keep_variants)) if keep_variants != '.' else ''} \\\n", " --output ${_output}" ] }, @@ -701,12 +714,43 @@ "parameter: fine_mapping_coverage = 0.95\n", "# Comma-separated context names to restrict both passes to; empty = all contexts.\n", "parameter: contexts = \"\"\n", + "# Fine-mapping SER pre-screen (0 = off, <0 = adaptive 3/nVariants).\n", + "parameter: pip_cutoff_to_skip = 0.0\n", + "# TWAS weight-learning knobs.\n", + "parameter: min_twas_maf = 0.01\n", + "parameter: min_twas_xvar = 0.01\n", + "parameter: max_cv_variants = 5000\n", + "parameter: cv_folds = 5\n", + "parameter: cv_threads = 1\n", + "# Reproducibility: integer RNG seed; -1 = unset.\n", + "parameter: seed = -1\n", + "# Opt-in per-analysis overrides of the QtlDataset's construct-time filters,\n", + "# applied to both passes. Sentinels: a negative cutoff / drop_indel False /\n", + "# \".\" path = leave the dataset's stored value untouched.\n", + "parameter: maf_cutoff = -1.0\n", + "parameter: mac_cutoff = -1.0\n", + "parameter: xvar_cutoff = -1.0\n", + "parameter: imiss_cutoff = -1.0\n", + "parameter: drop_indel = False\n", + "parameter: keep_samples = \".\"\n", + "parameter: keep_variants = \".\"\n", "# Optional JSON of per-method kwargs forwarded to the pipelines, e.g.\n", "# '{\"susie\":{\"L\":10}}' for fine-mapping or '{\"lasso\":{\"nfolds\":10}}' for TWAS.\n", "parameter: fine_mapping_method_args = \"\"\n", "parameter: twas_method_args = \"\"\n", "fail_if(len(region_name) == 0, \"susie_twas: pass --region-name [ ...] to choose which gene(s) to analyze.\")\n", "qtl_dataset = path(f\"{cwd:a}/qtl_dataset/{name}.qtl_dataset.rds\")\n", + "# Assemble the shared opt-in worker flags once (command construction only).\n", + "seed_arg = f\"--seed {seed}\" if seed >= 0 else \"\"\n", + "filter_overrides = \" \".join(filter(None, [\n", + " f\"--maf-cutoff {maf_cutoff}\" if maf_cutoff >= 0 else \"\",\n", + " f\"--mac-cutoff {mac_cutoff}\" if mac_cutoff >= 0 else \"\",\n", + " f\"--xvar-cutoff {xvar_cutoff}\" if xvar_cutoff >= 0 else \"\",\n", + " f\"--imiss-cutoff {imiss_cutoff}\" if imiss_cutoff >= 0 else \"\",\n", + " \"--drop-indel\" if drop_indel else \"\",\n", + " f\"--keep-samples {keep_samples}\" if keep_samples != \".\" else \"\",\n", + " f\"--keep-variants {keep_variants}\" if keep_variants != \".\" else \"\",\n", + "]))\n", "input: qtl_dataset, for_each = \"region_name\"\n", "output: fine_mapping = f\"{cwd:a}/fine_mapping/{name}.{_region_name}.univariate_bvsr.rds\",\n", " twas_weights = f\"{cwd:a}/twas_weights/{name}.{_region_name}.univariate_twas_weights.rds\"\n", @@ -718,6 +762,8 @@ " --cis-window ${cis_window} \\\n", " --methods ${fine_mapping_methods} \\\n", " --coverage ${fine_mapping_coverage} \\\n", + " --pip-cutoff-to-skip ${pip_cutoff_to_skip} \\\n", + " ${seed_arg} ${filter_overrides} \\\n", " ${(\"--contexts \" + contexts) if contexts else \"\"} \\\n", " ${(\"--method-args '\" + fine_mapping_method_args + \"'\") if fine_mapping_method_args else \"\"} \\\n", " --output ${_output[\"fine_mapping\"]}\n", @@ -727,113 +773,18 @@ " --gene-id ${_region_name} \\\n", " --cis-window ${cis_window} \\\n", " --methods ${twas_methods} \\\n", + " --min-twas-maf ${min_twas_maf} \\\n", + " --min-twas-xvar ${min_twas_xvar} \\\n", + " --max-cv-variants ${max_cv_variants} \\\n", + " --cv-folds ${cv_folds} \\\n", + " --cv-threads ${cv_threads} \\\n", " --fine-mapping-result ${_output[\"fine_mapping\"]} \\\n", + " ${seed_arg} ${filter_overrides} \\\n", " ${(\"--contexts \" + contexts) if contexts else \"\"} \\\n", " ${(\"--method-args '\" + twas_method_args + \"'\") if twas_method_args else \"\"} \\\n", " --output ${_output[\"twas_weights\"]}" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[ensemble_twas_weight]\n", - "# Ensemble multiple per-method TWAS weights (susie/susie_inf/mrash/enet/lasso/mcp/scad/l0learn/bayes_r/bayes_c)\n", - "# into a single stacked-regression ensemble weight, using pecotmr::ensemble_weights().\n", - "# Input: the *.univariate_twas_weights.rds produced by the [susie_twas] step (one per region/peak).\n", - "# Output: *.ensemble_twas_weights.rds, identical structure but with an added `ensemble_weights` column.\n", - "# Default -1e9 keeps every method (no R^2 filtering) so the ensemble always\n", - "# computes; raise this (e.g. 0.01) on real data to drop poorly-predicting methods.\n", - "parameter: ensemble_r2_threshold = -1e9\n", - "parameter: ensemble_solver = \"quadprog\"\n", - "parameter: ensemble_alpha = 1.0\n", - "depends: sos_variable(\"regional_data\")\n", - "meta_info = regional_data['meta_info']\n", - "input: regional_data[\"data\"], group_by = lambda x: group_by_region(x, regional_data[\"data\"]), group_with = \"meta_info\"\n", - "output_files = [f'{cwd:a}/twas_weights/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.ensemble_twas_weights.rds']\n", - "output: output_files\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", - "R: expand = '${ }', stdout = f\"{_output[0]:nn}.ensemble_twas_weight.stdout\", stderr = f\"{_output[0]:nn}.ensemble_twas_weight.stderr\", container = container\n", - " options(warn=1)\n", - " library(pecotmr)\n", - " # ---- locate the upstream univariate_twas_weights.rds for this region ----\n", - " region_tag <- \"${_meta_info[0].split(':')[0]}_${_meta_info[2]}\"\n", - " in_rds <- paste0(\"${cwd:a}\", \"/twas_weights/\", \"${name}\", \".\", region_tag, \".univariate_twas_weights.rds\")\n", - " if (!file.exists(in_rds)) stop(paste0(\"Missing upstream TWAS weights file: \", in_rds, \" -- run the susie_twas step first.\"))\n", - " obj <- readRDS(in_rds)\n", - " # ---- reload the regional phenotype y (needed as the stacking response) ----\n", - " phenotype_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input[1:len(_input)//2+1]])})\n", - " covariate_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input[len(_input)//2+1:]])})\n", - " conditions = c(${\",\".join(['\"%s\"' % x for x in _meta_info[4:]])})\n", - " fdat <- load_regional_univariate_data(genotype = ${_input[0]:anr},\n", - " phenotype = phenotype_files,\n", - " covariate = covariate_files,\n", - " region = ${(\"'%s'\" % _meta_info[0]) if not trans_analysis else 'NULL'},\n", - " association_window = \"${_meta_info[1]}\",\n", - " conditions = conditions,\n", - " maf_cutoff = ${maf},\n", - " mac_cutoff = ${mac},\n", - " imiss_cutoff = ${imiss},\n", - " keep_indel = ${\"TRUE\" if indel else \"FALSE\"},\n", - " extract_region_name = list(${\",\".join([(\"c('\"+x+\"')\") if isinstance(x, str) else (\"c\"+ str(x)) for x in _meta_info[3]])}),\n", - " phenotype_header = ${\"4\" if int(_meta_info[0].split('-')[-1])>0 else \"1\"},\n", - " region_name_col = ${\"4\" if int(_meta_info[0].split('-')[-1])>0 else \"1\"},\n", - " scale_residuals = FALSE)\n", - " # ---- per region in this object, run the ensemble ----\n", - " ensemble_r2_threshold <- ${ensemble_r2_threshold}\n", - " ensemble_solver <- \"${ensemble_solver}\"\n", - " ensemble_alpha <- ${ensemble_alpha}\n", - " for (rg in names(obj)) {\n", - " for (ph in names(obj[[rg]])) {\n", - " res <- obj[[rg]][[ph]]\n", - " if (is.null(res$twas_cv_result) || is.null(res$twas_cv_result$performance) || is.null(res$twas_weights)) {\n", - " message(\"Skipping \", rg, \"/\", ph, \": no twas_cv_result/twas_weights present.\"); next\n", - " }\n", - " # observed phenotype y for this condition (matched by name to the loaded fdat)\n", - " y <- tryCatch(fdat$residual_Y[[ph]], error = function(e) NULL)\n", - " if (is.null(y)) y <- tryCatch(fdat$residual_Y[[1]], error = function(e) NULL)\n", - " if (is.null(y)) { message(\"Skipping \", rg, \"/\", ph, \": could not recover phenotype y.\"); next }\n", - " y <- as.numeric(y)\n", - " # rank methods by CV R-squared (same logic as twas_weights_pipeline)\n", - " method_rsq <- vapply(res$twas_cv_result$performance, function(perf) perf[1, \"rsq\"], numeric(1))\n", - " names(method_rsq) <- gsub(\"_performance$\", \"\", names(method_rsq))\n", - " passing <- !is.na(method_rsq) & method_rsq >= ensemble_r2_threshold\n", - " n_passing <- sum(passing)\n", - " if (n_passing < 2) {\n", - " surviving <- if (n_passing == 1) names(method_rsq)[passing] else NA\n", - " message(\"Ensemble skipped for \", rg, \"/\", ph, \": \", n_passing, \" of \", length(method_rsq),\n", - " \" methods passed R-squared cutoff \", ensemble_r2_threshold, \" (need >= 2).\")\n", - " obj[[rg]][[ph]]$ensemble_weights <- NULL\n", - " next\n", - " }\n", - " passing_base <- names(method_rsq)[passing]\n", - " passing_pred_names <- paste0(passing_base, \"_predicted\")\n", - " passing_weight_names <- paste0(passing_base, \"_weights\")\n", - " filtered_cv <- res$twas_cv_result\n", - " filtered_cv$prediction <- filtered_cv$prediction[passing_pred_names]\n", - " filtered_weights <- res$twas_weights[passing_weight_names]\n", - " message(\"Computing ensemble TWAS weights via stacked regression for \", rg, \"/\", ph,\n", - " \" using \", n_passing, \" methods: \", paste(passing_base, collapse=\", \"))\n", - " ens_result <- ensemble_weights(cv_results = filtered_cv, Y = y,\n", - " twas_weight_list = filtered_weights,\n", - " solver = ensemble_solver, alpha = ensemble_alpha)\n", - " if (!is.null(ens_result$ensemble_twas_weights)) {\n", - " ens_wt <- ens_result$ensemble_twas_weights\n", - " if (!is.matrix(ens_wt)) ens_wt <- matrix(ens_wt, ncol = 1)\n", - " obj[[rg]][[ph]]$twas_weights$ensemble_weights <- ens_wt\n", - " obj[[rg]][[ph]]$ensemble <- ens_result\n", - " }\n", - " }\n", - " }\n", - " dir.create(dirname(\"${_output[0]}\"), showWarnings = FALSE, recursive = TRUE)\n", - " saveRDS(obj, \"${_output[0]}\", compress = \"xz\")\n", - " message(\"Saved ensemble TWAS weights to: \", \"${_output[0]}\")\n" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/code/script/pecotmr_integration/fine_mapping.R b/code/script/pecotmr_integration/fine_mapping.R index 4c8144ba..5baf570f 100644 --- a/code/script/pecotmr_integration/fine_mapping.R +++ b/code/script/pecotmr_integration/fine_mapping.R @@ -85,10 +85,56 @@ parser <- add_argument(parser, "--L-greedy", parser <- add_argument(parser, "--method-args", help = "JSON object {token: {kwarg: value, ...}, ...} for fineMappingPipeline()", type = "character", default = "") +parser <- add_argument(parser, "--seed", + help = "Integer RNG seed set before fitting (reproducibility); unset = no seeding", + type = "integer", default = NA) +parser <- add_argument(parser, "--pip-cutoff-to-skip", + help = "Single-effect (SER) pre-screen cutoff (fineMappingPipeline pipCutoffToSkip), QTL mode; 0 = off, <0 = adaptive 3/nVariants", + type = "numeric", default = 0) +# --- Per-analysis overrides of the QtlDataset's construct-time filters. Each is +# opt-in (unset leaves the dataset's stored value); applied to the loaded object +# before the pipeline runs (QTL mode only). +parser <- add_argument(parser, "--maf-cutoff", + help = "Override QtlDataset mafCutoff for this analysis", + type = "numeric", default = NA) +parser <- add_argument(parser, "--mac-cutoff", + help = "Override QtlDataset macCutoff", + type = "numeric", default = NA) +parser <- add_argument(parser, "--xvar-cutoff", + help = "Override QtlDataset xvarCutoff", + type = "numeric", default = NA) +parser <- add_argument(parser, "--imiss-cutoff", + help = "Override QtlDataset imissCutoff", + type = "numeric", default = NA) +parser <- add_argument(parser, "--drop-indel", + help = "Drop indels (sets keepIndel = FALSE); omit to use the dataset's stored value", + flag = TRUE) +parser <- add_argument(parser, "--keep-samples", + help = "Path to a whitespace-delimited file of sample IDs to restrict to (overrides keepSamples)", + type = "character", default = "") +parser <- add_argument(parser, "--keep-variants", + help = "Path to a whitespace-delimited file of variant IDs to restrict to (overrides keepVariants)", + type = "character", default = "") parser <- add_argument(parser, "--output", help = "Output RDS path", type = "character") argv <- parse_args(parser) +# Opt-in overrides of a loaded QtlDataset's construct-time filter slots. Filters +# apply lazily at extraction, so mutating the slot re-filters without rebuilding +# the RDS. Shared by both QTL workers (see twas_weights.R). +apply_qd_filter_overrides <- function(qd, argv) { + if (!is.na(argv$maf_cutoff)) qd@mafCutoff <- argv$maf_cutoff + if (!is.na(argv$mac_cutoff)) qd@macCutoff <- argv$mac_cutoff + if (!is.na(argv$xvar_cutoff)) qd@xvarCutoff <- argv$xvar_cutoff + if (!is.na(argv$imiss_cutoff)) qd@imissCutoff <- argv$imiss_cutoff + if (isTRUE(argv$drop_indel)) qd@keepIndel <- FALSE + read_ids <- function(p) if (nzchar(p) && p != "." && file.exists(p)) + unique(trimws(unlist(strsplit(readLines(p), "[[:space:]]+")))) else NULL + ks <- read_ids(argv$keep_samples); if (!is.null(ks)) qd@keepSamples <- ks + kv <- read_ids(argv$keep_variants); if (!is.null(kv)) qd@keepVariants <- kv + qd +} + # Parse --method-args into a nested named list of per-method kwargs. # Empty / '.' / '{}' all yield NULL (which makes us pass the char-vector # form of methods= to fineMappingPipeline). @@ -119,6 +165,9 @@ secondary_cov <- as.numeric(trimws(strsplit(argv$secondary_coverage, ",", fixed median_abs_corr <- if (length(argv$median_abs_corr) != 1L || is.na(argv$median_abs_corr)) NULL else argv$median_abs_corr +# Seed up front for reproducible fits (mirrors the legacy susie_twas set.seed). +if (length(argv$seed) == 1L && !is.na(argv$seed)) set.seed(as.integer(argv$seed)) + parse_region <- function(s) { m <- regmatches(s, regexec("^([^:]+):([0-9]+)-([0-9]+)$", s))[[1L]] if (length(m) != 4L) @@ -186,10 +235,13 @@ if (has_gwas) { if (!has_gene && !has_region) stop("QTL mode requires --gene-id (with --cis-window) or --region.") qd <- readRDS(argv$qtl_dataset) - # `contexts` is QTL-mode only (NULL = all contexts), so it rides on qtl_args + qd <- apply_qd_filter_overrides(qd, argv) + # `contexts` and `pipCutoffToSkip` are QTL-mode only, so they ride on qtl_args # rather than the mode-shared cs_args. qtl_args <- c(list(qd), cs_args, - list(cisWindow = argv$cis_window, contexts = contexts_arg)) + list(cisWindow = argv$cis_window, + contexts = contexts_arg, + pipCutoffToSkip = argv$pip_cutoff_to_skip)) res <- if (has_region) { do.call(fineMappingPipeline, c(qtl_args, list(region = parse_region(argv$region)))) } else { diff --git a/code/script/pecotmr_integration/qtl_dataset_construct.R b/code/script/pecotmr_integration/qtl_dataset_construct.R index 4f879819..606b024d 100644 --- a/code/script/pecotmr_integration/qtl_dataset_construct.R +++ b/code/script/pecotmr_integration/qtl_dataset_construct.R @@ -61,6 +61,21 @@ parser <- add_argument(parser, "--maf-cutoff", parser <- add_argument(parser, "--xvar-cutoff", help = "QtlDataset() variance cutoff", type = "numeric", default = 0) +parser <- add_argument(parser, "--mac-cutoff", + help = "QtlDataset() minor-allele-count cutoff", + type = "numeric", default = 0) +parser <- add_argument(parser, "--imiss-cutoff", + help = "QtlDataset() per-variant missingness cutoff", + type = "numeric", default = 0) +parser <- add_argument(parser, "--drop-indel", + help = "Drop indel variants (sets keepIndel = FALSE); default keeps them, matching QtlDataset()", + flag = TRUE) +parser <- add_argument(parser, "--keep-samples", + help = "Path to a whitespace-delimited file of sample IDs to restrict to (QtlDataset keepSamples)", + type = "character", default = "") +parser <- add_argument(parser, "--keep-variants", + help = "Path to a whitespace-delimited file of variant IDs to restrict to (QtlDataset keepVariants)", + type = "character", default = "") parser <- add_argument(parser, "--output", help = "Output RDS path", type = "character") argv <- parse_args(parser) @@ -193,13 +208,26 @@ genoCov <- if (has_geno_cov) { } # ----- Construct + save ------------------------------------------------------ -qd <- QtlDataset( +read_id_file <- function(p) if (nzchar(p) && p != "." && file.exists(p)) + unique(trimws(unlist(strsplit(readLines(p), "[[:space:]]+")))) else character(0) +keep_samples <- read_id_file(argv$keep_samples) +keep_variants <- read_id_file(argv$keep_variants) + +qd_args <- list( study = argv$study, genotypes = geno_handle, phenotypes = phenotypes, genotypeCovariates = genoCov, mafCutoff = argv$maf_cutoff, - xvarCutoff = argv$xvar_cutoff) + macCutoff = argv$mac_cutoff, + xvarCutoff = argv$xvar_cutoff, + imissCutoff = argv$imiss_cutoff, + keepIndel = !isTRUE(argv$drop_indel)) +# keepSamples / keepVariants only when a file was given (else the constructor +# default = keep all). +if (length(keep_samples) > 0L) qd_args$keepSamples <- keep_samples +if (length(keep_variants) > 0L) qd_args$keepVariants <- keep_variants +qd <- do.call(QtlDataset, qd_args) dir.create(dirname(argv$output), showWarnings = FALSE, recursive = TRUE) saveRDS(qd, argv$output) diff --git a/code/script/pecotmr_integration/twas_weights.R b/code/script/pecotmr_integration/twas_weights.R index 0225127b..fc6117da 100644 --- a/code/script/pecotmr_integration/twas_weights.R +++ b/code/script/pecotmr_integration/twas_weights.R @@ -65,10 +65,71 @@ parser <- add_argument(parser, "--fine-mapping-result", parser <- add_argument(parser, "--method-args", help = "JSON object {token: {kwarg: value, ...}, ...} for twasWeightsPipeline()", type = "character", default = "") +parser <- add_argument(parser, "--min-twas-maf", + help = "Minimum MAF for the variants used to learn TWAS weights (twasWeightsPipeline minTwasMaf), applied on top of the dataset's construct-time mafCutoff", + type = "numeric", default = 0.01) +parser <- add_argument(parser, "--min-twas-xvar", + help = "Minimum per-variant genotype variance for TWAS weight learning (twasWeightsPipeline minTwasXvar)", + type = "numeric", default = 0.01) +parser <- add_argument(parser, "--max-cv-variants", + help = "Cap on the number of variants used in cross-validation (twasWeightsPipeline maxCvVariants); -1 = no cap", + type = "integer", default = 5000L) +parser <- add_argument(parser, "--cv-folds", + help = "Cross-validation folds for TWAS weight evaluation (twasWeightsPipeline cvFolds)", + type = "integer", default = 5L) +parser <- add_argument(parser, "--cv-threads", + help = "Threads for the cross-validation refits (twasWeightsPipeline cvThreads)", + type = "integer", default = 1L) +parser <- add_argument(parser, "--seed", + help = "Integer RNG seed set before fitting (reproducibility); unset = no seeding", + type = "integer", default = NA) +# --- Per-analysis overrides of the QtlDataset's construct-time filters. Each is +# opt-in (unset leaves the dataset's stored value); applied to the loaded object +# before the pipeline runs. +parser <- add_argument(parser, "--maf-cutoff", + help = "Override QtlDataset mafCutoff for this analysis", + type = "numeric", default = NA) +parser <- add_argument(parser, "--mac-cutoff", + help = "Override QtlDataset macCutoff", + type = "numeric", default = NA) +parser <- add_argument(parser, "--xvar-cutoff", + help = "Override QtlDataset xvarCutoff", + type = "numeric", default = NA) +parser <- add_argument(parser, "--imiss-cutoff", + help = "Override QtlDataset imissCutoff", + type = "numeric", default = NA) +parser <- add_argument(parser, "--drop-indel", + help = "Drop indels (sets keepIndel = FALSE); omit to use the dataset's stored value", + flag = TRUE) +parser <- add_argument(parser, "--keep-samples", + help = "Path to a whitespace-delimited file of sample IDs to restrict to (overrides keepSamples)", + type = "character", default = "") +parser <- add_argument(parser, "--keep-variants", + help = "Path to a whitespace-delimited file of variant IDs to restrict to (overrides keepVariants)", + type = "character", default = "") parser <- add_argument(parser, "--output", help = "Output RDS path", type = "character") argv <- parse_args(parser) +# Opt-in overrides of a loaded QtlDataset's construct-time filter slots. Filters +# apply lazily at extraction, so mutating the slot re-filters without rebuilding +# the RDS. Mirrors fine_mapping.R. +apply_qd_filter_overrides <- function(qd, argv) { + if (!is.na(argv$maf_cutoff)) qd@mafCutoff <- argv$maf_cutoff + if (!is.na(argv$mac_cutoff)) qd@macCutoff <- argv$mac_cutoff + if (!is.na(argv$xvar_cutoff)) qd@xvarCutoff <- argv$xvar_cutoff + if (!is.na(argv$imiss_cutoff)) qd@imissCutoff <- argv$imiss_cutoff + if (isTRUE(argv$drop_indel)) qd@keepIndel <- FALSE + read_ids <- function(p) if (nzchar(p) && p != "." && file.exists(p)) + unique(trimws(unlist(strsplit(readLines(p), "[[:space:]]+")))) else NULL + ks <- read_ids(argv$keep_samples); if (!is.null(ks)) qd@keepSamples <- ks + kv <- read_ids(argv$keep_variants); if (!is.null(kv)) qd@keepVariants <- kv + qd +} + +# Seed up front for reproducible fits (mirrors the legacy susie_twas set.seed). +if (length(argv$seed) == 1L && !is.na(argv$seed)) set.seed(as.integer(argv$seed)) + # Parse --method-args into a nested named list of per-method kwargs. parsed_method_args <- if (nzchar(argv$method_args) && argv$method_args != "." && argv$method_args != "{}") { @@ -135,6 +196,7 @@ if (!has_gene && !has_region) stop("Specify either --gene-id (with --cis-window) or --region.") qd <- readRDS(argv$qtl_dataset) +qd <- apply_qd_filter_overrides(qd, argv) fmr_path <- argv$fine_mapping_result fmr <- if (nzchar(fmr_path) && fmr_path != "." && file.exists(fmr_path)) { @@ -143,18 +205,24 @@ fmr <- if (nzchar(fmr_path) && fmr_path != "." && file.exists(fmr_path)) { NULL } +# Shared args for both modes. minTwas* tighten the variant set used to learn +# the weights (on top of the QtlDataset's construct-time cutoffs); the cv* knobs +# control the cross-validated predictive-performance refits. +tw_args <- list(methods = methods_arg, + cisWindow = argv$cis_window, + contexts = contexts_arg, + minTwasMaf = argv$min_twas_maf, + minTwasXvar = argv$min_twas_xvar, + maxCvVariants = argv$max_cv_variants, + cvFolds = argv$cv_folds, + cvThreads = argv$cv_threads, + fineMappingResult = fmr) res <- if (has_region) { - twasWeightsPipeline(qd, methods = methods_arg, - region = parse_region(argv$region), - cisWindow = argv$cis_window, - contexts = contexts_arg, - fineMappingResult = fmr) + do.call(twasWeightsPipeline, + c(list(qd), tw_args, list(region = parse_region(argv$region)))) } else { - twasWeightsPipeline(qd, methods = methods_arg, - traitId = argv$gene_id, - cisWindow = argv$cis_window, - contexts = contexts_arg, - fineMappingResult = fmr) + do.call(twasWeightsPipeline, + c(list(qd), tw_args, list(traitId = argv$gene_id))) } dir.create(dirname(argv$output), showWarnings = FALSE, recursive = TRUE)