Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 52 additions & 101 deletions code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -660,8 +660,16 @@
"# Optional genotype-derived covariates applied uniformly across contexts;\n",
"# per-context covariates come from the manifest cov_path column.\n",
"parameter: genotype_covariates = path('.')\n",
"# Construct-time variant/sample QC (stored on the QtlDataset; cutoff 0 = off).\n",
"parameter: maf_cutoff = 0.0\n",
"parameter: xvar_cutoff = 0.0\n",
"parameter: mac_cutoff = 0.0\n",
"parameter: imiss_cutoff = 0.0\n",
"# Drop indel variants (default keeps them, matching QtlDataset()).\n",
"parameter: drop_indel = False\n",
"# Optional whitespace-delimited ID files to restrict samples / variants (\".\" = none).\n",
"parameter: keep_samples = \".\"\n",
"parameter: keep_variants = \".\"\n",
"output: f\"{cwd:a}/qtl_dataset/{study}.qtl_dataset.rds\"\n",
"task: trunk_workers = 1, trunk_size = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f\"{step_name}_{_output:bn}\"\n",
"bash: expand = '${ }', stderr = f\"{_output}.stderr\", stdout = f\"{_output}.stdout\", container = container\n",
Expand All @@ -673,6 +681,11 @@
" ${'--transpose-covariates' if transpose_covariates else ''} \\\n",
" --maf-cutoff ${maf_cutoff} \\\n",
" --xvar-cutoff ${xvar_cutoff} \\\n",
" --mac-cutoff ${mac_cutoff} \\\n",
" --imiss-cutoff ${imiss_cutoff} \\\n",
" ${'--drop-indel' if drop_indel else ''} \\\n",
" ${('--keep-samples ' + str(keep_samples)) if keep_samples != '.' else ''} \\\n",
" ${('--keep-variants ' + str(keep_variants)) if keep_variants != '.' else ''} \\\n",
" --output ${_output}"
]
},
Expand Down Expand Up @@ -701,12 +714,43 @@
"parameter: fine_mapping_coverage = 0.95\n",
"# Comma-separated context names to restrict both passes to; empty = all contexts.\n",
"parameter: contexts = \"\"\n",
"# Fine-mapping SER pre-screen (0 = off, <0 = adaptive 3/nVariants).\n",
"parameter: pip_cutoff_to_skip = 0.0\n",
"# TWAS weight-learning knobs.\n",
"parameter: min_twas_maf = 0.01\n",
"parameter: min_twas_xvar = 0.01\n",
"parameter: max_cv_variants = 5000\n",
"parameter: cv_folds = 5\n",
"parameter: cv_threads = 1\n",
"# Reproducibility: integer RNG seed; -1 = unset.\n",
"parameter: seed = -1\n",
"# Opt-in per-analysis overrides of the QtlDataset's construct-time filters,\n",
"# applied to both passes. Sentinels: a negative cutoff / drop_indel False /\n",
"# \".\" path = leave the dataset's stored value untouched.\n",
"parameter: maf_cutoff = -1.0\n",
"parameter: mac_cutoff = -1.0\n",
"parameter: xvar_cutoff = -1.0\n",
"parameter: imiss_cutoff = -1.0\n",
"parameter: drop_indel = False\n",
"parameter: keep_samples = \".\"\n",
"parameter: keep_variants = \".\"\n",
"# Optional JSON of per-method kwargs forwarded to the pipelines, e.g.\n",
"# '{\"susie\":{\"L\":10}}' for fine-mapping or '{\"lasso\":{\"nfolds\":10}}' for TWAS.\n",
"parameter: fine_mapping_method_args = \"\"\n",
"parameter: twas_method_args = \"\"\n",
"fail_if(len(region_name) == 0, \"susie_twas: pass --region-name <gene-id> [<gene-id> ...] to choose which gene(s) to analyze.\")\n",
"qtl_dataset = path(f\"{cwd:a}/qtl_dataset/{name}.qtl_dataset.rds\")\n",
"# Assemble the shared opt-in worker flags once (command construction only).\n",
"seed_arg = f\"--seed {seed}\" if seed >= 0 else \"\"\n",
"filter_overrides = \" \".join(filter(None, [\n",
" f\"--maf-cutoff {maf_cutoff}\" if maf_cutoff >= 0 else \"\",\n",
" f\"--mac-cutoff {mac_cutoff}\" if mac_cutoff >= 0 else \"\",\n",
" f\"--xvar-cutoff {xvar_cutoff}\" if xvar_cutoff >= 0 else \"\",\n",
" f\"--imiss-cutoff {imiss_cutoff}\" if imiss_cutoff >= 0 else \"\",\n",
" \"--drop-indel\" if drop_indel else \"\",\n",
" f\"--keep-samples {keep_samples}\" if keep_samples != \".\" else \"\",\n",
" f\"--keep-variants {keep_variants}\" if keep_variants != \".\" else \"\",\n",
"]))\n",
"input: qtl_dataset, for_each = \"region_name\"\n",
"output: fine_mapping = f\"{cwd:a}/fine_mapping/{name}.{_region_name}.univariate_bvsr.rds\",\n",
" twas_weights = f\"{cwd:a}/twas_weights/{name}.{_region_name}.univariate_twas_weights.rds\"\n",
Expand All @@ -718,6 +762,8 @@
" --cis-window ${cis_window} \\\n",
" --methods ${fine_mapping_methods} \\\n",
" --coverage ${fine_mapping_coverage} \\\n",
" --pip-cutoff-to-skip ${pip_cutoff_to_skip} \\\n",
" ${seed_arg} ${filter_overrides} \\\n",
" ${(\"--contexts \" + contexts) if contexts else \"\"} \\\n",
" ${(\"--method-args '\" + fine_mapping_method_args + \"'\") if fine_mapping_method_args else \"\"} \\\n",
" --output ${_output[\"fine_mapping\"]}\n",
Expand All @@ -727,113 +773,18 @@
" --gene-id ${_region_name} \\\n",
" --cis-window ${cis_window} \\\n",
" --methods ${twas_methods} \\\n",
" --min-twas-maf ${min_twas_maf} \\\n",
" --min-twas-xvar ${min_twas_xvar} \\\n",
" --max-cv-variants ${max_cv_variants} \\\n",
" --cv-folds ${cv_folds} \\\n",
" --cv-threads ${cv_threads} \\\n",
" --fine-mapping-result ${_output[\"fine_mapping\"]} \\\n",
" ${seed_arg} ${filter_overrides} \\\n",
" ${(\"--contexts \" + contexts) if contexts else \"\"} \\\n",
" ${(\"--method-args '\" + twas_method_args + \"'\") if twas_method_args else \"\"} \\\n",
" --output ${_output[\"twas_weights\"]}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "SoS"
},
"outputs": [],
"source": [
"[ensemble_twas_weight]\n",
"# Ensemble multiple per-method TWAS weights (susie/susie_inf/mrash/enet/lasso/mcp/scad/l0learn/bayes_r/bayes_c)\n",
"# into a single stacked-regression ensemble weight, using pecotmr::ensemble_weights().\n",
"# Input: the *.univariate_twas_weights.rds produced by the [susie_twas] step (one per region/peak).\n",
"# Output: *.ensemble_twas_weights.rds, identical structure but with an added `ensemble_weights` column.\n",
"# Default -1e9 keeps every method (no R^2 filtering) so the ensemble always\n",
"# computes; raise this (e.g. 0.01) on real data to drop poorly-predicting methods.\n",
"parameter: ensemble_r2_threshold = -1e9\n",
"parameter: ensemble_solver = \"quadprog\"\n",
"parameter: ensemble_alpha = 1.0\n",
"depends: sos_variable(\"regional_data\")\n",
"meta_info = regional_data['meta_info']\n",
"input: regional_data[\"data\"], group_by = lambda x: group_by_region(x, regional_data[\"data\"]), group_with = \"meta_info\"\n",
"output_files = [f'{cwd:a}/twas_weights/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.ensemble_twas_weights.rds']\n",
"output: output_files\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n",
"R: expand = '${ }', stdout = f\"{_output[0]:nn}.ensemble_twas_weight.stdout\", stderr = f\"{_output[0]:nn}.ensemble_twas_weight.stderr\", container = container\n",
" options(warn=1)\n",
" library(pecotmr)\n",
" # ---- locate the upstream univariate_twas_weights.rds for this region ----\n",
" region_tag <- \"${_meta_info[0].split(':')[0]}_${_meta_info[2]}\"\n",
" in_rds <- paste0(\"${cwd:a}\", \"/twas_weights/\", \"${name}\", \".\", region_tag, \".univariate_twas_weights.rds\")\n",
" if (!file.exists(in_rds)) stop(paste0(\"Missing upstream TWAS weights file: \", in_rds, \" -- run the susie_twas step first.\"))\n",
" obj <- readRDS(in_rds)\n",
" # ---- reload the regional phenotype y (needed as the stacking response) ----\n",
" phenotype_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input[1:len(_input)//2+1]])})\n",
" covariate_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input[len(_input)//2+1:]])})\n",
" conditions = c(${\",\".join(['\"%s\"' % x for x in _meta_info[4:]])})\n",
" fdat <- load_regional_univariate_data(genotype = ${_input[0]:anr},\n",
" phenotype = phenotype_files,\n",
" covariate = covariate_files,\n",
" region = ${(\"'%s'\" % _meta_info[0]) if not trans_analysis else 'NULL'},\n",
" association_window = \"${_meta_info[1]}\",\n",
" conditions = conditions,\n",
" maf_cutoff = ${maf},\n",
" mac_cutoff = ${mac},\n",
" imiss_cutoff = ${imiss},\n",
" keep_indel = ${\"TRUE\" if indel else \"FALSE\"},\n",
" extract_region_name = list(${\",\".join([(\"c('\"+x+\"')\") if isinstance(x, str) else (\"c\"+ str(x)) for x in _meta_info[3]])}),\n",
" phenotype_header = ${\"4\" if int(_meta_info[0].split('-')[-1])>0 else \"1\"},\n",
" region_name_col = ${\"4\" if int(_meta_info[0].split('-')[-1])>0 else \"1\"},\n",
" scale_residuals = FALSE)\n",
" # ---- per region in this object, run the ensemble ----\n",
" ensemble_r2_threshold <- ${ensemble_r2_threshold}\n",
" ensemble_solver <- \"${ensemble_solver}\"\n",
" ensemble_alpha <- ${ensemble_alpha}\n",
" for (rg in names(obj)) {\n",
" for (ph in names(obj[[rg]])) {\n",
" res <- obj[[rg]][[ph]]\n",
" if (is.null(res$twas_cv_result) || is.null(res$twas_cv_result$performance) || is.null(res$twas_weights)) {\n",
" message(\"Skipping \", rg, \"/\", ph, \": no twas_cv_result/twas_weights present.\"); next\n",
" }\n",
" # observed phenotype y for this condition (matched by name to the loaded fdat)\n",
" y <- tryCatch(fdat$residual_Y[[ph]], error = function(e) NULL)\n",
" if (is.null(y)) y <- tryCatch(fdat$residual_Y[[1]], error = function(e) NULL)\n",
" if (is.null(y)) { message(\"Skipping \", rg, \"/\", ph, \": could not recover phenotype y.\"); next }\n",
" y <- as.numeric(y)\n",
" # rank methods by CV R-squared (same logic as twas_weights_pipeline)\n",
" method_rsq <- vapply(res$twas_cv_result$performance, function(perf) perf[1, \"rsq\"], numeric(1))\n",
" names(method_rsq) <- gsub(\"_performance$\", \"\", names(method_rsq))\n",
" passing <- !is.na(method_rsq) & method_rsq >= ensemble_r2_threshold\n",
" n_passing <- sum(passing)\n",
" if (n_passing < 2) {\n",
" surviving <- if (n_passing == 1) names(method_rsq)[passing] else NA\n",
" message(\"Ensemble skipped for \", rg, \"/\", ph, \": \", n_passing, \" of \", length(method_rsq),\n",
" \" methods passed R-squared cutoff \", ensemble_r2_threshold, \" (need >= 2).\")\n",
" obj[[rg]][[ph]]$ensemble_weights <- NULL\n",
" next\n",
" }\n",
" passing_base <- names(method_rsq)[passing]\n",
" passing_pred_names <- paste0(passing_base, \"_predicted\")\n",
" passing_weight_names <- paste0(passing_base, \"_weights\")\n",
" filtered_cv <- res$twas_cv_result\n",
" filtered_cv$prediction <- filtered_cv$prediction[passing_pred_names]\n",
" filtered_weights <- res$twas_weights[passing_weight_names]\n",
" message(\"Computing ensemble TWAS weights via stacked regression for \", rg, \"/\", ph,\n",
" \" using \", n_passing, \" methods: \", paste(passing_base, collapse=\", \"))\n",
" ens_result <- ensemble_weights(cv_results = filtered_cv, Y = y,\n",
" twas_weight_list = filtered_weights,\n",
" solver = ensemble_solver, alpha = ensemble_alpha)\n",
" if (!is.null(ens_result$ensemble_twas_weights)) {\n",
" ens_wt <- ens_result$ensemble_twas_weights\n",
" if (!is.matrix(ens_wt)) ens_wt <- matrix(ens_wt, ncol = 1)\n",
" obj[[rg]][[ph]]$twas_weights$ensemble_weights <- ens_wt\n",
" obj[[rg]][[ph]]$ensemble <- ens_result\n",
" }\n",
" }\n",
" }\n",
" dir.create(dirname(\"${_output[0]}\"), showWarnings = FALSE, recursive = TRUE)\n",
" saveRDS(obj, \"${_output[0]}\", compress = \"xz\")\n",
" message(\"Saved ensemble TWAS weights to: \", \"${_output[0]}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
56 changes: 54 additions & 2 deletions code/script/pecotmr_integration/fine_mapping.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,56 @@ parser <- add_argument(parser, "--L-greedy",
parser <- add_argument(parser, "--method-args",
help = "JSON object {token: {kwarg: value, ...}, ...} for fineMappingPipeline()",
type = "character", default = "")
parser <- add_argument(parser, "--seed",
help = "Integer RNG seed set before fitting (reproducibility); unset = no seeding",
type = "integer", default = NA)
parser <- add_argument(parser, "--pip-cutoff-to-skip",
help = "Single-effect (SER) pre-screen cutoff (fineMappingPipeline pipCutoffToSkip), QTL mode; 0 = off, <0 = adaptive 3/nVariants",
type = "numeric", default = 0)
# --- Per-analysis overrides of the QtlDataset's construct-time filters. Each is
# opt-in (unset leaves the dataset's stored value); applied to the loaded object
# before the pipeline runs (QTL mode only).
parser <- add_argument(parser, "--maf-cutoff",
help = "Override QtlDataset mafCutoff for this analysis",
type = "numeric", default = NA)
parser <- add_argument(parser, "--mac-cutoff",
help = "Override QtlDataset macCutoff",
type = "numeric", default = NA)
parser <- add_argument(parser, "--xvar-cutoff",
help = "Override QtlDataset xvarCutoff",
type = "numeric", default = NA)
parser <- add_argument(parser, "--imiss-cutoff",
help = "Override QtlDataset imissCutoff",
type = "numeric", default = NA)
parser <- add_argument(parser, "--drop-indel",
help = "Drop indels (sets keepIndel = FALSE); omit to use the dataset's stored value",
flag = TRUE)
parser <- add_argument(parser, "--keep-samples",
help = "Path to a whitespace-delimited file of sample IDs to restrict to (overrides keepSamples)",
type = "character", default = "")
parser <- add_argument(parser, "--keep-variants",
help = "Path to a whitespace-delimited file of variant IDs to restrict to (overrides keepVariants)",
type = "character", default = "")
parser <- add_argument(parser, "--output",
help = "Output RDS path", type = "character")
argv <- parse_args(parser)

# Opt-in overrides of a loaded QtlDataset's construct-time filter slots. Filters
# apply lazily at extraction, so mutating the slot re-filters without rebuilding
# the RDS. Shared by both QTL workers (see twas_weights.R).
apply_qd_filter_overrides <- function(qd, argv) {
if (!is.na(argv$maf_cutoff)) qd@mafCutoff <- argv$maf_cutoff
if (!is.na(argv$mac_cutoff)) qd@macCutoff <- argv$mac_cutoff
if (!is.na(argv$xvar_cutoff)) qd@xvarCutoff <- argv$xvar_cutoff
if (!is.na(argv$imiss_cutoff)) qd@imissCutoff <- argv$imiss_cutoff
if (isTRUE(argv$drop_indel)) qd@keepIndel <- FALSE
read_ids <- function(p) if (nzchar(p) && p != "." && file.exists(p))
unique(trimws(unlist(strsplit(readLines(p), "[[:space:]]+")))) else NULL
ks <- read_ids(argv$keep_samples); if (!is.null(ks)) qd@keepSamples <- ks
kv <- read_ids(argv$keep_variants); if (!is.null(kv)) qd@keepVariants <- kv
qd
}

# Parse --method-args into a nested named list of per-method kwargs.
# Empty / '.' / '{}' all yield NULL (which makes us pass the char-vector
# form of methods= to fineMappingPipeline).
Expand Down Expand Up @@ -119,6 +165,9 @@ secondary_cov <- as.numeric(trimws(strsplit(argv$secondary_coverage, ",", fixed
median_abs_corr <- if (length(argv$median_abs_corr) != 1L || is.na(argv$median_abs_corr))
NULL else argv$median_abs_corr

# Seed up front for reproducible fits (mirrors the legacy susie_twas set.seed).
if (length(argv$seed) == 1L && !is.na(argv$seed)) set.seed(as.integer(argv$seed))

parse_region <- function(s) {
m <- regmatches(s, regexec("^([^:]+):([0-9]+)-([0-9]+)$", s))[[1L]]
if (length(m) != 4L)
Expand Down Expand Up @@ -186,10 +235,13 @@ if (has_gwas) {
if (!has_gene && !has_region)
stop("QTL mode requires --gene-id (with --cis-window) or --region.")
qd <- readRDS(argv$qtl_dataset)
# `contexts` is QTL-mode only (NULL = all contexts), so it rides on qtl_args
qd <- apply_qd_filter_overrides(qd, argv)
# `contexts` and `pipCutoffToSkip` are QTL-mode only, so they ride on qtl_args
# rather than the mode-shared cs_args.
qtl_args <- c(list(qd), cs_args,
list(cisWindow = argv$cis_window, contexts = contexts_arg))
list(cisWindow = argv$cis_window,
contexts = contexts_arg,
pipCutoffToSkip = argv$pip_cutoff_to_skip))
res <- if (has_region) {
do.call(fineMappingPipeline, c(qtl_args, list(region = parse_region(argv$region))))
} else {
Expand Down
Loading
Loading